"""Podcast generator service for the AI Digital Library Assistant.

File: services/podcast_generator_service.py (initial deployment, rev 86aa5e4).
Generates two-host conversational podcasts from indexed documents by
combining LlamaIndex analysis, LLM script writing, and ElevenLabs TTS.
"""
import logging
import asyncio
import json
import uuid
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
import re
try:
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
ELEVENLABS_AVAILABLE = True
except ImportError:
ELEVENLABS_AVAILABLE = False
import config
from services.llamaindex_service import LlamaIndexService
from services.llm_service import LLMService
logger = logging.getLogger(__name__)
@dataclass
class DocumentAnalysis:
    """Analysis results from document(s).

    Produced by PodcastGeneratorService.analyze_documents() and consumed
    by generate_script() when building the script prompt.
    """
    key_insights: List[str]  # 5-7 main points extracted from the analysis
    topics: List[str]  # top keywords from the analysis text (title-cased)
    complexity_level: str  # beginner, intermediate, advanced
    estimated_words: int  # word count of the raw analysis text
    source_documents: List[str]  # document IDs the analysis was built from
    summary: str  # full analysis text returned by the RAG query
@dataclass
class DialogueLine:
    """Single line of podcast dialogue."""
    speaker: str  # "HOST1" or "HOST2"
    text: str  # spoken content for this line
    pause_after: float = 0.5  # seconds; NOTE(review): not consumed by synthesize_audio yet
@dataclass
class PodcastScript:
    """Complete podcast script parsed from the LLM output.

    Holds the ordered dialogue plus bookkeeping figures used for metadata
    (duration estimate in seconds, total spoken word count, and the style
    the script was generated with).
    """
    dialogue: List[DialogueLine]
    total_duration_estimate: float
    word_count: int
    style: str

    def to_text(self) -> str:
        """Render the script as a readable transcript.

        Each line is formatted as ``SPEAKER: text`` and lines are separated
        by a blank line.
        """
        return "\n\n".join(
            f"{entry.speaker}: {entry.text}" for entry in self.dialogue
        )
@dataclass
class PodcastMetadata:
    """Metadata for a generated podcast, persisted to the JSON metadata DB."""
    podcast_id: str  # UUID assigned at generation time
    title: str  # auto-generated from the top extracted topic
    description: str  # short human-readable summary of the episode
    source_documents: List[str]  # document IDs the podcast was built from
    style: str  # conversational, educational, technical, or casual
    duration_seconds: float  # estimated duration derived from word count
    file_size_mb: float  # size of the generated MP3 file
    voices: Dict[str, str]  # {"host1": voice_name, "host2": voice_name}
    generated_at: str  # ISO-8601 timestamp
    generation_cost: Dict[str, float]  # rough estimates: llm_cost, tts_cost, total
    key_topics: List[str]  # topics extracted during document analysis
@dataclass
class PodcastResult:
    """Complete podcast generation result returned by generate_podcast()."""
    podcast_id: str  # UUID assigned at the start of generation
    audio_file_path: str  # path to the MP3; empty string on failure
    transcript: str  # full transcript text; empty string on failure
    metadata: Optional[PodcastMetadata]  # None when generation failed
    generation_time: float  # wall-clock seconds spent generating
    success: bool  # True if all pipeline steps completed
    error: Optional[str] = None  # error message when success is False
class PodcastGeneratorService:
    """
    Service for generating conversational podcasts from documents.

    Pipeline: analyze documents via the LlamaIndex agentic RAG, generate a
    two-host script with the LLM, synthesize audio with ElevenLabs, then
    persist the MP3, transcript and metadata under ./data/podcasts.
    """

    # Word count per minute for podcast pacing
    WORDS_PER_MINUTE = 150

    # Script generation prompts for different styles
    SCRIPT_PROMPTS = {
        "conversational": """You are an expert podcast script writer. Create an engaging 2-host podcast discussing insights from documents.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Conversational, friendly, and accessible
- Format: Alternating dialogue between HOST1 and HOST2
- Include natural transitions, questions, and "aha!" moments
- Make complex topics easy to understand
- Add enthusiasm and genuine curiosity
- Balance speaking time between both hosts
DIALOGUE FORMAT (strictly follow):
HOST1: [What they say]
HOST2: [What they say]
STRUCTURE:
1. Opening Hook (30 seconds): Grab attention with an intriguing question or fact
2. Introduction (1 minute): Set context and preview what's coming
3. Main Discussion (70% of time): Deep dive into key insights
4. Wrap-up (1 minute): Summarize key takeaways and final thoughts
TONE: Friendly, enthusiastic, educational but not condescending
Generate the complete podcast script now:""",
        "educational": """You are creating an educational podcast script. Two hosts discuss document insights in a clear, instructive manner.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Clear, methodical, educational
- HOST1 acts as the teacher/expert, HOST2 as the curious learner
- Include explanations of complex concepts
- Use examples and analogies
- Build knowledge progressively
DIALOGUE FORMAT:
HOST1: [Expert explanation]
HOST2: [Clarifying question or observation]
Generate the complete educational podcast script now:""",
        "technical": """You are writing a technical podcast for an informed audience. Discuss document insights with precision and depth.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Professional, detailed, technically accurate
- HOST1 is the subject matter expert, HOST2 is an informed interviewer
- Use proper technical terminology
- Dive into implementation details
- Discuss implications and applications
DIALOGUE FORMAT:
HOST1: [Technical insight]
HOST2: [Probing question]
Generate the complete technical podcast script now:""",
        "casual": """You are creating a fun, casual podcast. Two friends discuss interesting ideas from documents.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Relaxed, humorous, energetic
- Both hosts are enthusiastic and engaged
- Use casual language and occasional humor
- Make it entertaining while staying informative
- Quick pacing with energy
DIALOGUE FORMAT:
HOST1: [Casual commentary]
HOST2: [Enthusiastic response]
Generate the complete casual podcast script now:"""
    }

    def __init__(
        self,
        llamaindex_service: LlamaIndexService,
        llm_service: LLMService,
        elevenlabs_api_key: Optional[str] = None
    ):
        """
        Initialize podcast generator service.

        Args:
            llamaindex_service: Service for document analysis
            llm_service: Service for script generation
            elevenlabs_api_key: ElevenLabs API key (uses config if not provided)
        """
        self.config = config.config
        self.llamaindex_service = llamaindex_service
        self.llm_service = llm_service

        # Initialize ElevenLabs client; left as None when the package or key
        # is missing so synthesize_audio can raise a clear error later.
        self.elevenlabs_client = None
        if ELEVENLABS_AVAILABLE:
            api_key = elevenlabs_api_key or self.config.ELEVENLABS_API_KEY
            if api_key:
                try:
                    self.elevenlabs_client = ElevenLabs(api_key=api_key)
                    logger.info("ElevenLabs client initialized for podcast generation")
                except Exception as e:
                    logger.error(f"Failed to initialize ElevenLabs client: {e}")

        # Create podcast storage directory
        self.podcast_dir = Path("./data/podcasts")
        self.podcast_dir.mkdir(parents=True, exist_ok=True)

        # JSON file acting as a tiny append-only metadata database
        self.metadata_file = self.podcast_dir / "metadata_db.json"
        self._ensure_metadata_db()

    def _ensure_metadata_db(self):
        """Create an empty metadata database file if it does not exist."""
        if not self.metadata_file.exists():
            self.metadata_file.write_text(json.dumps([], indent=2), encoding="utf-8")

    async def generate_podcast(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> PodcastResult:
        """
        Generate a complete podcast from documents.

        Args:
            document_ids: List of document IDs to analyze
            style: Podcast style (conversational, educational, technical, casual)
            duration_minutes: Target duration in minutes
            host1_voice: Voice name for first host
            host2_voice: Voice name for second host

        Returns:
            PodcastResult with audio file path and metadata; on failure,
            success is False, metadata is None and error holds the message.
        """
        start_time = datetime.now()
        podcast_id = str(uuid.uuid4())
        try:
            logger.info(f"Starting podcast generation {podcast_id}")
            logger.info(f"Documents: {document_ids}, Style: {style}, Duration: {duration_minutes}min")

            # Step 1: Analyze documents
            logger.info("Step 1: Analyzing documents...")
            analysis = await self.analyze_documents(document_ids)

            # Step 2: Generate script
            logger.info("Step 2: Generating podcast script...")
            script = await self.generate_script(analysis, style, duration_minutes)

            # Step 3: Synthesize audio
            logger.info("Step 3: Synthesizing audio with voices...")
            audio_file_path = await self.synthesize_audio(
                podcast_id,
                script,
                host1_voice,
                host2_voice
            )

            # Calculate generation time
            generation_time = (datetime.now() - start_time).total_seconds()

            # Step 4: Create metadata.  Voices are passed as an ORDERED tuple
            # (a set would lose the host1/host2 association and collapse
            # duplicate voice names).
            logger.info("Step 4: Creating metadata...")
            metadata = self._create_metadata(
                podcast_id,
                analysis,
                script,
                audio_file_path,
                (host1_voice, host2_voice),
                document_ids,
                style
            )

            # Save metadata
            self._save_metadata(metadata)

            # Save transcript
            transcript_path = self.podcast_dir / f"{podcast_id}_transcript.txt"
            transcript_path.write_text(script.to_text(), encoding="utf-8")

            logger.info(f"Podcast generated successfully: {podcast_id}")
            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path=str(audio_file_path),
                transcript=script.to_text(),
                metadata=metadata,
                generation_time=generation_time,
                success=True
            )
        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}", exc_info=True)
            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path="",
                transcript="",
                metadata=None,
                generation_time=(datetime.now() - start_time).total_seconds(),
                success=False,
                error=str(e)
            )

    async def analyze_documents(self, document_ids: List[str]) -> DocumentAnalysis:
        """
        Analyze documents to extract key insights for the podcast.

        Args:
            document_ids: List of document IDs

        Returns:
            DocumentAnalysis with key insights, topics, complexity level and
            the full analysis text as the summary.
        """
        # Create analysis query for the agentic RAG
        analysis_query = f"""Analyze the following documents and provide:
1. The 5-7 most important insights or key points
2. Main themes and topics covered
3. The overall complexity level (beginner/intermediate/advanced)
4. A brief summary suitable for podcast discussion
Document IDs: {', '.join(document_ids)}
Provide a structured analysis optimized for creating an engaging podcast discussion."""

        # Use LlamaIndex agentic RAG for analysis
        result = await self.llamaindex_service.query(analysis_query)

        # Parse the result to extract structured information.
        # This is a simplified parser - in production, you might want more
        # robust parsing (e.g. asking the LLM for JSON output).
        insights = self._extract_insights(result)
        topics = self._extract_topics(result)
        complexity = self._determine_complexity(result)

        return DocumentAnalysis(
            key_insights=insights[:7],  # Limit to 7
            topics=topics,
            complexity_level=complexity,
            estimated_words=len(result.split()),
            source_documents=document_ids,
            summary=result
        )

    def _extract_insights(self, text: str) -> List[str]:
        """Extract key insights from analysis text.

        Collects lines that look like numbered-list or bullet-point items;
        falls back to the first few sentences when no list markers appear.
        """
        insights = []
        # Marker must be anchored to the START of the line and removed only
        # once.  The previous unanchored pattern also deleted '-', '*' and
        # '•' occurring anywhere in the line, mangling words such as
        # "state-of-the-art".
        marker = re.compile(r'^(?:\d+\.|[-*•])\s*')
        for line in text.split('\n'):
            line = line.strip()
            if marker.match(line):
                insight = marker.sub('', line, count=1).strip()
                if len(insight) > 20:  # Ensure it's substantial
                    insights.append(insight)

        # If no insights found, create from first few sentences
        if not insights:
            sentences = text.split('.')
            insights = [s.strip() + '.' for s in sentences[:7] if len(s.strip()) > 20]
        return insights

    def _extract_topics(self, text: str) -> List[str]:
        """Extract up to 5 main topics by word frequency.

        Simple keyword extraction (words longer than 4 chars, minus a small
        stop-word list) - could be enhanced with NLP.
        """
        common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        words = text.lower().split()
        word_freq = {}
        for word in words:
            word = re.sub(r'[^\w\s]', '', word)  # strip punctuation
            if len(word) > 4 and word not in common_words:
                word_freq[word] = word_freq.get(word, 0) + 1
        # Get top topics by descending frequency
        topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
        return [topic[0].title() for topic in topics]

    def _determine_complexity(self, text: str) -> str:
        """Classify content as beginner/intermediate/advanced.

        Simple keyword heuristic; beginner cues win over advanced cues, and
        intermediate is the default.
        """
        text_lower = text.lower()
        if any(word in text_lower for word in ['basic', 'introduction', 'beginner', 'simple']):
            return "beginner"
        elif any(word in text_lower for word in ['advanced', 'complex', 'sophisticated', 'expert']):
            return "advanced"
        else:
            return "intermediate"

    async def generate_script(
        self,
        analysis: DocumentAnalysis,
        style: str,
        duration_minutes: int
    ) -> PodcastScript:
        """
        Generate a podcast script from the document analysis.

        Args:
            analysis: Document analysis results
            style: Podcast style (falls back to "conversational" if unknown)
            duration_minutes: Target duration

        Returns:
            Complete podcast script with duration/word-count estimates.
        """
        # Calculate target word count from the pacing constant
        target_words = duration_minutes * self.WORDS_PER_MINUTE

        # Prepare analysis context for the prompt
        analysis_context = f"""
KEY INSIGHTS:
{chr(10).join(f"{i+1}. {insight}" for i, insight in enumerate(analysis.key_insights))}
TOPICS: {', '.join(analysis.topics)}
COMPLEXITY: {analysis.complexity_level}
SUMMARY:
{analysis.summary[:500]}...
"""

        # Get prompt template for style (unknown styles use conversational)
        prompt_template = self.SCRIPT_PROMPTS.get(style, self.SCRIPT_PROMPTS["conversational"])
        prompt = prompt_template.format(
            analysis=analysis_context,
            duration_minutes=duration_minutes,
            word_count=target_words
        )

        # Generate script using LLM
        script_text = await self.llm_service.generate_text(
            prompt,
            max_tokens=target_words * 2,  # Give room for generation
            temperature=0.8  # More creative
        )

        # Parse script into dialogue lines
        dialogue = self._parse_script(script_text)

        # Calculate actual word count and duration from the parsed dialogue
        word_count = sum(len(line.text.split()) for line in dialogue)
        duration_estimate = word_count / self.WORDS_PER_MINUTE

        return PodcastScript(
            dialogue=dialogue,
            total_duration_estimate=duration_estimate * 60,  # Convert to seconds
            word_count=word_count,
            style=style
        )

    def _parse_script(self, script_text: str) -> List[DialogueLine]:
        """Parse a generated script into DialogueLine objects.

        Only lines starting with the literal prefixes "HOST1:" / "HOST2:"
        are kept; everything else (stage directions, blank lines) is dropped.
        """
        dialogue = []
        for line in script_text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line.startswith('HOST1:'):
                text = line[6:].strip()  # drop the 6-char "HOST1:" prefix
                if text:
                    dialogue.append(DialogueLine(speaker="HOST1", text=text))
            elif line.startswith('HOST2:'):
                text = line[6:].strip()
                if text:
                    dialogue.append(DialogueLine(speaker="HOST2", text=text))
        return dialogue

    def _get_voice_id(self, voice_name: str) -> str:
        """
        Resolve a voice name to an ElevenLabs voice ID.

        Tries exact match, then partial match, then falls back to the first
        available voice.

        Args:
            voice_name: Voice name (e.g., "Rachel", "Adam")

        Returns:
            Voice ID string

        Raises:
            RuntimeError: if the voice list cannot be fetched or is empty.
        """
        try:
            voices = self.elevenlabs_client.voices.get_all()
            if not voices or not voices.voices:
                logger.error("No voices available from ElevenLabs")
                raise RuntimeError("No voices available")

            # First, try exact name match (case-insensitive)
            for voice in voices.voices:
                if voice.name.lower() == voice_name.lower():
                    logger.info(f"Found exact voice match for '{voice_name}': {voice.voice_id}")
                    return voice.voice_id

            # Try partial match
            for voice in voices.voices:
                if voice_name.lower() in voice.name.lower():
                    logger.info(f"Found partial voice match for '{voice_name}': {voice.name} ({voice.voice_id})")
                    return voice.voice_id

            # Use first available voice as fallback
            first_voice = voices.voices[0]
            logger.warning(f"Voice '{voice_name}' not found, using first available voice: {first_voice.name} ({first_voice.voice_id})")
            return first_voice.voice_id
        except Exception as e:
            logger.error(f"Could not fetch voices: {e}", exc_info=True)
            raise RuntimeError(f"Failed to get voice ID: {str(e)}")

    async def synthesize_audio(
        self,
        podcast_id: str,
        script: PodcastScript,
        host1_voice: str,
        host2_voice: str
    ) -> Path:
        """
        Synthesize audio from the script using ElevenLabs.

        NOTE: current implementation reads the WHOLE transcript with the
        host1 voice only; host2_voice is accepted for interface stability.
        A full implementation would synthesize each DialogueLine with its
        own voice and concatenate segments with pauses.

        Args:
            podcast_id: Unique podcast ID
            script: Podcast script
            host1_voice: Voice for HOST1
            host2_voice: Voice for HOST2 (currently unused - see note)

        Returns:
            Path to generated MP3 file

        Raises:
            RuntimeError: if the client is missing or synthesis fails.
        """
        if not self.elevenlabs_client:
            raise RuntimeError("ElevenLabs client not initialized")

        audio_file = self.podcast_dir / f"{podcast_id}.mp3"
        full_text = script.to_text()

        # Resolve the display name to an actual voice ID
        voice_id = self._get_voice_id(host1_voice)
        try:
            logger.info(f"Generating audio with voice: {host1_voice}")
            # Modern streaming text_to_speech API; yields audio chunks
            audio_generator = self.elevenlabs_client.text_to_speech.convert(
                voice_id=voice_id,
                text=full_text,
                model_id="eleven_multilingual_v2"
            )

            # Write audio chunks to file
            with open(audio_file, 'wb') as f:
                for chunk in audio_generator:
                    if chunk:
                        f.write(chunk)

            # Verify file was created with content (tiny files indicate an
            # API error body rather than real audio)
            if audio_file.exists() and audio_file.stat().st_size > 1000:
                logger.info(f"Audio synthesized successfully: {audio_file} ({audio_file.stat().st_size} bytes)")
                return audio_file
            else:
                raise RuntimeError(f"Generated audio file is too small or empty: {audio_file.stat().st_size} bytes")
        except Exception as e:
            logger.error(f"Audio synthesis failed: {e}", exc_info=True)
            raise RuntimeError(f"Failed to generate podcast audio: {str(e)}")

    def _create_metadata(
        self,
        podcast_id: str,
        analysis: DocumentAnalysis,
        script: PodcastScript,
        audio_path: Path,
        voices: tuple,
        document_ids: List[str],
        style: str
    ) -> PodcastMetadata:
        """Create podcast metadata.

        Args:
            voices: ordered pair (host1_voice, host2_voice).  An ordered
                sequence is required so host1/host2 keep their assignment
                (the previous set-based version lost ordering and collapsed
                duplicate voice names).
        """
        # Auto-generate title from the most frequent topic
        title = f"Podcast: {analysis.topics[0] if analysis.topics else 'Document Discussion'}"
        description = f"A {style} podcast discussing insights from {len(document_ids)} document(s)."

        # Calculate file size in MB (0 if synthesis produced no file)
        file_size_mb = audio_path.stat().st_size / (1024 * 1024) if audio_path.exists() else 0

        # Rough cost estimates based on word count
        llm_cost = (script.word_count / 1000) * 0.01
        tts_cost = (script.word_count * 5 / 1000) * 0.30

        return PodcastMetadata(
            podcast_id=podcast_id,
            title=title,
            description=description,
            source_documents=document_ids,
            style=style,
            duration_seconds=script.total_duration_estimate,
            file_size_mb=file_size_mb,
            voices={"host1": voices[0] if len(voices) > 0 else "Rachel",
                    "host2": voices[1] if len(voices) > 1 else "Adam"},
            generated_at=datetime.now().isoformat(),
            generation_cost={"llm_cost": llm_cost, "tts_cost": tts_cost, "total": llm_cost + tts_cost},
            key_topics=analysis.topics
        )

    def _save_metadata(self, metadata: PodcastMetadata):
        """Append metadata to the JSON database (best-effort; logs on failure)."""
        try:
            # Load, append, save back.  utf-8 explicitly, to match the
            # transcript files and avoid platform-default encodings.
            existing = json.loads(self.metadata_file.read_text(encoding="utf-8"))
            existing.append(asdict(metadata))
            self.metadata_file.write_text(json.dumps(existing, indent=2), encoding="utf-8")
            logger.info(f"Metadata saved for podcast: {metadata.podcast_id}")
        except Exception as e:
            logger.error(f"Failed to save metadata: {e}")

    def list_podcasts(self, limit: int = 10) -> List[PodcastMetadata]:
        """List up to `limit` generated podcasts, most recent first."""
        try:
            data = json.loads(self.metadata_file.read_text(encoding="utf-8"))
            podcasts = [PodcastMetadata(**item) for item in data[-limit:]]
            return list(reversed(podcasts))  # Most recent first
        except Exception as e:
            logger.error(f"Failed to list podcasts: {e}")
            return []

    def get_podcast(self, podcast_id: str) -> Optional[PodcastMetadata]:
        """Get metadata for a specific podcast, or None if not found."""
        try:
            data = json.loads(self.metadata_file.read_text(encoding="utf-8"))
            for item in data:
                if item.get('podcast_id') == podcast_id:
                    return PodcastMetadata(**item)
            return None
        except Exception as e:
            logger.error(f"Failed to get podcast: {e}")
            return None