import asyncio
import json
import logging
import re
import uuid
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    from elevenlabs import VoiceSettings
    from elevenlabs.client import ElevenLabs
    ELEVENLABS_AVAILABLE = True
except ImportError:
    ELEVENLABS_AVAILABLE = False

import config
from services.llamaindex_service import LlamaIndexService
from services.llm_service import LLMService

logger = logging.getLogger(__name__)


@dataclass
class DocumentAnalysis:
    """Analysis results from document(s)"""
    key_insights: List[str]  # 5-7 main points
    topics: List[str]
    complexity_level: str  # beginner, intermediate, advanced
    estimated_words: int
    source_documents: List[str]
    summary: str


@dataclass
class DialogueLine:
    """Single line of podcast dialogue"""
    speaker: str  # "HOST1" or "HOST2"
    text: str
    pause_after: float = 0.5  # seconds


@dataclass
class PodcastScript:
    """Complete podcast script"""
    dialogue: List[DialogueLine]
    total_duration_estimate: float
    word_count: int
    style: str

    def to_text(self) -> str:
        """Convert to readable transcript"""
        lines = []
        for line in self.dialogue:
            lines.append(f"{line.speaker}: {line.text}")
        return "\n\n".join(lines)


@dataclass
class PodcastMetadata:
    """Metadata for generated podcast"""
    podcast_id: str
    title: str
    description: str
    source_documents: List[str]
    style: str
    duration_seconds: float
    file_size_mb: float
    voices: Dict[str, str]
    generated_at: str
    generation_cost: Dict[str, float]
    key_topics: List[str]


@dataclass
class PodcastResult:
    """Complete podcast generation result"""
    podcast_id: str
    audio_file_path: str
    transcript: str
    metadata: Optional[PodcastMetadata]  # None when generation fails
    generation_time: float
    success: bool
    error: Optional[str] = None


class PodcastGeneratorService:
    """
    Service for generating conversational podcasts from documents.
    Combines LlamaIndex for analysis and ElevenLabs for voice synthesis.
    """

    # Word count per minute for podcast pacing
    WORDS_PER_MINUTE = 150

    # Script generation prompts for different styles
    SCRIPT_PROMPTS = {
        "conversational": """You are an expert podcast script writer. Create an engaging 2-host podcast discussing insights from documents.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Conversational, friendly, and accessible
- Format: Alternating dialogue between HOST1 and HOST2
- Include natural transitions, questions, and "aha!" moments
- Make complex topics easy to understand
- Add enthusiasm and genuine curiosity
- Balance speaking time between both hosts

DIALOGUE FORMAT (strictly follow):
HOST1: [What they say]
HOST2: [What they say]

STRUCTURE:
1. Opening Hook (30 seconds): Grab attention with an intriguing question or fact
2. Introduction (1 minute): Set context and preview what's coming
3. Main Discussion (70% of time): Deep dive into key insights
4. Wrap-up (1 minute): Summarize key takeaways and final thoughts

TONE: Friendly, enthusiastic, educational but not condescending

Generate the complete podcast script now:""",
        "educational": """You are creating an educational podcast script. Two hosts discuss document insights in a clear, instructive manner.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Clear, methodical, educational
- HOST1 acts as the teacher/expert, HOST2 as the curious learner
- Include explanations of complex concepts
- Use examples and analogies
- Build knowledge progressively

DIALOGUE FORMAT:
HOST1: [Expert explanation]
HOST2: [Clarifying question or observation]

Generate the complete educational podcast script now:""",
        "technical": """You are writing a technical podcast for an informed audience. Discuss document insights with precision and depth.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Professional, detailed, technically accurate
- HOST1 is the subject matter expert, HOST2 is an informed interviewer
- Use proper technical terminology
- Dive into implementation details
- Discuss implications and applications

DIALOGUE FORMAT:
HOST1: [Technical insight]
HOST2: [Probing question]

Generate the complete technical podcast script now:""",
        "casual": """You are creating a fun, casual podcast. Two friends discuss interesting ideas from documents.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Relaxed, humorous, energetic
- Both hosts are enthusiastic and engaged
- Use casual language and occasional humor
- Make it entertaining while staying informative
- Quick pacing with energy

DIALOGUE FORMAT:
HOST1: [Casual commentary]
HOST2: [Enthusiastic response]

Generate the complete casual podcast script now:""",
    }

    def __init__(
        self,
        llamaindex_service: LlamaIndexService,
        llm_service: LLMService,
        elevenlabs_api_key: Optional[str] = None,
    ):
        """
        Initialize podcast generator service

        Args:
            llamaindex_service: Service for document analysis
            llm_service: Service for script generation
            elevenlabs_api_key: ElevenLabs API key (uses config if not provided)
        """
        self.config = config.config
        self.llamaindex_service = llamaindex_service
        self.llm_service = llm_service

        # Initialize ElevenLabs client
        self.elevenlabs_client = None
        if ELEVENLABS_AVAILABLE:
            api_key = elevenlabs_api_key or self.config.ELEVENLABS_API_KEY
            if api_key:
                try:
                    self.elevenlabs_client = ElevenLabs(api_key=api_key)
                    logger.info("ElevenLabs client initialized for podcast generation")
                except Exception as e:
                    logger.error(f"Failed to initialize ElevenLabs client: {e}")

        # Create podcast storage directory
        self.podcast_dir = Path("./data/podcasts")
        self.podcast_dir.mkdir(parents=True, exist_ok=True)

        # Metadata database file
        self.metadata_file = self.podcast_dir / "metadata_db.json"
        self._ensure_metadata_db()

    def _ensure_metadata_db(self):
        """Ensure metadata database exists"""
        if not self.metadata_file.exists():
            self.metadata_file.write_text(json.dumps([], indent=2))

    async def generate_podcast(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam",
    ) -> PodcastResult:
        """
        Generate a complete podcast from documents

        Args:
            document_ids: List of document IDs to analyze
            style: Podcast style (conversational, educational, technical, casual)
            duration_minutes: Target duration in minutes
            host1_voice: Voice name for first host
            host2_voice: Voice name for second host

        Returns:
            PodcastResult with audio file path and metadata
        """
        start_time = datetime.now()
        podcast_id = str(uuid.uuid4())

        try:
            logger.info(f"Starting podcast generation {podcast_id}")
            logger.info(
                f"Documents: {document_ids}, Style: {style}, Duration: {duration_minutes}min"
            )
            # Step 1: Analyze documents
            logger.info("Step 1: Analyzing documents...")
            analysis = await self.analyze_documents(document_ids)

            # Step 2: Generate script
            logger.info("Step 2: Generating podcast script...")
            script = await self.generate_script(analysis, style, duration_minutes)

            # Step 3: Synthesize audio
            logger.info("Step 3: Synthesizing audio with voices...")
            audio_file_path = await self.synthesize_audio(
                podcast_id, script, host1_voice, host2_voice
            )

            # Calculate generation time
            generation_time = (datetime.now() - start_time).total_seconds()

            # Step 4: Create metadata
            logger.info("Step 4: Creating metadata...")
            # Pass the voices as an ordered tuple; a set would deduplicate
            # identical names and lose the host1/host2 ordering.
            metadata = self._create_metadata(
                podcast_id,
                analysis,
                script,
                audio_file_path,
                (host1_voice, host2_voice),
                document_ids,
                style,
            )

            # Save metadata
            self._save_metadata(metadata)

            # Save transcript
            transcript_path = self.podcast_dir / f"{podcast_id}_transcript.txt"
            transcript_path.write_text(script.to_text(), encoding="utf-8")

            logger.info(f"Podcast generated successfully: {podcast_id}")

            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path=str(audio_file_path),
                transcript=script.to_text(),
                metadata=metadata,
                generation_time=generation_time,
                success=True,
            )

        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}", exc_info=True)
            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path="",
                transcript="",
                metadata=None,
                generation_time=(datetime.now() - start_time).total_seconds(),
                success=False,
                error=str(e),
            )

    async def analyze_documents(self, document_ids: List[str]) -> DocumentAnalysis:
        """
        Analyze documents to extract key insights for podcast

        Args:
            document_ids: List of document IDs

        Returns:
            DocumentAnalysis with key insights and topics
        """
        # Create analysis query for the agentic RAG
        analysis_query = f"""Analyze the following documents and provide:
1. The 5-7 most important insights or key points
2. Main themes and topics covered
3. The overall complexity level (beginner/intermediate/advanced)
4. A brief summary suitable for podcast discussion

Document IDs: {', '.join(document_ids)}

Provide a structured analysis optimized for creating an engaging podcast discussion."""

        # Use LlamaIndex agentic RAG for analysis
        result = await self.llamaindex_service.query(analysis_query)

        # Parse the result to extract structured information.
        # This is a simplified parser - in production, you might want more robust parsing.
        insights = self._extract_insights(result)
        topics = self._extract_topics(result)
        complexity = self._determine_complexity(result)

        return DocumentAnalysis(
            key_insights=insights[:7],  # Limit to 7
            topics=topics,
            complexity_level=complexity,
            estimated_words=len(result.split()),
            source_documents=document_ids,
            summary=result,
        )
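    # Illustrative example (not from the source): _extract_insights below keeps
    # bullet or numbered lines such as
    #     "1. Attention lets the model weigh distant tokens."
    #     "- Training cost grows with sequence length."
    # and discards fragments of 20 characters or fewer.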
    def _extract_insights(self, text: str) -> List[str]:
        """Extract key insights from analysis text"""
        insights = []

        # Simple extraction based on numbered lists or bullet points
        lines = text.split('\n')
        for line in lines:
            line = line.strip()
            # Match leading markers like "1.", "-", "*", "•"
            if re.match(r'^(\d+\.|[-*•])', line):
                # Anchor the sub at the start so hyphens inside the insight
                # text are not stripped as well
                insight = re.sub(r'^(\d+\.|[-*•])\s*', '', line).strip()
                if len(insight) > 20:  # Ensure it's substantial
                    insights.append(insight)

        # If no insights found, create from first few sentences
        if not insights:
            sentences = text.split('.')
            insights = [
                s.strip() + '.' for s in sentences[:7] if len(s.strip()) > 20
            ]

        return insights

    def _extract_topics(self, text: str) -> List[str]:
        """Extract main topics from analysis"""
        # Simple keyword extraction - could be enhanced with NLP
        common_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
            'to', 'for', 'of', 'with', 'by',
        }

        words = text.lower().split()
        word_freq = {}
        for word in words:
            word = re.sub(r'[^\w\s]', '', word)
            if len(word) > 4 and word not in common_words:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Get top topics
        topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
        return [topic[0].title() for topic in topics]

    def _determine_complexity(self, text: str) -> str:
        """Determine content complexity level"""
        text_lower = text.lower()

        # Simple heuristic based on keywords
        if any(word in text_lower for word in ['basic', 'introduction', 'beginner', 'simple']):
            return "beginner"
        elif any(word in text_lower for word in ['advanced', 'complex', 'sophisticated', 'expert']):
            return "advanced"
        else:
            return "intermediate"

    async def generate_script(
        self,
        analysis: DocumentAnalysis,
        style: str,
        duration_minutes: int,
    ) -> PodcastScript:
        """
        Generate podcast script from analysis

        Args:
            analysis: Document analysis results
            style: Podcast style
            duration_minutes: Target duration

        Returns:
            Complete podcast script
        """
        # Calculate target word count
        target_words = duration_minutes * self.WORDS_PER_MINUTE

        # Prepare analysis context
        analysis_context = f"""
KEY INSIGHTS:
{chr(10).join(f"{i+1}. {insight}" for i, insight in enumerate(analysis.key_insights))}

TOPICS: {', '.join(analysis.topics)}

COMPLEXITY: {analysis.complexity_level}

SUMMARY: {analysis.summary[:500]}...
"""

        # Get prompt template for style
        prompt_template = self.SCRIPT_PROMPTS.get(style, self.SCRIPT_PROMPTS["conversational"])

        # Fill in the template
        prompt = prompt_template.format(
            analysis=analysis_context,
            duration_minutes=duration_minutes,
            word_count=target_words,
        )

        # Generate script using LLM
        script_text = await self.llm_service.generate_text(
            prompt,
            max_tokens=target_words * 2,  # Give room for generation
            temperature=0.8,  # More creative
        )

        # Parse script into dialogue lines
        dialogue = self._parse_script(script_text)

        # Calculate actual word count and duration
        word_count = sum(len(line.text.split()) for line in dialogue)
        duration_estimate = word_count / self.WORDS_PER_MINUTE

        return PodcastScript(
            dialogue=dialogue,
            total_duration_estimate=duration_estimate * 60,  # Convert to seconds
            word_count=word_count,
            style=style,
        )
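    # Illustrative example (not from the source): given LLM output like
    #
    #     HOST1: Welcome back to the show!
    #     HOST2: Today we're digging into something fascinating.
    #
    # _parse_script below yields
    #     [DialogueLine(speaker="HOST1", text="Welcome back to the show!"),
    #      DialogueLine(speaker="HOST2", text="Today we're digging into something fascinating.")]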
    def _parse_script(self, script_text: str) -> List[DialogueLine]:
        """Parse generated script into dialogue lines"""
        dialogue = []
        lines = script_text.split('\n')

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Match "HOST1:" or "HOST2:" format
            if line.startswith('HOST1:'):
                text = line[6:].strip()  # len("HOST1:") == 6
                if text:
                    dialogue.append(DialogueLine(speaker="HOST1", text=text))
            elif line.startswith('HOST2:'):
                text = line[6:].strip()
                if text:
                    dialogue.append(DialogueLine(speaker="HOST2", text=text))

        return dialogue

    def _get_voice_id(self, voice_name: str) -> str:
        """
        Get voice ID from voice name.
        Falls back to first available voice if not found.

        Args:
            voice_name: Voice name (e.g., "Rachel", "Adam")

        Returns:
            Voice ID string
        """
        try:
            # Try to get voices and find by name
            voices = self.elevenlabs_client.voices.get_all()

            if not voices or not voices.voices:
                logger.error("No voices available from ElevenLabs")
                raise RuntimeError("No voices available")

            # First, try exact name match
            for voice in voices.voices:
                if voice.name.lower() == voice_name.lower():
                    logger.info(f"Found exact voice match for '{voice_name}': {voice.voice_id}")
                    return voice.voice_id

            # Try partial match
            for voice in voices.voices:
                if voice_name.lower() in voice.name.lower():
                    logger.info(
                        f"Found partial voice match for '{voice_name}': "
                        f"{voice.name} ({voice.voice_id})"
                    )
                    return voice.voice_id

            # Use first available voice as fallback
            first_voice = voices.voices[0]
            logger.warning(
                f"Voice '{voice_name}' not found, using first available voice: "
                f"{first_voice.name} ({first_voice.voice_id})"
            )
            return first_voice.voice_id

        except Exception as e:
            logger.error(f"Could not fetch voices: {e}", exc_info=True)
            raise RuntimeError(f"Failed to get voice ID: {str(e)}")

    async def synthesize_audio(
        self,
        podcast_id: str,
        script: PodcastScript,
        host1_voice: str,
        host2_voice: str,
    ) -> Path:
        """
        Synthesize audio from script using ElevenLabs

        Args:
            podcast_id: Unique podcast ID
            script: Podcast script
            host1_voice: Voice for HOST1
            host2_voice: Voice for HOST2 (not yet used by the single-voice implementation)

        Returns:
            Path to generated MP3 file
        """
        if not self.elevenlabs_client:
            raise RuntimeError("ElevenLabs client not initialized")

        audio_file = self.podcast_dir / f"{podcast_id}.mp3"

        # For now, create a simple text-to-speech for the full script.
        # In production, you'd combine segments with pauses.
        full_text = script.to_text()

        # Get actual voice ID
        voice_id = self._get_voice_id(host1_voice)

        try:
            # Use modern ElevenLabs TTS API.
            # Note: This is a simplified version using a single voice.
            # A full implementation would process each line separately with
            # different voices (see the sketch after this method).
            logger.info(f"Generating audio with voice: {host1_voice}")

            # Use the modern text_to_speech API
            audio_generator = self.elevenlabs_client.text_to_speech.convert(
                voice_id=voice_id,  # Using resolved voice ID
                text=full_text,
                model_id="eleven_multilingual_v2",
            )

            # Write audio chunks to file
            with open(audio_file, 'wb') as f:
                for chunk in audio_generator:
                    if chunk:
                        f.write(chunk)

            # Verify file was created with content (guard the stat() call so a
            # missing file doesn't raise FileNotFoundError here)
            size = audio_file.stat().st_size if audio_file.exists() else 0
            if size > 1000:
                logger.info(f"Audio synthesized successfully: {audio_file} ({size} bytes)")
                return audio_file
            raise RuntimeError(f"Generated audio file is too small or empty: {size} bytes")

        except Exception as e:
            logger.error(f"Audio synthesis failed: {e}", exc_info=True)
            raise RuntimeError(f"Failed to generate podcast audio: {str(e)}")
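    # Sketch of true two-voice synthesis (an assumption, not current behavior):
    # each DialogueLine would be rendered with its own speaker's voice and the
    # MP3 segments joined with silence matching `pause_after`. Assumes the
    # optional pydub dependency (plus `import io`); untested illustration only.
    #
    #     from pydub import AudioSegment
    #     host1_id = self._get_voice_id(host1_voice)
    #     host2_id = self._get_voice_id(host2_voice)
    #     combined = AudioSegment.empty()
    #     for line in script.dialogue:
    #         voice_id = host1_id if line.speaker == "HOST1" else host2_id
    #         chunks = self.elevenlabs_client.text_to_speech.convert(
    #             voice_id=voice_id,
    #             text=line.text,
    #             model_id="eleven_multilingual_v2",
    #         )
    #         segment = AudioSegment.from_file(io.BytesIO(b"".join(chunks)), format="mp3")
    #         combined += segment + AudioSegment.silent(duration=int(line.pause_after * 1000))
    #     combined.export(audio_file, format="mp3")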
    def _create_metadata(
        self,
        podcast_id: str,
        analysis: DocumentAnalysis,
        script: PodcastScript,
        audio_path: Path,
        voices: Tuple[str, str],
        document_ids: List[str],
        style: str,
    ) -> PodcastMetadata:
        """Create podcast metadata"""
        # Auto-generate title
        title = f"Podcast: {analysis.topics[0] if analysis.topics else 'Document Discussion'}"

        # Create description
        description = f"A {style} podcast discussing insights from {len(document_ids)} document(s)."

        # Calculate file size
        file_size_mb = audio_path.stat().st_size / (1024 * 1024) if audio_path.exists() else 0

        # Estimate costs (rough: a 1,500-word script gives
        # llm_cost ≈ $0.015 and tts_cost ≈ $2.25)
        llm_cost = (script.word_count / 1000) * 0.01  # Rough estimate
        tts_cost = (script.word_count * 5 / 1000) * 0.30  # Rough estimate

        return PodcastMetadata(
            podcast_id=podcast_id,
            title=title,
            description=description,
            source_documents=document_ids,
            style=style,
            duration_seconds=script.total_duration_estimate,
            file_size_mb=file_size_mb,
            voices={"host1": voices[0], "host2": voices[1]},
            generated_at=datetime.now().isoformat(),
            generation_cost={
                "llm_cost": llm_cost,
                "tts_cost": tts_cost,
                "total": llm_cost + tts_cost,
            },
            key_topics=analysis.topics,
        )

    def _save_metadata(self, metadata: PodcastMetadata):
        """Save metadata to database"""
        try:
            # Load existing metadata
            existing = json.loads(self.metadata_file.read_text())

            # Add new metadata
            existing.append(asdict(metadata))

            # Save back
            self.metadata_file.write_text(json.dumps(existing, indent=2))
            logger.info(f"Metadata saved for podcast: {metadata.podcast_id}")
        except Exception as e:
            logger.error(f"Failed to save metadata: {e}")

    def list_podcasts(self, limit: int = 10) -> List[PodcastMetadata]:
        """List generated podcasts"""
        try:
            data = json.loads(self.metadata_file.read_text())
            podcasts = [PodcastMetadata(**item) for item in data[-limit:]]
            return list(reversed(podcasts))  # Most recent first
        except Exception as e:
            logger.error(f"Failed to list podcasts: {e}")
            return []

    def get_podcast(self, podcast_id: str) -> Optional[PodcastMetadata]:
        """Get specific podcast metadata"""
        try:
            data = json.loads(self.metadata_file.read_text())
            for item in data:
                if item.get('podcast_id') == podcast_id:
                    return PodcastMetadata(**item)
            return None
        except Exception as e:
            logger.error(f"Failed to get podcast: {e}")
            return None
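

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): one way this service might be wired up and
# invoked. The zero-argument LlamaIndexService()/LLMService() constructors are
# assumptions; adjust to the real signatures in your codebase.
# ---------------------------------------------------------------------------
#
# async def main():
#     service = PodcastGeneratorService(
#         llamaindex_service=LlamaIndexService(),
#         llm_service=LLMService(),
#     )
#     result = await service.generate_podcast(
#         document_ids=["doc-123"],
#         style="educational",
#         duration_minutes=5,
#     )
#     if result.success:
#         print(f"Audio: {result.audio_file_path}")
#         print(f"Estimated cost: ${result.metadata.generation_cost['total']:.2f}")
#     else:
#         print(f"Generation failed: {result.error}")
#
# if __name__ == "__main__":
#     asyncio.run(main())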