"""Podcast generator service for the AI Digital Library Assistant.

File: services/podcast_generator_service.py (initial deployment, rev 86aa5e4).
Generates two-host conversational podcasts from indexed documents by
combining LlamaIndex analysis, LLM script writing, and ElevenLabs TTS.
"""
import logging
import asyncio
import json
import uuid
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
import re
try:
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
ELEVENLABS_AVAILABLE = True
except ImportError:
ELEVENLABS_AVAILABLE = False
import config
from services.llamaindex_service import LlamaIndexService
from services.llm_service import LLMService
logger = logging.getLogger(__name__)
@dataclass
class DocumentAnalysis:
    """Analysis results from document(s).

    Produced by PodcastGeneratorService.analyze_documents() and consumed
    by generate_script() when building the script prompt.
    """
    key_insights: List[str]  # 5-7 main points extracted from the analysis
    topics: List[str]  # top keywords from the analysis text (title-cased)
    complexity_level: str  # beginner, intermediate, advanced
    estimated_words: int  # word count of the raw analysis text
    source_documents: List[str]  # document IDs the analysis was built from
    summary: str  # full analysis text returned by the RAG query
@dataclass
class DialogueLine:
    """Single line of podcast dialogue."""
    speaker: str  # "HOST1" or "HOST2"
    text: str  # spoken content for this line
    pause_after: float = 0.5  # seconds; NOTE(review): not consumed by synthesize_audio yet
@dataclass
class PodcastScript:
    """Complete podcast script parsed from the LLM output.

    Holds the ordered dialogue plus bookkeeping figures used for metadata
    (duration estimate in seconds, total spoken word count, and the style
    the script was generated with).
    """
    dialogue: List[DialogueLine]
    total_duration_estimate: float
    word_count: int
    style: str

    def to_text(self) -> str:
        """Render the script as a readable transcript.

        Each line is formatted as ``SPEAKER: text`` and lines are separated
        by a blank line.
        """
        return "\n\n".join(
            f"{entry.speaker}: {entry.text}" for entry in self.dialogue
        )
@dataclass
class PodcastMetadata:
    """Metadata for a generated podcast, persisted to the JSON metadata DB."""
    podcast_id: str  # UUID assigned at generation time
    title: str  # auto-generated from the top extracted topic
    description: str  # short human-readable summary of the episode
    source_documents: List[str]  # document IDs the podcast was built from
    style: str  # conversational, educational, technical, or casual
    duration_seconds: float  # estimated duration derived from word count
    file_size_mb: float  # size of the generated MP3 file
    voices: Dict[str, str]  # {"host1": voice_name, "host2": voice_name}
    generated_at: str  # ISO-8601 timestamp
    generation_cost: Dict[str, float]  # rough estimates: llm_cost, tts_cost, total
    key_topics: List[str]  # topics extracted during document analysis
@dataclass
class PodcastResult:
    """Complete podcast generation result returned by generate_podcast()."""
    podcast_id: str  # UUID assigned at the start of generation
    audio_file_path: str  # path to the MP3; empty string on failure
    transcript: str  # full transcript text; empty string on failure
    metadata: Optional[PodcastMetadata]  # None when generation failed
    generation_time: float  # wall-clock seconds spent generating
    success: bool  # True if all pipeline steps completed
    error: Optional[str] = None  # error message when success is False
class PodcastGeneratorService:
    """
    Service for generating conversational podcasts from documents.

    Pipeline: analyze documents via the LlamaIndex agentic RAG, generate a
    two-host script with the LLM, synthesize audio with ElevenLabs, then
    persist the MP3, transcript and metadata under ./data/podcasts.
    """

    # Word count per minute for podcast pacing
    WORDS_PER_MINUTE = 150

    # Script generation prompts for different styles
    SCRIPT_PROMPTS = {
        "conversational": """You are an expert podcast script writer. Create an engaging 2-host podcast discussing insights from documents.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Conversational, friendly, and accessible
- Format: Alternating dialogue between HOST1 and HOST2
- Include natural transitions, questions, and "aha!" moments
- Make complex topics easy to understand
- Add enthusiasm and genuine curiosity
- Balance speaking time between both hosts
DIALOGUE FORMAT (strictly follow):
HOST1: [What they say]
HOST2: [What they say]
STRUCTURE:
1. Opening Hook (30 seconds): Grab attention with an intriguing question or fact
2. Introduction (1 minute): Set context and preview what's coming
3. Main Discussion (70% of time): Deep dive into key insights
4. Wrap-up (1 minute): Summarize key takeaways and final thoughts
TONE: Friendly, enthusiastic, educational but not condescending
Generate the complete podcast script now:""",
        "educational": """You are creating an educational podcast script. Two hosts discuss document insights in a clear, instructive manner.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Clear, methodical, educational
- HOST1 acts as the teacher/expert, HOST2 as the curious learner
- Include explanations of complex concepts
- Use examples and analogies
- Build knowledge progressively
DIALOGUE FORMAT:
HOST1: [Expert explanation]
HOST2: [Clarifying question or observation]
Generate the complete educational podcast script now:""",
        "technical": """You are writing a technical podcast for an informed audience. Discuss document insights with precision and depth.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Professional, detailed, technically accurate
- HOST1 is the subject matter expert, HOST2 is an informed interviewer
- Use proper technical terminology
- Dive into implementation details
- Discuss implications and applications
DIALOGUE FORMAT:
HOST1: [Technical insight]
HOST2: [Probing question]
Generate the complete technical podcast script now:""",
        "casual": """You are creating a fun, casual podcast. Two friends discuss interesting ideas from documents.
CONTEXT:
{analysis}
REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Relaxed, humorous, energetic
- Both hosts are enthusiastic and engaged
- Use casual language and occasional humor
- Make it entertaining while staying informative
- Quick pacing with energy
DIALOGUE FORMAT:
HOST1: [Casual commentary]
HOST2: [Enthusiastic response]
Generate the complete casual podcast script now:"""
    }

    def __init__(
        self,
        llamaindex_service: LlamaIndexService,
        llm_service: LLMService,
        elevenlabs_api_key: Optional[str] = None
    ):
        """
        Initialize podcast generator service.

        Args:
            llamaindex_service: Service for document analysis
            llm_service: Service for script generation
            elevenlabs_api_key: ElevenLabs API key (uses config if not provided)
        """
        self.config = config.config
        self.llamaindex_service = llamaindex_service
        self.llm_service = llm_service

        # Initialize ElevenLabs client; left as None when the package or key
        # is missing so synthesize_audio can raise a clear error later.
        self.elevenlabs_client = None
        if ELEVENLABS_AVAILABLE:
            api_key = elevenlabs_api_key or self.config.ELEVENLABS_API_KEY
            if api_key:
                try:
                    self.elevenlabs_client = ElevenLabs(api_key=api_key)
                    logger.info("ElevenLabs client initialized for podcast generation")
                except Exception as e:
                    logger.error(f"Failed to initialize ElevenLabs client: {e}")

        # Create podcast storage directory
        self.podcast_dir = Path("./data/podcasts")
        self.podcast_dir.mkdir(parents=True, exist_ok=True)

        # JSON file acting as a tiny append-only metadata database
        self.metadata_file = self.podcast_dir / "metadata_db.json"
        self._ensure_metadata_db()

    def _ensure_metadata_db(self):
        """Create an empty metadata database file if it does not exist."""
        if not self.metadata_file.exists():
            self.metadata_file.write_text(json.dumps([], indent=2), encoding="utf-8")

    async def generate_podcast(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> PodcastResult:
        """
        Generate a complete podcast from documents.

        Args:
            document_ids: List of document IDs to analyze
            style: Podcast style (conversational, educational, technical, casual)
            duration_minutes: Target duration in minutes
            host1_voice: Voice name for first host
            host2_voice: Voice name for second host

        Returns:
            PodcastResult with audio file path and metadata; on failure,
            success is False, metadata is None and error holds the message.
        """
        start_time = datetime.now()
        podcast_id = str(uuid.uuid4())
        try:
            logger.info(f"Starting podcast generation {podcast_id}")
            logger.info(f"Documents: {document_ids}, Style: {style}, Duration: {duration_minutes}min")

            # Step 1: Analyze documents
            logger.info("Step 1: Analyzing documents...")
            analysis = await self.analyze_documents(document_ids)

            # Step 2: Generate script
            logger.info("Step 2: Generating podcast script...")
            script = await self.generate_script(analysis, style, duration_minutes)

            # Step 3: Synthesize audio
            logger.info("Step 3: Synthesizing audio with voices...")
            audio_file_path = await self.synthesize_audio(
                podcast_id,
                script,
                host1_voice,
                host2_voice
            )

            # Calculate generation time
            generation_time = (datetime.now() - start_time).total_seconds()

            # Step 4: Create metadata.  Voices are passed as an ORDERED tuple
            # (a set would lose the host1/host2 association and collapse
            # duplicate voice names).
            logger.info("Step 4: Creating metadata...")
            metadata = self._create_metadata(
                podcast_id,
                analysis,
                script,
                audio_file_path,
                (host1_voice, host2_voice),
                document_ids,
                style
            )

            # Save metadata
            self._save_metadata(metadata)

            # Save transcript
            transcript_path = self.podcast_dir / f"{podcast_id}_transcript.txt"
            transcript_path.write_text(script.to_text(), encoding="utf-8")

            logger.info(f"Podcast generated successfully: {podcast_id}")
            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path=str(audio_file_path),
                transcript=script.to_text(),
                metadata=metadata,
                generation_time=generation_time,
                success=True
            )
        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}", exc_info=True)
            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path="",
                transcript="",
                metadata=None,
                generation_time=(datetime.now() - start_time).total_seconds(),
                success=False,
                error=str(e)
            )

    async def analyze_documents(self, document_ids: List[str]) -> DocumentAnalysis:
        """
        Analyze documents to extract key insights for the podcast.

        Args:
            document_ids: List of document IDs

        Returns:
            DocumentAnalysis with key insights, topics, complexity level and
            the full analysis text as the summary.
        """
        # Create analysis query for the agentic RAG
        analysis_query = f"""Analyze the following documents and provide:
1. The 5-7 most important insights or key points
2. Main themes and topics covered
3. The overall complexity level (beginner/intermediate/advanced)
4. A brief summary suitable for podcast discussion
Document IDs: {', '.join(document_ids)}
Provide a structured analysis optimized for creating an engaging podcast discussion."""

        # Use LlamaIndex agentic RAG for analysis
        result = await self.llamaindex_service.query(analysis_query)

        # Parse the result to extract structured information.
        # This is a simplified parser - in production, you might want more
        # robust parsing (e.g. asking the LLM for JSON output).
        insights = self._extract_insights(result)
        topics = self._extract_topics(result)
        complexity = self._determine_complexity(result)

        return DocumentAnalysis(
            key_insights=insights[:7],  # Limit to 7
            topics=topics,
            complexity_level=complexity,
            estimated_words=len(result.split()),
            source_documents=document_ids,
            summary=result
        )

    def _extract_insights(self, text: str) -> List[str]:
        """Extract key insights from analysis text.

        Collects lines that look like numbered-list or bullet-point items;
        falls back to the first few sentences when no list markers appear.
        """
        insights = []
        # Marker must be anchored to the START of the line and removed only
        # once.  The previous unanchored pattern also deleted '-', '*' and
        # '•' occurring anywhere in the line, mangling words such as
        # "state-of-the-art".
        marker = re.compile(r'^(?:\d+\.|[-*•])\s*')
        for line in text.split('\n'):
            line = line.strip()
            if marker.match(line):
                insight = marker.sub('', line, count=1).strip()
                if len(insight) > 20:  # Ensure it's substantial
                    insights.append(insight)

        # If no insights found, create from first few sentences
        if not insights:
            sentences = text.split('.')
            insights = [s.strip() + '.' for s in sentences[:7] if len(s.strip()) > 20]
        return insights

    def _extract_topics(self, text: str) -> List[str]:
        """Extract up to 5 main topics by word frequency.

        Simple keyword extraction (words longer than 4 chars, minus a small
        stop-word list) - could be enhanced with NLP.
        """
        common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        words = text.lower().split()
        word_freq = {}
        for word in words:
            word = re.sub(r'[^\w\s]', '', word)  # strip punctuation
            if len(word) > 4 and word not in common_words:
                word_freq[word] = word_freq.get(word, 0) + 1
        # Get top topics by descending frequency
        topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
        return [topic[0].title() for topic in topics]

    def _determine_complexity(self, text: str) -> str:
        """Classify content as beginner/intermediate/advanced.

        Simple keyword heuristic; beginner cues win over advanced cues, and
        intermediate is the default.
        """
        text_lower = text.lower()
        if any(word in text_lower for word in ['basic', 'introduction', 'beginner', 'simple']):
            return "beginner"
        elif any(word in text_lower for word in ['advanced', 'complex', 'sophisticated', 'expert']):
            return "advanced"
        else:
            return "intermediate"

    async def generate_script(
        self,
        analysis: DocumentAnalysis,
        style: str,
        duration_minutes: int
    ) -> PodcastScript:
        """
        Generate a podcast script from the document analysis.

        Args:
            analysis: Document analysis results
            style: Podcast style (falls back to "conversational" if unknown)
            duration_minutes: Target duration

        Returns:
            Complete podcast script with duration/word-count estimates.
        """
        # Calculate target word count from the pacing constant
        target_words = duration_minutes * self.WORDS_PER_MINUTE

        # Prepare analysis context for the prompt
        analysis_context = f"""
KEY INSIGHTS:
{chr(10).join(f"{i+1}. {insight}" for i, insight in enumerate(analysis.key_insights))}
TOPICS: {', '.join(analysis.topics)}
COMPLEXITY: {analysis.complexity_level}
SUMMARY:
{analysis.summary[:500]}...
"""

        # Get prompt template for style (unknown styles use conversational)
        prompt_template = self.SCRIPT_PROMPTS.get(style, self.SCRIPT_PROMPTS["conversational"])
        prompt = prompt_template.format(
            analysis=analysis_context,
            duration_minutes=duration_minutes,
            word_count=target_words
        )

        # Generate script using LLM
        script_text = await self.llm_service.generate_text(
            prompt,
            max_tokens=target_words * 2,  # Give room for generation
            temperature=0.8  # More creative
        )

        # Parse script into dialogue lines
        dialogue = self._parse_script(script_text)

        # Calculate actual word count and duration from the parsed dialogue
        word_count = sum(len(line.text.split()) for line in dialogue)
        duration_estimate = word_count / self.WORDS_PER_MINUTE

        return PodcastScript(
            dialogue=dialogue,
            total_duration_estimate=duration_estimate * 60,  # Convert to seconds
            word_count=word_count,
            style=style
        )

    def _parse_script(self, script_text: str) -> List[DialogueLine]:
        """Parse a generated script into DialogueLine objects.

        Only lines starting with the literal prefixes "HOST1:" / "HOST2:"
        are kept; everything else (stage directions, blank lines) is dropped.
        """
        dialogue = []
        for line in script_text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line.startswith('HOST1:'):
                text = line[6:].strip()  # drop the 6-char "HOST1:" prefix
                if text:
                    dialogue.append(DialogueLine(speaker="HOST1", text=text))
            elif line.startswith('HOST2:'):
                text = line[6:].strip()
                if text:
                    dialogue.append(DialogueLine(speaker="HOST2", text=text))
        return dialogue

    def _get_voice_id(self, voice_name: str) -> str:
        """
        Resolve a voice name to an ElevenLabs voice ID.

        Tries exact match, then partial match, then falls back to the first
        available voice.

        Args:
            voice_name: Voice name (e.g., "Rachel", "Adam")

        Returns:
            Voice ID string

        Raises:
            RuntimeError: if the voice list cannot be fetched or is empty.
        """
        try:
            voices = self.elevenlabs_client.voices.get_all()
            if not voices or not voices.voices:
                logger.error("No voices available from ElevenLabs")
                raise RuntimeError("No voices available")

            # First, try exact name match (case-insensitive)
            for voice in voices.voices:
                if voice.name.lower() == voice_name.lower():
                    logger.info(f"Found exact voice match for '{voice_name}': {voice.voice_id}")
                    return voice.voice_id

            # Try partial match
            for voice in voices.voices:
                if voice_name.lower() in voice.name.lower():
                    logger.info(f"Found partial voice match for '{voice_name}': {voice.name} ({voice.voice_id})")
                    return voice.voice_id

            # Use first available voice as fallback
            first_voice = voices.voices[0]
            logger.warning(f"Voice '{voice_name}' not found, using first available voice: {first_voice.name} ({first_voice.voice_id})")
            return first_voice.voice_id
        except Exception as e:
            logger.error(f"Could not fetch voices: {e}", exc_info=True)
            raise RuntimeError(f"Failed to get voice ID: {str(e)}")

    async def synthesize_audio(
        self,
        podcast_id: str,
        script: PodcastScript,
        host1_voice: str,
        host2_voice: str
    ) -> Path:
        """
        Synthesize audio from the script using ElevenLabs.

        NOTE: current implementation reads the WHOLE transcript with the
        host1 voice only; host2_voice is accepted for interface stability.
        A full implementation would synthesize each DialogueLine with its
        own voice and concatenate segments with pauses.

        Args:
            podcast_id: Unique podcast ID
            script: Podcast script
            host1_voice: Voice for HOST1
            host2_voice: Voice for HOST2 (currently unused - see note)

        Returns:
            Path to generated MP3 file

        Raises:
            RuntimeError: if the client is missing or synthesis fails.
        """
        if not self.elevenlabs_client:
            raise RuntimeError("ElevenLabs client not initialized")

        audio_file = self.podcast_dir / f"{podcast_id}.mp3"
        full_text = script.to_text()

        # Resolve the display name to an actual voice ID
        voice_id = self._get_voice_id(host1_voice)
        try:
            logger.info(f"Generating audio with voice: {host1_voice}")
            # Modern streaming text_to_speech API; yields audio chunks
            audio_generator = self.elevenlabs_client.text_to_speech.convert(
                voice_id=voice_id,
                text=full_text,
                model_id="eleven_multilingual_v2"
            )

            # Write audio chunks to file
            with open(audio_file, 'wb') as f:
                for chunk in audio_generator:
                    if chunk:
                        f.write(chunk)

            # Verify file was created with content (tiny files indicate an
            # API error body rather than real audio)
            if audio_file.exists() and audio_file.stat().st_size > 1000:
                logger.info(f"Audio synthesized successfully: {audio_file} ({audio_file.stat().st_size} bytes)")
                return audio_file
            else:
                raise RuntimeError(f"Generated audio file is too small or empty: {audio_file.stat().st_size} bytes")
        except Exception as e:
            logger.error(f"Audio synthesis failed: {e}", exc_info=True)
            raise RuntimeError(f"Failed to generate podcast audio: {str(e)}")

    def _create_metadata(
        self,
        podcast_id: str,
        analysis: DocumentAnalysis,
        script: PodcastScript,
        audio_path: Path,
        voices: tuple,
        document_ids: List[str],
        style: str
    ) -> PodcastMetadata:
        """Create podcast metadata.

        Args:
            voices: ordered pair (host1_voice, host2_voice).  An ordered
                sequence is required so host1/host2 keep their assignment
                (the previous set-based version lost ordering and collapsed
                duplicate voice names).
        """
        # Auto-generate title from the most frequent topic
        title = f"Podcast: {analysis.topics[0] if analysis.topics else 'Document Discussion'}"
        description = f"A {style} podcast discussing insights from {len(document_ids)} document(s)."

        # Calculate file size in MB (0 if synthesis produced no file)
        file_size_mb = audio_path.stat().st_size / (1024 * 1024) if audio_path.exists() else 0

        # Rough cost estimates based on word count
        llm_cost = (script.word_count / 1000) * 0.01
        tts_cost = (script.word_count * 5 / 1000) * 0.30

        return PodcastMetadata(
            podcast_id=podcast_id,
            title=title,
            description=description,
            source_documents=document_ids,
            style=style,
            duration_seconds=script.total_duration_estimate,
            file_size_mb=file_size_mb,
            voices={"host1": voices[0] if len(voices) > 0 else "Rachel",
                    "host2": voices[1] if len(voices) > 1 else "Adam"},
            generated_at=datetime.now().isoformat(),
            generation_cost={"llm_cost": llm_cost, "tts_cost": tts_cost, "total": llm_cost + tts_cost},
            key_topics=analysis.topics
        )

    def _save_metadata(self, metadata: PodcastMetadata):
        """Append metadata to the JSON database (best-effort; logs on failure)."""
        try:
            # Load, append, save back.  utf-8 explicitly, to match the
            # transcript files and avoid platform-default encodings.
            existing = json.loads(self.metadata_file.read_text(encoding="utf-8"))
            existing.append(asdict(metadata))
            self.metadata_file.write_text(json.dumps(existing, indent=2), encoding="utf-8")
            logger.info(f"Metadata saved for podcast: {metadata.podcast_id}")
        except Exception as e:
            logger.error(f"Failed to save metadata: {e}")

    def list_podcasts(self, limit: int = 10) -> List[PodcastMetadata]:
        """List up to `limit` generated podcasts, most recent first."""
        try:
            data = json.loads(self.metadata_file.read_text(encoding="utf-8"))
            podcasts = [PodcastMetadata(**item) for item in data[-limit:]]
            return list(reversed(podcasts))  # Most recent first
        except Exception as e:
            logger.error(f"Failed to list podcasts: {e}")
            return []

    def get_podcast(self, podcast_id: str) -> Optional[PodcastMetadata]:
        """Get metadata for a specific podcast, or None if not found."""
        try:
            data = json.loads(self.metadata_file.read_text(encoding="utf-8"))
            for item in data:
                if item.get('podcast_id') == podcast_id:
                    return PodcastMetadata(**item)
            return None
        except Exception as e:
            logger.error(f"Failed to get podcast: {e}")
            return None