# papercast/agents/podcast_agent.py
# Author: batuhanozkose
# feat: Implement Podcast Persona Framework (PPF) - Revolutionary adaptive conversation system
# commit: 2bd49fc
import time
from generation.script_generator import get_generator
from processing.pdf_reader import extract_text_from_pdf
from processing.url_fetcher import fetch_paper_from_url
from synthesis.tts_engine import get_tts_engine
from utils.config import (
MAX_CONTEXT_CHARS,
)
from utils.history import save_to_history
class PodcastAgent:
    """Converts research papers (URLs or uploaded PDFs) into podcast audio.

    Pipeline: fetch/read paper text -> LLM script generation -> TTS synthesis.
    All credentials are supplied by the user (Bring-Your-Own-Key); nothing is
    read from the environment here.

    ``process`` and ``process_multiple`` are generators: they yield
    timestamped log lines for live progress display, and their generator
    return value (``StopIteration.value``) is ``(audio_path, joined_logs)``
    on success or ``(None, joined_logs)`` on failure.
    """

    def __init__(
        self,
        provider_mode="own_inference",
        own_base_url=None,
        own_api_key=None,
        own_model=None,
        openai_key=None,
        openai_model=None,
        tts_provider="elevenlabs",
        elevenlabs_key=None,
        host_voice=None,
        guest_voice=None,
        max_tokens=None,
        target_dialogue_count=15,
        context_limit=None,
        persona_mode="friendly_explainer",
    ):
        """
        Initialize PodcastAgent with user-provided settings (BYOK).

        Args:
            provider_mode: "own_inference" or "openai"
            own_base_url: Base URL for own inference server
            own_api_key: API key for own inference server
            own_model: Model name for own inference server
            openai_key: OpenAI API key
            openai_model: OpenAI model name
            tts_provider: "elevenlabs" (ElevenLabs required)
            elevenlabs_key: ElevenLabs API key (required)
            host_voice: Voice ID for host
            guest_voice: Voice ID for guest
            max_tokens: Maximum tokens for generation
            target_dialogue_count: Target number of dialogue exchanges (default: 15)
            context_limit: Maximum characters for multi-paper processing
                (default: MAX_CONTEXT_CHARS)
            persona_mode: Podcast persona mode (default: "friendly_explainer")
        """
        self.logs = []  # accumulated log lines, joined for the final result
        self.provider_mode = provider_mode  # "own_inference" or "openai"
        self.own_base_url = own_base_url
        self.own_api_key = own_api_key
        self.own_model = own_model
        self.openai_key = openai_key
        self.openai_model = openai_model
        self.tts_provider = tts_provider
        self.elevenlabs_key = elevenlabs_key
        self.host_voice = host_voice
        self.guest_voice = guest_voice
        self.max_tokens = max_tokens
        self.target_dialogue_count = target_dialogue_count
        # Fall back to the configured default only when no limit was given.
        self.context_limit = context_limit if context_limit else MAX_CONTEXT_CHARS
        self.persona_mode = persona_mode

    def log(self, message):
        """Timestamp *message*, print it, append it to self.logs, and return it."""
        timestamp = time.strftime("%H:%M:%S")
        entry = f"[{timestamp}] {message}"
        print(entry)
        self.logs.append(entry)
        return entry

    # --- internal helpers -------------------------------------------------

    def _joined_logs(self):
        """Return all accumulated log lines as one newline-joined string."""
        return "\n".join(self.logs)

    def _provider_banner(self):
        """One-line description of the active inference provider."""
        if self.provider_mode == "demo":
            return "Using Demo Inference"
        if self.provider_mode == "own_inference":
            return f"Using Own Inference: {self.own_base_url}"
        return f"Using OpenAI ({self.openai_model or 'gpt-4o-mini'})"

    def _tts_banner(self):
        """One-line description of the active TTS engine."""
        if self.tts_provider == "elevenlabs":
            return "Using ElevenLabs TTS"
        return "Using Supertonic TTS (CPU)"

    def _build_generator(self):
        """Construct the LLM script generator from the stored provider settings."""
        return get_generator(
            provider_mode=self.provider_mode,
            own_base_url=self.own_base_url,
            own_api_key=self.own_api_key,
            own_model=self.own_model,
            openai_key=self.openai_key,
            openai_model=self.openai_model,
            max_tokens=self.max_tokens,
        )

    def _build_tts(self):
        """Construct the TTS engine from the stored voice/provider settings."""
        return get_tts_engine(
            tts_provider=self.tts_provider,
            # The ElevenLabs key is only meaningful for the ElevenLabs provider.
            custom_api_key=self.elevenlabs_key if self.tts_provider == "elevenlabs" else None,
            host_voice=self.host_voice,
            guest_voice=self.guest_voice,
        )

    def process(self, url: str = None, pdf_file=None):
        """
        Orchestrates the conversion from URL or uploaded PDF to Podcast.

        Generator: yields log lines for live progress; the generator's return
        value is ``(audio_path, joined_logs)`` on success or
        ``(None, joined_logs)`` on failure.

        Args:
            url: Paper URL (arXiv or medRxiv)
            pdf_file: Uploaded PDF file object
        """
        # Determine source (an uploaded PDF takes precedence over a URL)
        if pdf_file:
            yield self.log(
                f"Received uploaded PDF: {pdf_file.name if hasattr(pdf_file, 'name') else 'file'}"
            )
            pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
            source_ref = "Uploaded PDF"
        elif url:
            yield self.log(f"Received request for URL: {url}")
            # Step 1: Fetch Paper
            yield self.log("Thinking: I need to download the paper first.")
            yield self.log(f"Tool Call: fetch_paper({url})")
            pdf_path = fetch_paper_from_url(url)
            if not pdf_path:
                yield self.log("Error: Failed to download paper.")
                return None, self._joined_logs()
            yield self.log(f"Paper downloaded to: {pdf_path}")
            source_ref = url
        else:
            yield self.log(
                "Error: No input provided. Please provide either a URL or upload a PDF."
            )
            return None, self._joined_logs()
        # Step 2: Read PDF
        yield self.log("Thinking: Now I need to extract the text content.")
        yield self.log(f"Tool Call: read_pdf({pdf_path})")
        text = extract_text_from_pdf(pdf_path)
        if not text:
            yield self.log("Error: Failed to extract text.")
            # BUGFIX: previously returned the raw list; callers expect a string.
            return None, self._joined_logs()
        text_length = len(text)
        yield self.log(f"Extracted {text_length:,} characters.")
        # Check context limit
        if text_length > self.context_limit:
            yield self.log("⚠️ Context limit exceeded!")
            yield self.log(f"Paper size: {text_length:,} characters")
            yield self.log(f"Maximum allowed: {self.context_limit:,} characters")
            yield self.log("")
            yield self.log("❌ Error: Paper is too large to process with current settings.")
            yield self.log("")
            yield self.log("📋 How to fix:")
            yield self.log(" 1. Open 'Advanced Options' below")
            yield self.log(" 2. Enable 'Batch Mode'")
            yield self.log(f" 3. Adjust 'Max Context Limit' slider to at least {text_length:,} characters")
            yield self.log(" 4. Try again")
            yield self.log("")
            return None, self._joined_logs()
        # Step 3: Generate Script
        yield self.log(
            "Thinking: The text is ready. I will now generate a podcast script using the LLM."
        )
        yield self.log(self._provider_banner())
        yield self.log("Tool Call: generate_script(...)")
        generator = self._build_generator()
        script = generator.generate_podcast_script(
            text,
            target_dialogue_count=self.target_dialogue_count,
            persona_mode=self.persona_mode,
        )
        if not script:
            yield self.log("Error: Failed to generate script.")
            return None, self._joined_logs()
        yield self.log(
            f"Generated script with {len(script)} dialogue turns (target: {self.target_dialogue_count})."
        )
        # Step 4: Synthesize Audio
        yield self.log("Thinking: The script looks good. Sending it to the TTS engine.")
        yield self.log(self._tts_banner())
        yield self.log("Tool Call: synthesize_podcast(...)")
        tts = self._build_tts()
        audio_path = tts.synthesize_dialogue(script)
        if not audio_path:
            yield self.log("Error: Failed to synthesize audio.")
            return None, self._joined_logs()
        yield self.log(f"Podcast generated successfully at: {audio_path}")
        # Save to history
        save_to_history(source_ref, audio_path, len(script))
        yield self.log("✓ Saved to history")
        return audio_path, self._joined_logs()

    def process_multiple(self, urls: list = None, pdf_files: list = None):
        """
        Orchestrates the conversion from multiple URLs or PDFs to a single
        comprehensive Podcast.

        Generator: yields log lines for live progress; the generator's return
        value is ``(audio_path, joined_logs)`` on success or
        ``(None, joined_logs)`` on failure. Papers are accumulated until
        adding the next one would exceed ``self.context_limit`` characters;
        the remainder are skipped.

        Args:
            urls: List of paper URLs (arXiv or medRxiv)
            pdf_files: List of uploaded PDF file objects
        """
        all_texts = []
        source_refs = []
        total_chars = 0
        # Process URLs
        if urls:
            yield self.log(f"Received {len(urls)} URLs to process.")
            yield self.log(f"Context limit: {self.context_limit:,} characters")
            for i, url in enumerate(urls, 1):
                yield self.log(f"\n=== Processing Paper {i}/{len(urls)} ===")
                yield self.log(f"URL: {url}")
                # Step 1: Fetch Paper
                yield self.log(f"Tool Call: fetch_paper({url})")
                pdf_path = fetch_paper_from_url(url)
                if not pdf_path:
                    yield self.log(f"Warning: Failed to download paper {i}, skipping.")
                    continue
                yield self.log(f"Paper {i} downloaded successfully.")
                # Step 2: Read PDF
                yield self.log(f"Tool Call: read_pdf({pdf_path})")
                text = extract_text_from_pdf(pdf_path)
                if not text:
                    yield self.log(
                        f"Warning: Failed to extract text from paper {i}, skipping."
                    )
                    continue
                text_length = len(text)
                yield self.log(f"Extracted {text_length:,} characters from paper {i}.")
                # Check context limit
                if total_chars + text_length > self.context_limit:
                    yield self.log("⚠️ Context limit reached!")
                    yield self.log(
                        f"Current total: {total_chars:,} chars + Paper {i}: {text_length:,} chars = {total_chars + text_length:,} chars"
                    )
                    yield self.log(f"Maximum allowed: {self.context_limit:,} chars")
                    yield self.log(
                        f"Stopping at {len(all_texts)} papers. Remaining papers will be skipped."
                    )
                    break
                all_texts.append(f"=== PAPER {i} ===\n{text}\n")
                source_refs.append(url)
                total_chars += text_length
                yield self.log(
                    f"✓ Paper {i} added. Total context: {total_chars:,} chars ({(total_chars / self.context_limit) * 100:.1f}% of limit)"
                )
        # Process PDFs
        if pdf_files:
            yield self.log(f"\nReceived {len(pdf_files)} PDF files to process.")
            if not urls:  # Only show limit if we didn't already show it for URLs
                yield self.log(f"Context limit: {self.context_limit:,} characters")
            for i, pdf_file in enumerate(pdf_files, 1):
                # Calculate paper number (continues from URL count)
                paper_num = (len(urls) if urls else 0) + i
                yield self.log(f"\n=== Processing PDF {i}/{len(pdf_files)} (Paper {paper_num}) ===")
                pdf_name = pdf_file.name if hasattr(pdf_file, "name") else f"file_{i}"
                yield self.log(f"File: {pdf_name}")
                pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
                # Read PDF
                yield self.log(f"Tool Call: read_pdf({pdf_path})")
                text = extract_text_from_pdf(pdf_path)
                if not text:
                    yield self.log(
                        f"Warning: Failed to extract text from PDF {i}, skipping."
                    )
                    continue
                text_length = len(text)
                yield self.log(f"Extracted {text_length:,} characters from PDF {i}.")
                # Check context limit
                if total_chars + text_length > self.context_limit:
                    yield self.log("⚠️ Context limit reached!")
                    yield self.log(
                        f"Current total: {total_chars:,} chars + PDF {i}: {text_length:,} chars = {total_chars + text_length:,} chars"
                    )
                    yield self.log(f"Maximum allowed: {self.context_limit:,} chars")
                    yield self.log(
                        f"Stopping at {len(all_texts)} files. Remaining items will be skipped."
                    )
                    break
                all_texts.append(f"=== PAPER {paper_num} ===\n{text}\n")
                source_refs.append(f"Uploaded PDF: {pdf_name}")
                total_chars += text_length
                yield self.log(
                    f"✓ PDF {i} added. Total context: {total_chars:,} chars ({(total_chars / self.context_limit) * 100:.1f}% of limit)"
                )
        if not all_texts:
            yield self.log("Error: No papers were successfully processed.")
            return None, self._joined_logs()
        # Combine all texts
        yield self.log(f"\n✓ Successfully processed {len(all_texts)} papers")
        yield self.log(
            f"Total context: {total_chars:,} characters ({(total_chars / self.context_limit) * 100:.1f}% of limit)"
        )
        yield self.log(
            "Thinking: Now I'll combine all papers into a comprehensive podcast script."
        )
        combined_text = "\n\n".join(all_texts)
        # Step 3: Generate Comprehensive Script
        yield self.log(
            "\nThinking: Creating a comprehensive podcast script covering all papers."
        )
        yield self.log(self._provider_banner())
        yield self.log("Tool Call: generate_script(...)")
        generator = self._build_generator()
        # Add instruction for multi-paper script
        multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
        script = generator.generate_podcast_script(
            multi_paper_prompt,
            target_dialogue_count=self.target_dialogue_count,
            persona_mode=self.persona_mode,
        )
        if not script:
            yield self.log("Error: Failed to generate script.")
            return None, self._joined_logs()
        yield self.log(
            f"Generated comprehensive script with {len(script)} dialogue turns."
        )
        # Step 4: Synthesize Audio
        yield self.log(
            "\nThinking: The script looks good. Sending it to the TTS engine."
        )
        yield self.log(self._tts_banner())
        yield self.log("Tool Call: synthesize_podcast(...)")
        tts = self._build_tts()
        audio_path = tts.synthesize_dialogue(script)
        if not audio_path:
            yield self.log("Error: Failed to synthesize audio.")
            return None, self._joined_logs()
        yield self.log(f"Podcast generated successfully at: {audio_path}")
        # Save to history: summarize at most three source refs, ellipsize the rest
        source_ref = f"Multiple papers: {', '.join(source_refs[:3])}{'...' if len(source_refs) > 3 else ''}"
        save_to_history(source_ref, audio_path, len(script))
        yield self.log("✓ Saved to history")
        return audio_path, self._joined_logs()