Spaces:
Running
Running
batuhanozkose
commited on
Commit
·
3ab234f
1
Parent(s):
472739a
update tts, llm engines
Browse files- agents/podcast_agent.py +36 -49
- app.py +295 -336
- generation/script_generator.py +121 -69
- live.py +36 -21
- output/history.json +7 -0
- requirements.txt +0 -1
- synthesis/tts_engine.py +55 -202
- utils/config.py +6 -22
agents/podcast_agent.py
CHANGED
|
@@ -5,11 +5,6 @@ from processing.pdf_reader import extract_text_from_pdf
|
|
| 5 |
from processing.url_fetcher import fetch_paper_from_url
|
| 6 |
from synthesis.tts_engine import get_tts_engine
|
| 7 |
from utils.config import (
|
| 8 |
-
DEMO_INFERENCE_KEY,
|
| 9 |
-
DEMO_INFERENCE_URL,
|
| 10 |
-
DEMO_MODE,
|
| 11 |
-
DEMO_MODEL,
|
| 12 |
-
DEMO_TTS_KEY,
|
| 13 |
MAX_CONTEXT_CHARS,
|
| 14 |
)
|
| 15 |
from utils.history import save_to_history
|
|
@@ -18,45 +13,49 @@ from utils.history import save_to_history
|
|
| 18 |
class PodcastAgent:
|
| 19 |
def __init__(
|
| 20 |
self,
|
| 21 |
-
provider_mode="
|
| 22 |
own_base_url=None,
|
| 23 |
own_api_key=None,
|
| 24 |
own_model=None,
|
| 25 |
openai_key=None,
|
| 26 |
openai_model=None,
|
| 27 |
-
tts_provider="
|
| 28 |
elevenlabs_key=None,
|
| 29 |
host_voice=None,
|
| 30 |
guest_voice=None,
|
| 31 |
max_tokens=None,
|
|
|
|
| 32 |
):
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
# If demo mode is enabled, override all settings with demo credentials
|
| 36 |
-
if DEMO_MODE:
|
| 37 |
-
self.provider_mode = "demo"
|
| 38 |
-
self.own_base_url = DEMO_INFERENCE_URL
|
| 39 |
-
self.own_api_key = DEMO_INFERENCE_KEY
|
| 40 |
-
self.own_model = DEMO_MODEL
|
| 41 |
-
self.openai_key = None
|
| 42 |
-
self.openai_model = None
|
| 43 |
-
self.tts_provider = "edge-tts" # Always use Edge-TTS in demo mode
|
| 44 |
-
self.elevenlabs_key = None
|
| 45 |
-
self.host_voice = host_voice
|
| 46 |
-
self.guest_voice = guest_voice
|
| 47 |
-
else:
|
| 48 |
-
self.provider_mode = provider_mode # "own_inference" or "openai"
|
| 49 |
-
self.own_base_url = own_base_url
|
| 50 |
-
self.own_api_key = own_api_key
|
| 51 |
-
self.own_model = own_model
|
| 52 |
-
self.openai_key = openai_key
|
| 53 |
-
self.openai_model = openai_model
|
| 54 |
-
self.tts_provider = tts_provider
|
| 55 |
-
self.elevenlabs_key = elevenlabs_key
|
| 56 |
-
self.host_voice = host_voice
|
| 57 |
-
self.guest_voice = guest_voice
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
self.max_tokens = max_tokens
|
|
|
|
| 60 |
|
| 61 |
def log(self, message):
|
| 62 |
timestamp = time.strftime("%H:%M:%S")
|
|
@@ -127,21 +126,15 @@ class PodcastAgent:
|
|
| 127 |
openai_model=self.openai_model,
|
| 128 |
max_tokens=self.max_tokens,
|
| 129 |
)
|
| 130 |
-
script = generator.generate_podcast_script(text)
|
| 131 |
if not script:
|
| 132 |
yield self.log("Error: Failed to generate script.")
|
| 133 |
return None, self.logs
|
| 134 |
-
yield self.log(f"Generated script with {len(script)} dialogue turns.")
|
| 135 |
|
| 136 |
# Step 4: Synthesize Audio
|
| 137 |
yield self.log("Thinking: The script looks good. Sending it to the TTS engine.")
|
| 138 |
-
|
| 139 |
-
yield self.log("Using Edge-TTS (Microsoft, free)")
|
| 140 |
-
elif self.tts_provider == "elevenlabs":
|
| 141 |
-
if self.elevenlabs_key:
|
| 142 |
-
yield self.log("Using custom ElevenLabs API key")
|
| 143 |
-
else:
|
| 144 |
-
yield self.log("Using demo ElevenLabs key")
|
| 145 |
yield self.log("Tool Call: synthesize_podcast(...)")
|
| 146 |
tts = get_tts_engine(
|
| 147 |
tts_provider=self.tts_provider,
|
|
@@ -305,7 +298,7 @@ class PodcastAgent:
|
|
| 305 |
|
| 306 |
# Add instruction for multi-paper script
|
| 307 |
multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
|
| 308 |
-
script = generator.generate_podcast_script(multi_paper_prompt)
|
| 309 |
|
| 310 |
if not script:
|
| 311 |
yield self.log("Error: Failed to generate script.")
|
|
@@ -319,13 +312,7 @@ class PodcastAgent:
|
|
| 319 |
yield self.log(
|
| 320 |
"\nThinking: The script looks good. Sending it to the TTS engine."
|
| 321 |
)
|
| 322 |
-
|
| 323 |
-
yield self.log("Using Edge-TTS (Microsoft, free)")
|
| 324 |
-
elif self.tts_provider == "elevenlabs":
|
| 325 |
-
if self.elevenlabs_key:
|
| 326 |
-
yield self.log("Using custom ElevenLabs API key")
|
| 327 |
-
else:
|
| 328 |
-
yield self.log("Using demo ElevenLabs key")
|
| 329 |
yield self.log("Tool Call: synthesize_podcast(...)")
|
| 330 |
tts = get_tts_engine(
|
| 331 |
tts_provider=self.tts_provider,
|
|
|
|
| 5 |
from processing.url_fetcher import fetch_paper_from_url
|
| 6 |
from synthesis.tts_engine import get_tts_engine
|
| 7 |
from utils.config import (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
MAX_CONTEXT_CHARS,
|
| 9 |
)
|
| 10 |
from utils.history import save_to_history
|
|
|
|
| 13 |
class PodcastAgent:
|
| 14 |
def __init__(
|
| 15 |
self,
|
| 16 |
+
provider_mode="own_inference",
|
| 17 |
own_base_url=None,
|
| 18 |
own_api_key=None,
|
| 19 |
own_model=None,
|
| 20 |
openai_key=None,
|
| 21 |
openai_model=None,
|
| 22 |
+
tts_provider="elevenlabs",
|
| 23 |
elevenlabs_key=None,
|
| 24 |
host_voice=None,
|
| 25 |
guest_voice=None,
|
| 26 |
max_tokens=None,
|
| 27 |
+
target_dialogue_count=15,
|
| 28 |
):
|
| 29 |
+
"""
|
| 30 |
+
Initialize PodcastAgent with user-provided settings (BYOK).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
Args:
|
| 33 |
+
provider_mode: "own_inference" or "openai"
|
| 34 |
+
own_base_url: Base URL for own inference server
|
| 35 |
+
own_api_key: API key for own inference server
|
| 36 |
+
own_model: Model name for own inference server
|
| 37 |
+
openai_key: OpenAI API key
|
| 38 |
+
openai_model: OpenAI model name
|
| 39 |
+
tts_provider: "elevenlabs" (ElevenLabs required)
|
| 40 |
+
elevenlabs_key: ElevenLabs API key (required)
|
| 41 |
+
host_voice: Voice ID for host
|
| 42 |
+
guest_voice: Voice ID for guest
|
| 43 |
+
max_tokens: Maximum tokens for generation
|
| 44 |
+
target_dialogue_count: Target number of dialogue exchanges (default: 15)
|
| 45 |
+
"""
|
| 46 |
+
self.logs = []
|
| 47 |
+
self.provider_mode = provider_mode # "own_inference" or "openai"
|
| 48 |
+
self.own_base_url = own_base_url
|
| 49 |
+
self.own_api_key = own_api_key
|
| 50 |
+
self.own_model = own_model
|
| 51 |
+
self.openai_key = openai_key
|
| 52 |
+
self.openai_model = openai_model
|
| 53 |
+
self.tts_provider = tts_provider
|
| 54 |
+
self.elevenlabs_key = elevenlabs_key
|
| 55 |
+
self.host_voice = host_voice
|
| 56 |
+
self.guest_voice = guest_voice
|
| 57 |
self.max_tokens = max_tokens
|
| 58 |
+
self.target_dialogue_count = target_dialogue_count
|
| 59 |
|
| 60 |
def log(self, message):
|
| 61 |
timestamp = time.strftime("%H:%M:%S")
|
|
|
|
| 126 |
openai_model=self.openai_model,
|
| 127 |
max_tokens=self.max_tokens,
|
| 128 |
)
|
| 129 |
+
script = generator.generate_podcast_script(text, target_dialogue_count=self.target_dialogue_count)
|
| 130 |
if not script:
|
| 131 |
yield self.log("Error: Failed to generate script.")
|
| 132 |
return None, self.logs
|
| 133 |
+
yield self.log(f"Generated script with {len(script)} dialogue turns (target: {self.target_dialogue_count}).")
|
| 134 |
|
| 135 |
# Step 4: Synthesize Audio
|
| 136 |
yield self.log("Thinking: The script looks good. Sending it to the TTS engine.")
|
| 137 |
+
yield self.log("Using ElevenLabs TTS")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
yield self.log("Tool Call: synthesize_podcast(...)")
|
| 139 |
tts = get_tts_engine(
|
| 140 |
tts_provider=self.tts_provider,
|
|
|
|
| 298 |
|
| 299 |
# Add instruction for multi-paper script
|
| 300 |
multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
|
| 301 |
+
script = generator.generate_podcast_script(multi_paper_prompt, target_dialogue_count=self.target_dialogue_count)
|
| 302 |
|
| 303 |
if not script:
|
| 304 |
yield self.log("Error: Failed to generate script.")
|
|
|
|
| 312 |
yield self.log(
|
| 313 |
"\nThinking: The script looks good. Sending it to the TTS engine."
|
| 314 |
)
|
| 315 |
+
yield self.log("Using ElevenLabs TTS")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
yield self.log("Tool Call: synthesize_podcast(...)")
|
| 317 |
tts = get_tts_engine(
|
| 318 |
tts_provider=self.tts_provider,
|
app.py
CHANGED
|
@@ -4,13 +4,8 @@ from datetime import datetime
|
|
| 4 |
import gradio as gr
|
| 5 |
|
| 6 |
from agents.podcast_agent import PodcastAgent
|
| 7 |
-
from synthesis.tts_engine import
|
| 8 |
from utils.config import (
|
| 9 |
-
DEMO_INFERENCE_KEY,
|
| 10 |
-
DEMO_INFERENCE_URL,
|
| 11 |
-
DEMO_MODE,
|
| 12 |
-
DEMO_MODEL,
|
| 13 |
-
DEMO_TTS_KEY,
|
| 14 |
OUTPUT_DIR,
|
| 15 |
SCRIPT_GENERATION_MODEL,
|
| 16 |
)
|
|
@@ -20,19 +15,39 @@ from utils.history import get_history_items, load_history
|
|
| 20 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 21 |
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def validate_settings_for_generation(
|
| 24 |
llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key
|
| 25 |
):
|
| 26 |
"""
|
| 27 |
-
Validate user settings for podcast generation
|
| 28 |
|
| 29 |
Returns:
|
| 30 |
tuple: (is_valid, error_message)
|
| 31 |
"""
|
| 32 |
-
# Skip validation if in demo mode
|
| 33 |
-
if DEMO_MODE:
|
| 34 |
-
return True, ""
|
| 35 |
-
|
| 36 |
errors = []
|
| 37 |
|
| 38 |
# Validate LLM settings
|
|
@@ -52,13 +67,11 @@ def validate_settings_for_generation(
|
|
| 52 |
elif not openai_key.startswith("sk-"):
|
| 53 |
errors.append("❌ **OpenAI**: API key must start with 'sk-'")
|
| 54 |
|
| 55 |
-
# Validate TTS settings
|
| 56 |
-
if
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
errors.append("❌ **ElevenLabs**: API key must start with 'sk_'")
|
| 61 |
-
# Edge-TTS doesn't require any validation (it's free)
|
| 62 |
|
| 63 |
if errors:
|
| 64 |
return False, "\n".join(errors)
|
|
@@ -189,16 +202,17 @@ def run_agent(
|
|
| 189 |
user_podcast_length,
|
| 190 |
progress=gr.Progress(),
|
| 191 |
):
|
| 192 |
-
"""Run podcast generation with
|
| 193 |
|
| 194 |
# Determine provider mode
|
| 195 |
-
if
|
| 196 |
-
provider_mode = "demo"
|
| 197 |
-
elif user_llm_choice == "Own Inference":
|
| 198 |
provider_mode = "own_inference"
|
| 199 |
else: # OpenAI
|
| 200 |
provider_mode = "openai"
|
| 201 |
|
|
|
|
|
|
|
|
|
|
| 202 |
agent = PodcastAgent(
|
| 203 |
provider_mode=provider_mode,
|
| 204 |
own_base_url=user_own_base_url if user_own_base_url else None,
|
|
@@ -206,34 +220,23 @@ def run_agent(
|
|
| 206 |
own_model=user_own_model if user_own_model else None,
|
| 207 |
openai_key=user_openai_key if user_openai_key else None,
|
| 208 |
openai_model=user_openai_model if user_openai_model else None,
|
| 209 |
-
tts_provider=user_tts_provider if user_tts_provider else "
|
| 210 |
elevenlabs_key=user_elevenlabs_key if user_elevenlabs_key else None,
|
| 211 |
host_voice=user_host_voice if user_host_voice else None,
|
| 212 |
guest_voice=user_guest_voice if user_guest_voice else None,
|
| 213 |
-
max_tokens=
|
|
|
|
| 214 |
)
|
| 215 |
logs_history = ""
|
| 216 |
|
| 217 |
# Log settings being used
|
| 218 |
settings_log = "Settings: "
|
| 219 |
-
if provider_mode == "
|
| 220 |
-
settings_log += "LLM: Demo Inference | TTS: Edge-TTS (Microsoft) | "
|
| 221 |
-
elif provider_mode == "own_inference":
|
| 222 |
settings_log += f"LLM: Own Inference | "
|
| 223 |
-
|
| 224 |
-
settings_log += "TTS: Edge-TTS (Microsoft) | "
|
| 225 |
-
elif user_elevenlabs_key:
|
| 226 |
-
settings_log += "TTS: Custom ElevenLabs | "
|
| 227 |
-
else:
|
| 228 |
-
settings_log += "TTS: ElevenLabs (no key provided) | "
|
| 229 |
else: # openai
|
| 230 |
settings_log += f"LLM: OpenAI ({user_openai_model or 'gpt-4o-mini'}) | "
|
| 231 |
-
|
| 232 |
-
settings_log += "TTS: Edge-TTS (Microsoft) | "
|
| 233 |
-
elif user_elevenlabs_key:
|
| 234 |
-
settings_log += "TTS: Custom ElevenLabs | "
|
| 235 |
-
else:
|
| 236 |
-
settings_log += "TTS: ElevenLabs (no key provided) | "
|
| 237 |
|
| 238 |
settings_log += (
|
| 239 |
f"Length: {user_podcast_length if user_podcast_length else 4096} tokens"
|
|
@@ -385,34 +388,23 @@ def main():
|
|
| 385 |
)
|
| 386 |
|
| 387 |
with gr.Blocks(title="PaperCast", theme=theme) as demo:
|
| 388 |
-
# Session state for settings
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
else:
|
| 401 |
-
user_llm_choice = gr.State(value="Own Inference")
|
| 402 |
-
user_own_base_url = gr.State(value="")
|
| 403 |
-
user_own_api_key = gr.State(value="")
|
| 404 |
-
user_own_model = gr.State(value="")
|
| 405 |
-
user_openai_key = gr.State(value="")
|
| 406 |
-
user_openai_model = gr.State(value="")
|
| 407 |
-
user_tts_provider = gr.State(value="edge-tts")
|
| 408 |
-
user_elevenlabs_key = gr.State(value="")
|
| 409 |
-
user_host_voice = gr.State(value="en-US-GuyNeural")
|
| 410 |
-
user_guest_voice = gr.State(value="en-US-JennyNeural")
|
| 411 |
user_podcast_length = gr.State(value=4096)
|
| 412 |
-
settings_valid = gr.State(value=
|
| 413 |
|
| 414 |
-
# Initialize generate button state
|
| 415 |
-
generate_btn_state = gr.State(value=
|
| 416 |
|
| 417 |
with gr.Row():
|
| 418 |
gr.HTML("""
|
|
@@ -718,27 +710,22 @@ Configure your PaperCast experience with your own API keys and preferences.
|
|
| 718 |
)
|
| 719 |
|
| 720 |
with gr.Group():
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
"
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
],
|
| 731 |
-
value="Own Inference",
|
| 732 |
-
label="Language Model Provider",
|
| 733 |
-
info="Choose your language model provider for script generation",
|
| 734 |
-
)
|
| 735 |
|
| 736 |
# Own Inference inputs (base URL + API key)
|
| 737 |
own_inference_base_url = gr.Textbox(
|
| 738 |
label="Base URL",
|
| 739 |
placeholder="https://your-server.com/v1",
|
| 740 |
info="OpenAI-compatible endpoint",
|
| 741 |
-
visible=
|
| 742 |
)
|
| 743 |
|
| 744 |
own_inference_api_key = gr.Textbox(
|
|
@@ -746,14 +733,14 @@ Configure your PaperCast experience with your own API keys and preferences.
|
|
| 746 |
placeholder="Optional - leave empty if not required",
|
| 747 |
type="password",
|
| 748 |
info="API key for your inference server (if required)",
|
| 749 |
-
visible=
|
| 750 |
)
|
| 751 |
|
| 752 |
own_inference_model = gr.Textbox(
|
| 753 |
label="Model Name",
|
| 754 |
placeholder="e.g., llama-3.1-8b, mistral-7b",
|
| 755 |
info="Model name on your server",
|
| 756 |
-
visible=
|
| 757 |
)
|
| 758 |
|
| 759 |
# OpenAI inputs
|
|
@@ -762,7 +749,7 @@ Configure your PaperCast experience with your own API keys and preferences.
|
|
| 762 |
placeholder="sk-...",
|
| 763 |
type="password",
|
| 764 |
info="Required when using OpenAI",
|
| 765 |
-
visible=False, # Hidden by default
|
| 766 |
)
|
| 767 |
|
| 768 |
openai_model_input = gr.Textbox(
|
|
@@ -770,149 +757,92 @@ Configure your PaperCast experience with your own API keys and preferences.
|
|
| 770 |
placeholder="gpt-4o-mini",
|
| 771 |
value="gpt-4o-mini",
|
| 772 |
info="Model name (e.g., gpt-4o-mini, gpt-4, gpt-3.5-turbo)",
|
| 773 |
-
visible=False, # Hidden by default
|
| 774 |
)
|
| 775 |
|
| 776 |
gr.Markdown("---")
|
| 777 |
|
| 778 |
gr.Markdown("## 🔊 Text-to-Speech (TTS)")
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
)
|
| 783 |
-
else:
|
| 784 |
-
gr.Markdown(
|
| 785 |
-
"Choose your TTS provider for audio generation"
|
| 786 |
-
)
|
| 787 |
|
| 788 |
with gr.Group():
|
| 789 |
-
tts_provider_choice = gr.Radio(
|
| 790 |
-
choices=[
|
| 791 |
-
"Edge-TTS (Free, Microsoft)",
|
| 792 |
-
"ElevenLabs (Paid, Better Quality)",
|
| 793 |
-
],
|
| 794 |
-
value="Edge-TTS (Free, Microsoft)",
|
| 795 |
-
label="TTS Provider",
|
| 796 |
-
info="Edge-TTS is free and works without API key. ElevenLabs offers better voice quality.",
|
| 797 |
-
visible=not DEMO_MODE,
|
| 798 |
-
)
|
| 799 |
-
|
| 800 |
elevenlabs_key_input = gr.Textbox(
|
| 801 |
label="ElevenLabs API Key",
|
| 802 |
-
placeholder="sk_...
|
| 803 |
type="password",
|
| 804 |
-
info="Get your key at: elevenlabs.io",
|
| 805 |
-
|
|
|
|
| 806 |
)
|
| 807 |
|
| 808 |
gr.Markdown("### 🎭 Voice Selection")
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
choices=list(EDGE_TTS_VOICES.keys()),
|
| 816 |
-
value="Guy (US Male - Casual)",
|
| 817 |
-
label="Host Voice (Edge-TTS)",
|
| 818 |
info="Select voice for the podcast host",
|
| 819 |
)
|
| 820 |
-
|
| 821 |
-
choices=list(
|
| 822 |
-
value="
|
| 823 |
-
label="Guest Voice
|
| 824 |
info="Select voice for the expert guest",
|
| 825 |
)
|
| 826 |
|
| 827 |
-
#
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
}
|
| 860 |
-
elif choice == "OpenAI":
|
| 861 |
-
return {
|
| 862 |
-
own_inference_base_url: gr.update(
|
| 863 |
-
visible=False
|
| 864 |
-
),
|
| 865 |
-
own_inference_api_key: gr.update(visible=False),
|
| 866 |
-
own_inference_model: gr.update(visible=False),
|
| 867 |
-
openai_key_input: gr.update(visible=True),
|
| 868 |
-
openai_model_input: gr.update(visible=True),
|
| 869 |
-
}
|
| 870 |
-
|
| 871 |
-
llm_choice.change(
|
| 872 |
-
fn=toggle_llm_inputs,
|
| 873 |
-
inputs=[llm_choice],
|
| 874 |
-
outputs=[
|
| 875 |
-
own_inference_base_url,
|
| 876 |
-
own_inference_api_key,
|
| 877 |
-
own_inference_model,
|
| 878 |
-
openai_key_input,
|
| 879 |
-
openai_model_input,
|
| 880 |
-
],
|
| 881 |
-
)
|
| 882 |
-
|
| 883 |
-
# Toggle visibility based on TTS provider choice
|
| 884 |
-
def toggle_tts_inputs(choice):
|
| 885 |
-
if choice == "Edge-TTS (Free, Microsoft)":
|
| 886 |
-
return {
|
| 887 |
-
elevenlabs_key_input: gr.update(visible=False),
|
| 888 |
-
edge_voice_group: gr.update(visible=True),
|
| 889 |
-
elevenlabs_voice_group: gr.update(visible=False),
|
| 890 |
-
}
|
| 891 |
-
else: # ElevenLabs
|
| 892 |
-
return {
|
| 893 |
-
elevenlabs_key_input: gr.update(visible=True),
|
| 894 |
-
edge_voice_group: gr.update(visible=False),
|
| 895 |
-
elevenlabs_voice_group: gr.update(visible=True),
|
| 896 |
-
}
|
| 897 |
-
|
| 898 |
-
tts_provider_choice.change(
|
| 899 |
-
fn=toggle_tts_inputs,
|
| 900 |
-
inputs=[tts_provider_choice],
|
| 901 |
-
outputs=[elevenlabs_key_input, edge_voice_group, elevenlabs_voice_group],
|
| 902 |
-
)
|
| 903 |
|
| 904 |
gr.Markdown("---")
|
| 905 |
|
| 906 |
gr.Markdown("## 🎚️ Podcast Settings")
|
| 907 |
|
| 908 |
with gr.Group():
|
| 909 |
-
podcast_length = gr.
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 916 |
)
|
| 917 |
|
| 918 |
gr.Markdown("---")
|
|
@@ -929,38 +859,27 @@ Configure your PaperCast experience with your own API keys and preferences.
|
|
| 929 |
own_model,
|
| 930 |
openai_key,
|
| 931 |
openai_model,
|
| 932 |
-
tts_provider,
|
| 933 |
elevenlabs_key,
|
| 934 |
-
edge_host,
|
| 935 |
-
edge_guest,
|
| 936 |
elevenlabs_host,
|
| 937 |
elevenlabs_guest,
|
| 938 |
length,
|
| 939 |
):
|
| 940 |
status = "✅ **Settings Saved!**\n\n"
|
| 941 |
|
| 942 |
-
#
|
| 943 |
-
if tts_provider == "Edge-TTS (Free, Microsoft)":
|
| 944 |
-
tts_provider_internal = "edge-tts"
|
| 945 |
-
else:
|
| 946 |
-
tts_provider_internal = "elevenlabs"
|
| 947 |
-
|
| 948 |
-
# Validate settings first (only in non-demo mode)
|
| 949 |
is_valid, validation_message = (
|
| 950 |
validate_settings_for_generation(
|
| 951 |
llm_choice,
|
| 952 |
own_base_url,
|
| 953 |
own_api_key,
|
| 954 |
openai_key,
|
| 955 |
-
|
| 956 |
elevenlabs_key,
|
| 957 |
)
|
| 958 |
)
|
| 959 |
|
| 960 |
# LLM Settings
|
| 961 |
-
if
|
| 962 |
-
status += "- LLM: Demo Inference ✓\n"
|
| 963 |
-
elif llm_choice == "Own Inference":
|
| 964 |
if own_base_url:
|
| 965 |
status += f"- LLM: Own Inference ✓\n"
|
| 966 |
status += f" - URL: {own_base_url[:50]}...\n"
|
|
@@ -974,114 +893,78 @@ Configure your PaperCast experience with your own API keys and preferences.
|
|
| 974 |
status += "- ⚠️ LLM: OpenAI selected but no API key provided\n"
|
| 975 |
|
| 976 |
# TTS Settings
|
| 977 |
-
if
|
| 978 |
-
status += "- TTS:
|
| 979 |
else:
|
| 980 |
-
|
| 981 |
-
status += "- TTS: Edge-TTS (Microsoft, free) ✓\n"
|
| 982 |
-
elif elevenlabs_key:
|
| 983 |
-
status += "- TTS: ElevenLabs (Custom key) ✓\n"
|
| 984 |
-
else:
|
| 985 |
-
status += "- ⚠️ TTS: ElevenLabs key required\n"
|
| 986 |
|
| 987 |
# Add validation result
|
| 988 |
-
if
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 996 |
|
| 997 |
-
status += f"\n- Podcast Length: {int(length)} tokens\n"
|
| 998 |
status += (
|
| 999 |
"\n*Settings will be used for next podcast generation.*"
|
| 1000 |
)
|
| 1001 |
|
| 1002 |
-
#
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
guest_voice = EDGE_TTS_VOICES.get(edge_guest, "en-US-JennyNeural")
|
| 1006 |
-
else: # elevenlabs
|
| 1007 |
-
host_voice = ELEVENLABS_VOICES.get(elevenlabs_host, "ErXwobaYiN019PkySvjV")
|
| 1008 |
-
guest_voice = ELEVENLABS_VOICES.get(elevenlabs_guest, "EXAVITQu4vr4xnSDxMaL")
|
| 1009 |
|
|
|
|
| 1010 |
return (
|
| 1011 |
status,
|
| 1012 |
-
llm_choice
|
| 1013 |
-
own_base_url
|
| 1014 |
-
own_api_key
|
| 1015 |
-
own_model
|
| 1016 |
openai_key,
|
| 1017 |
openai_model,
|
| 1018 |
-
|
| 1019 |
-
elevenlabs_key
|
| 1020 |
-
host_voice
|
| 1021 |
-
guest_voice
|
| 1022 |
-
|
| 1023 |
is_valid,
|
| 1024 |
)
|
| 1025 |
|
| 1026 |
-
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
"edge-tts",
|
| 1041 |
-
"",
|
| 1042 |
-
host_voice,
|
| 1043 |
-
guest_voice,
|
| 1044 |
-
int(length),
|
| 1045 |
-
True, # settings_valid = True in demo mode
|
| 1046 |
-
)
|
| 1047 |
-
|
| 1048 |
-
save_settings_btn.click(
|
| 1049 |
-
fn=save_demo_settings,
|
| 1050 |
-
inputs=[edge_host_voice, edge_guest_voice, podcast_length],
|
| 1051 |
-
outputs=[
|
| 1052 |
-
settings_status,
|
| 1053 |
-
user_llm_choice,
|
| 1054 |
-
user_own_base_url,
|
| 1055 |
-
user_own_api_key,
|
| 1056 |
-
user_own_model,
|
| 1057 |
-
user_openai_key,
|
| 1058 |
-
user_openai_model,
|
| 1059 |
-
user_tts_provider,
|
| 1060 |
-
user_elevenlabs_key,
|
| 1061 |
-
user_host_voice,
|
| 1062 |
-
user_guest_voice,
|
| 1063 |
-
user_podcast_length,
|
| 1064 |
-
settings_valid,
|
| 1065 |
-
],
|
| 1066 |
-
)
|
| 1067 |
-
else:
|
| 1068 |
-
save_settings_btn.click(
|
| 1069 |
-
fn=save_settings,
|
| 1070 |
-
inputs=[
|
| 1071 |
-
llm_choice,
|
| 1072 |
-
own_inference_base_url,
|
| 1073 |
-
own_inference_api_key,
|
| 1074 |
-
own_inference_model,
|
| 1075 |
-
openai_key_input,
|
| 1076 |
-
openai_model_input,
|
| 1077 |
-
tts_provider_choice,
|
| 1078 |
-
elevenlabs_key_input,
|
| 1079 |
-
edge_host_voice,
|
| 1080 |
-
edge_guest_voice,
|
| 1081 |
-
elevenlabs_host_voice,
|
| 1082 |
-
elevenlabs_guest_voice,
|
| 1083 |
-
podcast_length,
|
| 1084 |
-
],
|
| 1085 |
outputs=[
|
| 1086 |
settings_status,
|
| 1087 |
user_llm_choice,
|
|
@@ -1113,73 +996,149 @@ Configure your PaperCast experience with your own API keys and preferences.
|
|
| 1113 |
|
| 1114 |
# About PaperCast
|
| 1115 |
|
| 1116 |
-
**
|
| 1117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1118 |
|
| 1119 |
---
|
| 1120 |
|
| 1121 |
## 🎯 How It Works
|
| 1122 |
|
| 1123 |
-
Our intelligent agent orchestrates a multi-step pipeline
|
| 1124 |
|
| 1125 |
-
1. **📥 Input** -
|
| 1126 |
-
2. **📄 Extraction** -
|
| 1127 |
-
3. **🎬 Script Generation** -
|
| 1128 |
-
4. **🎤 Voice Synthesis** -
|
| 1129 |
-
5. **✅ Delivery** -
|
| 1130 |
|
| 1131 |
---
|
| 1132 |
|
| 1133 |
## 🌟 Key Features
|
| 1134 |
|
| 1135 |
-
**
|
| 1136 |
-
**
|
| 1137 |
-
**
|
| 1138 |
-
**
|
| 1139 |
-
**
|
|
|
|
|
|
|
| 1140 |
|
| 1141 |
---
|
| 1142 |
|
| 1143 |
## 🔧 Technology Stack
|
| 1144 |
|
| 1145 |
**LLM**: {SCRIPT_GENERATION_MODEL}
|
| 1146 |
-
**TTS**:
|
|
|
|
|
|
|
| 1147 |
**Infrastructure**: ☁️ Remote Inference
|
| 1148 |
-
**Framework**: Gradio 6
|
| 1149 |
-
**
|
| 1150 |
|
| 1151 |
---
|
| 1152 |
|
| 1153 |
## 🎓 Built For
|
| 1154 |
|
| 1155 |
**MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)
|
|
|
|
| 1156 |
|
| 1157 |
-
This project demonstrates
|
| 1158 |
-
|
|
|
|
|
|
|
|
|
|
| 1159 |
|
| 1160 |
---
|
| 1161 |
|
| 1162 |
## 📝 About the Agent
|
| 1163 |
|
| 1164 |
-
PaperCast
|
| 1165 |
|
| 1166 |
-
**Plans**
|
| 1167 |
-
**Reasons**
|
| 1168 |
-
**Executes**
|
| 1169 |
-
**Adapts** dialogue
|
|
|
|
| 1170 |
|
| 1171 |
---
|
| 1172 |
|
| 1173 |
## 💡 Use Cases
|
| 1174 |
|
| 1175 |
-
🎧
|
| 1176 |
-
📚 Quick
|
| 1177 |
-
🌍 Make research
|
| 1178 |
-
🔬 Stay
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1179 |
|
| 1180 |
---
|
| 1181 |
|
| 1182 |
-
Made with ❤️ using
|
| 1183 |
|
| 1184 |
</div>
|
| 1185 |
""")
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
|
| 6 |
from agents.podcast_agent import PodcastAgent
|
| 7 |
+
from synthesis.tts_engine import ELEVENLABS_VOICES
|
| 8 |
from utils.config import (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
OUTPUT_DIR,
|
| 10 |
SCRIPT_GENERATION_MODEL,
|
| 11 |
)
|
|
|
|
| 15 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 16 |
|
| 17 |
|
| 18 |
+
# Podcast length presets: maps UI choice to (target_exchanges, max_tokens)
|
| 19 |
+
PODCAST_LENGTH_PRESETS = {
|
| 20 |
+
"⚡ Very Short (6-8 exchanges, ~2-3 min)": (7, 2000),
|
| 21 |
+
"📝 Short (10-12 exchanges, ~3-4 min)": (11, 3000),
|
| 22 |
+
"📄 Medium (14-16 exchanges, ~5-6 min)": (15, 4000),
|
| 23 |
+
"📚 Medium-Long (18-20 exchanges, ~7-8 min)": (19, 5000),
|
| 24 |
+
"📖 Long (22-25 exchanges, ~9-11 min)": (23, 6000),
|
| 25 |
+
"📕 Very Long (28-32 exchanges, ~12-15 min)": (30, 8000),
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_podcast_length_params(length_choice):
|
| 30 |
+
"""
|
| 31 |
+
Convert podcast length choice to parameters.
|
| 32 |
+
|
| 33 |
+
Returns:
|
| 34 |
+
tuple: (target_dialogue_count, max_tokens)
|
| 35 |
+
"""
|
| 36 |
+
return PODCAST_LENGTH_PRESETS.get(
|
| 37 |
+
length_choice,
|
| 38 |
+
(15, 4000) # Default to Medium
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
def validate_settings_for_generation(
|
| 43 |
llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key
|
| 44 |
):
|
| 45 |
"""
|
| 46 |
+
Validate user settings for podcast generation (BYOK - Bring Your Own Key).
|
| 47 |
|
| 48 |
Returns:
|
| 49 |
tuple: (is_valid, error_message)
|
| 50 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
errors = []
|
| 52 |
|
| 53 |
# Validate LLM settings
|
|
|
|
| 67 |
elif not openai_key.startswith("sk-"):
|
| 68 |
errors.append("❌ **OpenAI**: API key must start with 'sk-'")
|
| 69 |
|
| 70 |
+
# Validate TTS settings (ElevenLabs required)
|
| 71 |
+
if not elevenlabs_key:
|
| 72 |
+
errors.append("❌ **ElevenLabs TTS**: API key is required")
|
| 73 |
+
elif not elevenlabs_key.startswith("sk_"):
|
| 74 |
+
errors.append("❌ **ElevenLabs TTS**: API key must start with 'sk_'")
|
|
|
|
|
|
|
| 75 |
|
| 76 |
if errors:
|
| 77 |
return False, "\n".join(errors)
|
|
|
|
| 202 |
user_podcast_length,
|
| 203 |
progress=gr.Progress(),
|
| 204 |
):
|
| 205 |
+
"""Run podcast generation with user settings (BYOK)"""
|
| 206 |
|
| 207 |
# Determine provider mode
|
| 208 |
+
if user_llm_choice == "Own Inference":
|
|
|
|
|
|
|
| 209 |
provider_mode = "own_inference"
|
| 210 |
else: # OpenAI
|
| 211 |
provider_mode = "openai"
|
| 212 |
|
| 213 |
+
# Parse podcast length settings
|
| 214 |
+
target_exchanges, max_tokens = get_podcast_length_params(user_podcast_length)
|
| 215 |
+
|
| 216 |
agent = PodcastAgent(
|
| 217 |
provider_mode=provider_mode,
|
| 218 |
own_base_url=user_own_base_url if user_own_base_url else None,
|
|
|
|
| 220 |
own_model=user_own_model if user_own_model else None,
|
| 221 |
openai_key=user_openai_key if user_openai_key else None,
|
| 222 |
openai_model=user_openai_model if user_openai_model else None,
|
| 223 |
+
tts_provider=user_tts_provider if user_tts_provider else "elevenlabs",
|
| 224 |
elevenlabs_key=user_elevenlabs_key if user_elevenlabs_key else None,
|
| 225 |
host_voice=user_host_voice if user_host_voice else None,
|
| 226 |
guest_voice=user_guest_voice if user_guest_voice else None,
|
| 227 |
+
max_tokens=max_tokens,
|
| 228 |
+
target_dialogue_count=target_exchanges,
|
| 229 |
)
|
| 230 |
logs_history = ""
|
| 231 |
|
| 232 |
# Log settings being used
|
| 233 |
settings_log = "Settings: "
|
| 234 |
+
if provider_mode == "own_inference":
|
|
|
|
|
|
|
| 235 |
settings_log += f"LLM: Own Inference | "
|
| 236 |
+
settings_log += "TTS: ElevenLabs | "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
else: # openai
|
| 238 |
settings_log += f"LLM: OpenAI ({user_openai_model or 'gpt-4o-mini'}) | "
|
| 239 |
+
settings_log += "TTS: ElevenLabs | "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
settings_log += (
|
| 242 |
f"Length: {user_podcast_length if user_podcast_length else 4096} tokens"
|
|
|
|
| 388 |
)
|
| 389 |
|
| 390 |
with gr.Blocks(title="PaperCast", theme=theme) as demo:
|
| 391 |
+
# Session state for settings (BYOK - Bring Your Own Key)
|
| 392 |
+
# NOTE: Settings are session-only for security (multi-user HF Spaces)
|
| 393 |
+
user_llm_choice = gr.State(value="Own Inference")
|
| 394 |
+
user_own_base_url = gr.State(value="")
|
| 395 |
+
user_own_api_key = gr.State(value="")
|
| 396 |
+
user_own_model = gr.State(value="")
|
| 397 |
+
user_openai_key = gr.State(value="")
|
| 398 |
+
user_openai_model = gr.State(value="")
|
| 399 |
+
user_tts_provider = gr.State(value="elevenlabs")
|
| 400 |
+
user_elevenlabs_key = gr.State(value="")
|
| 401 |
+
user_host_voice = gr.State(value="ErXwobaYiN019PkySvjV") # Antoni
|
| 402 |
+
user_guest_voice = gr.State(value="EXAVITQu4vr4xnSDxMaL") # Bella
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
user_podcast_length = gr.State(value=4096)
|
| 404 |
+
settings_valid = gr.State(value=False) # Users must configure settings
|
| 405 |
|
| 406 |
+
# Initialize generate button state
|
| 407 |
+
generate_btn_state = gr.State(value=False)
|
| 408 |
|
| 409 |
with gr.Row():
|
| 410 |
gr.HTML("""
|
|
|
|
| 710 |
)
|
| 711 |
|
| 712 |
with gr.Group():
|
| 713 |
+
llm_choice = gr.Radio(
|
| 714 |
+
choices=[
|
| 715 |
+
"Own Inference",
|
| 716 |
+
"OpenAI",
|
| 717 |
+
],
|
| 718 |
+
value="Own Inference",
|
| 719 |
+
label="Language Model Provider",
|
| 720 |
+
info="Choose your language model provider for script generation",
|
| 721 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
|
| 723 |
# Own Inference inputs (base URL + API key)
|
| 724 |
own_inference_base_url = gr.Textbox(
|
| 725 |
label="Base URL",
|
| 726 |
placeholder="https://your-server.com/v1",
|
| 727 |
info="OpenAI-compatible endpoint",
|
| 728 |
+
visible=True,
|
| 729 |
)
|
| 730 |
|
| 731 |
own_inference_api_key = gr.Textbox(
|
|
|
|
| 733 |
placeholder="Optional - leave empty if not required",
|
| 734 |
type="password",
|
| 735 |
info="API key for your inference server (if required)",
|
| 736 |
+
visible=True,
|
| 737 |
)
|
| 738 |
|
| 739 |
own_inference_model = gr.Textbox(
|
| 740 |
label="Model Name",
|
| 741 |
placeholder="e.g., llama-3.1-8b, mistral-7b",
|
| 742 |
info="Model name on your server",
|
| 743 |
+
visible=True,
|
| 744 |
)
|
| 745 |
|
| 746 |
# OpenAI inputs
|
|
|
|
| 749 |
placeholder="sk-...",
|
| 750 |
type="password",
|
| 751 |
info="Required when using OpenAI",
|
| 752 |
+
visible=False, # Hidden by default
|
| 753 |
)
|
| 754 |
|
| 755 |
openai_model_input = gr.Textbox(
|
|
|
|
| 757 |
placeholder="gpt-4o-mini",
|
| 758 |
value="gpt-4o-mini",
|
| 759 |
info="Model name (e.g., gpt-4o-mini, gpt-4, gpt-3.5-turbo)",
|
| 760 |
+
visible=False, # Hidden by default
|
| 761 |
)
|
| 762 |
|
| 763 |
gr.Markdown("---")
|
| 764 |
|
| 765 |
gr.Markdown("## 🔊 Text-to-Speech (TTS)")
|
| 766 |
+
gr.Markdown(
|
| 767 |
+
"Powered by ElevenLabs - Premium AI voice synthesis"
|
| 768 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 769 |
|
| 770 |
with gr.Group():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
elevenlabs_key_input = gr.Textbox(
|
| 772 |
label="ElevenLabs API Key",
|
| 773 |
+
placeholder="sk_...",
|
| 774 |
type="password",
|
| 775 |
+
info="Get your key at: elevenlabs.io (Required)",
|
| 776 |
+
value="",
|
| 777 |
+
visible=True,
|
| 778 |
)
|
| 779 |
|
| 780 |
gr.Markdown("### 🎭 Voice Selection")
|
| 781 |
+
|
| 782 |
+
with gr.Group(visible=True) as elevenlabs_voice_group:
|
| 783 |
+
elevenlabs_host_voice = gr.Dropdown(
|
| 784 |
+
choices=list(ELEVENLABS_VOICES.keys()),
|
| 785 |
+
value="Antoni (Male - Well-rounded)",
|
| 786 |
+
label="Host Voice",
|
|
|
|
|
|
|
|
|
|
| 787 |
info="Select voice for the podcast host",
|
| 788 |
)
|
| 789 |
+
elevenlabs_guest_voice = gr.Dropdown(
|
| 790 |
+
choices=list(ELEVENLABS_VOICES.keys()),
|
| 791 |
+
value="Bella (Female - Soft)",
|
| 792 |
+
label="Guest Voice",
|
| 793 |
info="Select voice for the expert guest",
|
| 794 |
)
|
| 795 |
|
| 796 |
+
# Toggle visibility based on LLM choice
|
| 797 |
+
def toggle_llm_inputs(choice):
    """Show the input fields of the selected LLM provider and hide the others.

    Args:
        choice: Current value of the LLM provider selector. "Own Inference"
            and "OpenAI" are the two values handled here.

    Returns:
        A dict mapping each of the five provider input components to a
        ``gr.update(visible=...)``, or ``None`` when ``choice`` is neither
        of the handled values.
    """
    if choice not in ("Own Inference", "OpenAI"):
        # Unhandled selection: no update mapping is produced.
        return None

    show_own = choice == "Own Inference"
    show_openai = not show_own
    return {
        own_inference_base_url: gr.update(visible=show_own),
        own_inference_api_key: gr.update(visible=show_own),
        own_inference_model: gr.update(visible=show_own),
        openai_key_input: gr.update(visible=show_openai),
        openai_model_input: gr.update(visible=show_openai),
    }
|
| 816 |
+
|
| 817 |
+
llm_choice.change(
|
| 818 |
+
fn=toggle_llm_inputs,
|
| 819 |
+
inputs=[llm_choice],
|
| 820 |
+
outputs=[
|
| 821 |
+
own_inference_base_url,
|
| 822 |
+
own_inference_api_key,
|
| 823 |
+
own_inference_model,
|
| 824 |
+
openai_key_input,
|
| 825 |
+
openai_model_input,
|
| 826 |
+
],
|
| 827 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
|
| 829 |
gr.Markdown("---")
|
| 830 |
|
| 831 |
gr.Markdown("## 🎚️ Podcast Settings")
|
| 832 |
|
| 833 |
with gr.Group():
|
| 834 |
+
podcast_length = gr.Radio(
|
| 835 |
+
choices=[
|
| 836 |
+
"⚡ Very Short (6-8 exchanges, ~2-3 min)",
|
| 837 |
+
"📝 Short (10-12 exchanges, ~3-4 min)",
|
| 838 |
+
"📄 Medium (14-16 exchanges, ~5-6 min)",
|
| 839 |
+
"📚 Medium-Long (18-20 exchanges, ~7-8 min)",
|
| 840 |
+
"📖 Long (22-25 exchanges, ~9-11 min)",
|
| 841 |
+
"📕 Very Long (28-32 exchanges, ~12-15 min)",
|
| 842 |
+
],
|
| 843 |
+
value="📄 Medium (14-16 exchanges, ~5-6 min)",
|
| 844 |
+
label="Podcast Length",
|
| 845 |
+
info="Select desired podcast duration based on dialogue exchanges",
|
| 846 |
)
|
| 847 |
|
| 848 |
gr.Markdown("---")
|
|
|
|
| 859 |
own_model,
|
| 860 |
openai_key,
|
| 861 |
openai_model,
|
|
|
|
| 862 |
elevenlabs_key,
|
|
|
|
|
|
|
| 863 |
elevenlabs_host,
|
| 864 |
elevenlabs_guest,
|
| 865 |
length,
|
| 866 |
):
|
| 867 |
status = "✅ **Settings Saved!**\n\n"
|
| 868 |
|
| 869 |
+
# Validate settings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 870 |
is_valid, validation_message = (
|
| 871 |
validate_settings_for_generation(
|
| 872 |
llm_choice,
|
| 873 |
own_base_url,
|
| 874 |
own_api_key,
|
| 875 |
openai_key,
|
| 876 |
+
"elevenlabs", # Always ElevenLabs
|
| 877 |
elevenlabs_key,
|
| 878 |
)
|
| 879 |
)
|
| 880 |
|
| 881 |
# LLM Settings
|
| 882 |
+
if llm_choice == "Own Inference":
|
|
|
|
|
|
|
| 883 |
if own_base_url:
|
| 884 |
status += f"- LLM: Own Inference ✓\n"
|
| 885 |
status += f" - URL: {own_base_url[:50]}...\n"
|
|
|
|
| 893 |
status += "- ⚠️ LLM: OpenAI selected but no API key provided\n"
|
| 894 |
|
| 895 |
# TTS Settings
|
| 896 |
+
if elevenlabs_key:
|
| 897 |
+
status += "- TTS: ElevenLabs ✓\n"
|
| 898 |
else:
|
| 899 |
+
status += "- ⚠️ TTS: ElevenLabs API key required\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 900 |
|
| 901 |
# Add validation result
|
| 902 |
+
if is_valid:
|
| 903 |
+
status += "\n✅ **All settings are valid!**\n"
|
| 904 |
+
status += "🎉 Generate button is now enabled.\n"
|
| 905 |
+
else:
|
| 906 |
+
status += "\n⚠️ **Settings incomplete!**\n"
|
| 907 |
+
status += "🚫 Generate button remains disabled.\n"
|
| 908 |
+
status += f"\nRequired fixes:\n{validation_message}"
|
| 909 |
+
|
| 910 |
+
# Parse podcast length
|
| 911 |
+
target_exchanges, max_tokens = get_podcast_length_params(length)
|
| 912 |
+
status += f"\n- Podcast Length: {length}\n"
|
| 913 |
+
status += f" - Target: {target_exchanges} dialogue exchanges\n"
|
| 914 |
+
status += f" - Max tokens: {max_tokens}\n"
|
| 915 |
+
|
| 916 |
+
# Add reasoning model info if using OpenAI reasoning models
|
| 917 |
+
if llm_choice == "OpenAI" and openai_model:
|
| 918 |
+
model_lower = openai_model.lower()
|
| 919 |
+
# Check if it's a reasoning model
|
| 920 |
+
is_reasoning = any(
|
| 921 |
+
keyword in model_lower
|
| 922 |
+
for keyword in ["gpt-5", "o1", "o3", "o4"]
|
| 923 |
+
) and "chat" not in model_lower
|
| 924 |
+
|
| 925 |
+
if is_reasoning:
|
| 926 |
+
total_tokens = max_tokens * 2
|
| 927 |
+
status += f" - ⚡ Reasoning model: {max_tokens} × 2 = {total_tokens} max tokens\n"
|
| 928 |
|
|
|
|
| 929 |
status += (
|
| 930 |
"\n*Settings will be used for next podcast generation.*"
|
| 931 |
)
|
| 932 |
|
| 933 |
+
# Get ElevenLabs voices
|
| 934 |
+
host_voice = ELEVENLABS_VOICES.get(elevenlabs_host, "ErXwobaYiN019PkySvjV")
|
| 935 |
+
guest_voice = ELEVENLABS_VOICES.get(elevenlabs_guest, "EXAVITQu4vr4xnSDxMaL")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
|
| 937 |
+
# Settings are kept in gr.State() for the current session only (never written to disk, for security)
|
| 938 |
return (
|
| 939 |
status,
|
| 940 |
+
llm_choice,
|
| 941 |
+
own_base_url,
|
| 942 |
+
own_api_key,
|
| 943 |
+
own_model,
|
| 944 |
openai_key,
|
| 945 |
openai_model,
|
| 946 |
+
"elevenlabs", # Always ElevenLabs
|
| 947 |
+
elevenlabs_key,
|
| 948 |
+
host_voice,
|
| 949 |
+
guest_voice,
|
| 950 |
+
length, # Now stores the full choice string
|
| 951 |
is_valid,
|
| 952 |
)
|
| 953 |
|
| 954 |
+
save_settings_btn.click(
|
| 955 |
+
fn=save_settings,
|
| 956 |
+
inputs=[
|
| 957 |
+
llm_choice,
|
| 958 |
+
own_inference_base_url,
|
| 959 |
+
own_inference_api_key,
|
| 960 |
+
own_inference_model,
|
| 961 |
+
openai_key_input,
|
| 962 |
+
openai_model_input,
|
| 963 |
+
elevenlabs_key_input,
|
| 964 |
+
elevenlabs_host_voice,
|
| 965 |
+
elevenlabs_guest_voice,
|
| 966 |
+
podcast_length,
|
| 967 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 968 |
outputs=[
|
| 969 |
settings_status,
|
| 970 |
user_llm_choice,
|
|
|
|
| 996 |
|
| 997 |
# About PaperCast
|
| 998 |
|
| 999 |
+
**The world's first interactive, multi-modal, AI-powered academic podcast studio.**
|
| 1000 |
+
|
| 1001 |
+
Transform any research paper into engaging audio conversations with our proprietary frameworks powered by MCP tools, Gradio 6, and ElevenLabs.
|
| 1002 |
+
|
| 1003 |
+
---
|
| 1004 |
+
|
| 1005 |
+
## 🚀 Revolutionary Frameworks
|
| 1006 |
+
|
| 1007 |
+
We built 4 original frameworks that redefine how people consume research:
|
| 1008 |
+
|
| 1009 |
+
### **PPF** — Podcast Persona Framework
|
| 1010 |
+
Choose from 5 unique conversation styles:
|
| 1011 |
+
- 🤝 **Friendly Explainer** - Casual discussion between friends (default)
|
| 1012 |
+
- ⚔️ **Academic Debate** - Constructive challenges and defenses
|
| 1013 |
+
- 🔥 **Savage Roast** - Brutal critique meets stubborn defense
|
| 1014 |
+
- 🎓 **Pedagogical** - Professor teaching curious student
|
| 1015 |
+
- 🌐 **Interdisciplinary Clash** - Domain expert vs. complete outsider
|
| 1016 |
+
|
| 1017 |
+
### **PAD** — Paper Auto-Discovery
|
| 1018 |
+
- 🔍 Natural language search: "diffusion survey 2025" or "Grok reasoning"
|
| 1019 |
+
- 📚 Semantic Scholar + arXiv API integration
|
| 1020 |
+
- 🎯 Zero-friction paper discovery
|
| 1021 |
+
|
| 1022 |
+
### **PVF** — Paper Visual Framework *(Coming Soon)*
|
| 1023 |
+
- 📄 Synchronized PDF viewer with audio playback
|
| 1024 |
+
- 🎯 Auto-scroll to figures/tables when mentioned
|
| 1025 |
+
- ⏱️ Clickable timestamps in transcript
|
| 1026 |
+
|
| 1027 |
+
### **CPM** — Counterfactual Paper Mode *(Coming Soon)*
|
| 1028 |
+
- 🤔 "What if this paper was written by Yann LeCun?"
|
| 1029 |
+
- 🕰️ "What if GPT-4 never existed?"
|
| 1030 |
+
- 🌀 Alternate reality interpretations
|
| 1031 |
|
| 1032 |
---
|
| 1033 |
|
| 1034 |
## 🎯 How It Works
|
| 1035 |
|
| 1036 |
+
Our intelligent agent orchestrates a multi-step pipeline:
|
| 1037 |
|
| 1038 |
+
1. **📥 Input** - URL, PDF upload, or free-text search
|
| 1039 |
+
2. **📄 Extraction** - Marker-pdf MCP extracts clean markdown with LaTeX
|
| 1040 |
+
3. **🎬 Script Generation** - Claude creates persona-aware dialogue
|
| 1041 |
+
4. **🎤 Voice Synthesis** - ElevenLabs generates premium audio
|
| 1042 |
+
5. **✅ Delivery** - Listen, download, share
|
| 1043 |
|
| 1044 |
---
|
| 1045 |
|
| 1046 |
## 🌟 Key Features
|
| 1047 |
|
| 1048 |
+
✨ **5 Persona Modes** - From friendly to savage
|
| 1049 |
+
🔍 **Smart Paper Search** - Semantic Scholar + arXiv
|
| 1050 |
+
🎙️ **Premium Audio** - ElevenLabs TTS (required)
|
| 1051 |
+
📝 **Multi-format Export** - TXT, SRT, VTT, DOCX *(coming)*
|
| 1052 |
+
🧠 **Agent Intelligence** - MCP-powered autonomous reasoning
|
| 1053 |
+
📚 **History Tracking** - All podcasts saved locally
|
| 1054 |
+
⚡ **Multi-paper Processing** - Batch generation support
|
| 1055 |
|
| 1056 |
---
|
| 1057 |
|
| 1058 |
## 🔧 Technology Stack
|
| 1059 |
|
| 1060 |
**LLM**: {SCRIPT_GENERATION_MODEL}
|
| 1061 |
+
**TTS**: ElevenLabs (Premium AI Voice Synthesis)
|
| 1062 |
+
**PDF Processing**: Marker-pdf MCP Server
|
| 1063 |
+
**Search**: Semantic Scholar Graph API + arXiv API
|
| 1064 |
**Infrastructure**: ☁️ Remote Inference
|
| 1065 |
+
**Framework**: Gradio 6 with MCP Integration
|
| 1066 |
+
**Agent**: Claude with Model Context Protocol
|
| 1067 |
|
| 1068 |
---
|
| 1069 |
|
| 1070 |
## 🎓 Built For
|
| 1071 |
|
| 1072 |
**MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)
|
| 1073 |
+
*Tag: `mcp-in-action-track-consumer`*
|
| 1074 |
|
| 1075 |
+
This project demonstrates:
|
| 1076 |
+
- 🤖 Autonomous agent planning and reasoning
|
| 1077 |
+
- 🔧 MCP tools as cognitive extensions
|
| 1078 |
+
- 🎨 Innovative UI/UX with Gradio 6
|
| 1079 |
+
- 🚀 Real-world impact on research accessibility
|
| 1080 |
|
| 1081 |
---
|
| 1082 |
|
| 1083 |
## 📝 About the Agent
|
| 1084 |
|
| 1085 |
+
PaperCast's autonomous agent:
|
| 1086 |
|
| 1087 |
+
- **Plans** - Analyzes paper structure and selects optimal conversation strategy
|
| 1088 |
+
- **Reasons** - Determines which concepts need simplification based on persona
|
| 1089 |
+
- **Executes** - Orchestrates MCP tools (Marker, Semantic Scholar, arXiv)
|
| 1090 |
+
- **Adapts** - Adjusts dialogue complexity and style per persona mode
|
| 1091 |
+
- **Discovers** - Intelligently searches and retrieves relevant papers
|
| 1092 |
|
| 1093 |
---
|
| 1094 |
|
| 1095 |
## 💡 Use Cases
|
| 1096 |
|
| 1097 |
+
🎧 **Commute Learning** - Listen during travel or exercise
|
| 1098 |
+
📚 **Quick Previews** - Overview before deep reading
|
| 1099 |
+
🌍 **Accessibility** - Make research understandable for everyone
|
| 1100 |
+
🔬 **Stay Current** - Keep up with latest papers effortlessly
|
| 1101 |
+
🎭 **Entertainment** - Savage Roast mode for fun paper critiques
|
| 1102 |
+
🤔 **What-If Scenarios** - Explore counterfactual interpretations
|
| 1103 |
+
|
| 1104 |
+
---
|
| 1105 |
+
|
| 1106 |
+
## 🏆 What Makes Us Different
|
| 1107 |
+
|
| 1108 |
+
**Not just another summarizer** - We invented the Podcast Persona Framework (PPF)
|
| 1109 |
+
**Visual sync** - Paper Visual Framework (PVF) connects audio to figures
|
| 1110 |
+
**Smart discovery** - Paper Auto-Discovery (PAD) finds papers via natural language
|
| 1111 |
+
**Counterfactuals** - Counterfactual Paper Mode (CPM) explores alternate realities
|
| 1112 |
+
**MCP Native** - Built from ground up with Model Context Protocol
|
| 1113 |
+
|
| 1114 |
+
---
|
| 1115 |
+
|
| 1116 |
+
## 🙏 Special Thanks
|
| 1117 |
+
|
| 1118 |
+
This project was made possible by the incredible support from:
|
| 1119 |
+
|
| 1120 |
+
<div style="display: flex; justify-content: center; align-items: center; gap: 80px; margin: 50px 0; flex-wrap: wrap;">
|
| 1121 |
+
<div style="text-align: center;">
|
| 1122 |
+
<a href="https://modal.com" target="_blank">
|
| 1123 |
+
<img src="https://images.prismic.io/contrary-research/aDnorSdWJ-7kSv6V_ModalLabs_Cover.png?auto=format,compress" alt="Modal" style="height: 140px; width: auto; display: block; margin: 0 auto;">
|
| 1124 |
+
</a>
|
| 1125 |
+
</div>
|
| 1126 |
+
<div style="text-align: center;">
|
| 1127 |
+
<a href="https://elevenlabs.io" target="_blank">
|
| 1128 |
+
<img src="https://eleven-public-cdn.elevenlabs.io/payloadcms/9trrmnj2sj8-logo-logo.svg" alt="ElevenLabs" style="height: 100px; width: auto; display: block; margin: 0 auto;">
|
| 1129 |
+
</a>
|
| 1130 |
+
</div>
|
| 1131 |
+
</div>
|
| 1132 |
+
|
| 1133 |
+
**Why we chose these partners:**
|
| 1134 |
+
|
| 1135 |
+
🚀 **Modal** - Serverless AI infrastructure that gives us instant access to powerful GPUs (A100, H100) with sub-second cold starts. Their platform handles automatic scaling, letting us process papers efficiently without managing infrastructure. Perfect for variable workloads and rapid iteration.
|
| 1136 |
+
|
| 1137 |
+
🎙️ **ElevenLabs** - We use their **Turbo v2.5** model for studio-quality voice synthesis. This model delivers incredibly natural, emotionally expressive voices with low latency (~250-300ms) and 50% lower cost. The voice quality makes our podcasts truly engaging and professional.
|
| 1138 |
|
| 1139 |
---
|
| 1140 |
|
| 1141 |
+
Made with ❤️ using Anthropic, OpenAI, Modal, ElevenLabs, Gradio, and MCP
|
| 1142 |
|
| 1143 |
</div>
|
| 1144 |
""")
|
generation/script_generator.py
CHANGED
|
@@ -1,14 +1,9 @@
|
|
| 1 |
-
import base64
|
| 2 |
import json
|
| 3 |
|
| 4 |
import httpx
|
| 5 |
from openai import OpenAI
|
| 6 |
|
| 7 |
from utils.config import (
|
| 8 |
-
DEMO_INFERENCE_KEY,
|
| 9 |
-
DEMO_INFERENCE_URL,
|
| 10 |
-
DEMO_MODE,
|
| 11 |
-
DEMO_MODEL,
|
| 12 |
MAX_TOKENS,
|
| 13 |
SCRIPT_GENERATION_MODEL,
|
| 14 |
TEMPERATURE,
|
|
@@ -18,7 +13,7 @@ from utils.config import (
|
|
| 18 |
class ScriptGenerator:
|
| 19 |
def __init__(
|
| 20 |
self,
|
| 21 |
-
provider_mode="
|
| 22 |
own_base_url=None,
|
| 23 |
own_api_key=None,
|
| 24 |
own_model=None,
|
|
@@ -30,7 +25,7 @@ class ScriptGenerator:
|
|
| 30 |
Initialize ScriptGenerator with flexible provider support.
|
| 31 |
|
| 32 |
Args:
|
| 33 |
-
provider_mode: "
|
| 34 |
own_base_url: Base URL for own inference server
|
| 35 |
own_api_key: API key for own inference server
|
| 36 |
own_model: Model name for own inference server
|
|
@@ -41,20 +36,7 @@ class ScriptGenerator:
|
|
| 41 |
self.provider_mode = provider_mode
|
| 42 |
self.max_tokens = max_tokens or MAX_TOKENS
|
| 43 |
|
| 44 |
-
if provider_mode == "
|
| 45 |
-
# Demo mode - use hardcoded credentials
|
| 46 |
-
print(f"Using Demo Inference: {DEMO_INFERENCE_URL}")
|
| 47 |
-
username, password = DEMO_INFERENCE_KEY.split(":", 1)
|
| 48 |
-
http_client = httpx.Client(auth=(username, password))
|
| 49 |
-
self.client = OpenAI(
|
| 50 |
-
base_url=DEMO_INFERENCE_URL,
|
| 51 |
-
api_key="dummy",
|
| 52 |
-
http_client=http_client,
|
| 53 |
-
)
|
| 54 |
-
self.model_name = DEMO_MODEL
|
| 55 |
-
print("✓ Demo inference client initialized")
|
| 56 |
-
|
| 57 |
-
elif provider_mode == "own_inference":
|
| 58 |
# Own inference server
|
| 59 |
print(f"Connecting to own inference API: {own_base_url}")
|
| 60 |
|
|
@@ -94,18 +76,19 @@ class ScriptGenerator:
|
|
| 94 |
else:
|
| 95 |
raise ValueError(f"Invalid provider_mode: {provider_mode}")
|
| 96 |
|
| 97 |
-
def generate_podcast_script(self, paper_text: str) -> list:
|
| 98 |
"""
|
| 99 |
Generates a podcast script from the given paper text.
|
| 100 |
|
| 101 |
Args:
|
| 102 |
paper_text (str): The text content of the research paper.
|
|
|
|
| 103 |
|
| 104 |
Returns:
|
| 105 |
list: A list of dictionaries representing the dialogue.
|
| 106 |
"""
|
| 107 |
|
| 108 |
-
system_prompt = """You are an expert podcast producer. Your goal is to convert technical research papers into engaging, accessible podcast dialogues between two hosts:
|
| 109 |
- Host (Alex): Enthusiastic, asks clarifying questions, guides the conversation.
|
| 110 |
- Guest (Jamie): Expert researcher, explains concepts simply but accurately.
|
| 111 |
|
|
@@ -113,6 +96,8 @@ CRITICAL RULES:
|
|
| 113 |
1. The Host MUST ALWAYS start with "Welcome to PaperCast!" - This is the show's branding and must never be skipped.
|
| 114 |
2. NEVER read URLs, links, or web addresses out loud in the dialogue. Skip them completely. They sound awkward in audio format.
|
| 115 |
3. NEVER mention arxiv IDs, DOIs, or reference numbers. Focus on the content, not the metadata.
|
|
|
|
|
|
|
| 116 |
|
| 117 |
Output the script in a valid JSON format. The JSON should be a list of objects, where each object has:
|
| 118 |
- "speaker": "Host" or "Guest"
|
|
@@ -121,14 +106,15 @@ Output the script in a valid JSON format. The JSON should be a list of objects,
|
|
| 121 |
|
| 122 |
Example:
|
| 123 |
[
|
| 124 |
-
{"speaker": "Host", "text": "Welcome to PaperCast! Today we're diving into something really cool.", "emotion": "excited"},
|
| 125 |
-
{"speaker": "Guest", "text": "That's right, Alex. We're looking at a new way to handle large language models.", "emotion": "happy"}
|
| 126 |
]
|
| 127 |
|
| 128 |
Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly but effectively.
|
|
|
|
| 129 |
"""
|
| 130 |
|
| 131 |
-
user_prompt = f"Here is the research paper text. Generate a podcast script summarizing the key findings, methodology, and implications.\n\n{paper_text[:10000]}..."
|
| 132 |
|
| 133 |
messages = [
|
| 134 |
{"role": "system", "content": system_prompt},
|
|
@@ -139,15 +125,81 @@ Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly
|
|
| 139 |
f"Generating script with {self.provider_mode} (model: {self.model_name})..."
|
| 140 |
)
|
| 141 |
|
| 142 |
-
# Call LLM
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
temperature
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
generated_text = response.choices[0].message.content
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# Extract JSON from the response
|
| 152 |
try:
|
| 153 |
# Find the first '[' and last ']'
|
|
@@ -156,13 +208,28 @@ Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly
|
|
| 156 |
if start_index != -1 and end_index != -1:
|
| 157 |
json_str = generated_text[start_index:end_index]
|
| 158 |
script = json.loads(json_str)
|
|
|
|
| 159 |
return script
|
| 160 |
else:
|
| 161 |
-
print("No JSON found in output.")
|
|
|
|
|
|
|
|
|
|
| 162 |
return []
|
| 163 |
except json.JSONDecodeError as e:
|
| 164 |
-
print(f"Error parsing JSON: {e}")
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
return []
|
| 167 |
|
| 168 |
|
|
@@ -171,7 +238,7 @@ _generator_instance = None
|
|
| 171 |
|
| 172 |
|
| 173 |
def get_generator(
|
| 174 |
-
provider_mode="
|
| 175 |
own_base_url=None,
|
| 176 |
own_api_key=None,
|
| 177 |
own_model=None,
|
|
@@ -183,7 +250,7 @@ def get_generator(
|
|
| 183 |
Get a script generator instance with flexible provider support.
|
| 184 |
|
| 185 |
Args:
|
| 186 |
-
provider_mode: "
|
| 187 |
own_base_url: Base URL for own inference server
|
| 188 |
own_api_key: API key for own inference server
|
| 189 |
own_model: Model name for own inference server
|
|
@@ -196,41 +263,26 @@ def get_generator(
|
|
| 196 |
"""
|
| 197 |
global _generator_instance
|
| 198 |
|
| 199 |
-
#
|
| 200 |
-
# Reuse demo instance if same config
|
| 201 |
if provider_mode == "openai":
|
| 202 |
if not openai_key:
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
openai_key=openai_key,
|
| 211 |
-
openai_model=openai_model,
|
| 212 |
-
max_tokens=max_tokens or MAX_TOKENS,
|
| 213 |
-
)
|
| 214 |
|
| 215 |
if provider_mode == "own_inference":
|
| 216 |
if not own_base_url:
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
provider_mode="own_inference",
|
| 224 |
-
own_base_url=own_base_url,
|
| 225 |
-
own_api_key=own_api_key,
|
| 226 |
-
own_model=own_model,
|
| 227 |
-
max_tokens=max_tokens or MAX_TOKENS,
|
| 228 |
-
)
|
| 229 |
-
|
| 230 |
-
# Demo mode - reuse global instance
|
| 231 |
-
if _generator_instance is None or provider_mode == "demo":
|
| 232 |
-
_generator_instance = ScriptGenerator(
|
| 233 |
-
provider_mode="demo",
|
| 234 |
max_tokens=max_tokens or MAX_TOKENS,
|
| 235 |
)
|
| 236 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
import httpx
|
| 4 |
from openai import OpenAI
|
| 5 |
|
| 6 |
from utils.config import (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
MAX_TOKENS,
|
| 8 |
SCRIPT_GENERATION_MODEL,
|
| 9 |
TEMPERATURE,
|
|
|
|
| 13 |
class ScriptGenerator:
|
| 14 |
def __init__(
|
| 15 |
self,
|
| 16 |
+
provider_mode="own_inference",
|
| 17 |
own_base_url=None,
|
| 18 |
own_api_key=None,
|
| 19 |
own_model=None,
|
|
|
|
| 25 |
Initialize ScriptGenerator with flexible provider support.
|
| 26 |
|
| 27 |
Args:
|
| 28 |
+
provider_mode: "own_inference" or "openai"
|
| 29 |
own_base_url: Base URL for own inference server
|
| 30 |
own_api_key: API key for own inference server
|
| 31 |
own_model: Model name for own inference server
|
|
|
|
| 36 |
self.provider_mode = provider_mode
|
| 37 |
self.max_tokens = max_tokens or MAX_TOKENS
|
| 38 |
|
| 39 |
+
if provider_mode == "own_inference":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# Own inference server
|
| 41 |
print(f"Connecting to own inference API: {own_base_url}")
|
| 42 |
|
|
|
|
| 76 |
else:
|
| 77 |
raise ValueError(f"Invalid provider_mode: {provider_mode}")
|
| 78 |
|
| 79 |
+
def generate_podcast_script(self, paper_text: str, target_dialogue_count: int = 15) -> list:
|
| 80 |
"""
|
| 81 |
Generates a podcast script from the given paper text.
|
| 82 |
|
| 83 |
Args:
|
| 84 |
paper_text (str): The text content of the research paper.
|
| 85 |
+
target_dialogue_count (int): Target number of dialogue exchanges (default: 15)
|
| 86 |
|
| 87 |
Returns:
|
| 88 |
list: A list of dictionaries representing the dialogue.
|
| 89 |
"""
|
| 90 |
|
| 91 |
+
system_prompt = f"""You are an expert podcast producer. Your goal is to convert technical research papers into engaging, accessible podcast dialogues between two hosts:
|
| 92 |
- Host (Alex): Enthusiastic, asks clarifying questions, guides the conversation.
|
| 93 |
- Guest (Jamie): Expert researcher, explains concepts simply but accurately.
|
| 94 |
|
|
|
|
| 96 |
1. The Host MUST ALWAYS start with "Welcome to PaperCast!" - This is the show's branding and must never be skipped.
|
| 97 |
2. NEVER read URLs, links, or web addresses out loud in the dialogue. Skip them completely. They sound awkward in audio format.
|
| 98 |
3. NEVER mention arxiv IDs, DOIs, or reference numbers. Focus on the content, not the metadata.
|
| 99 |
+
4. Generate EXACTLY {target_dialogue_count} dialogue exchanges (back-and-forth between Host and Guest). Do not exceed this count.
|
| 100 |
+
5. Each exchange should be substantive but concise. Keep individual dialogue turns focused and conversational.
|
| 101 |
|
| 102 |
Output the script in a valid JSON format. The JSON should be a list of objects, where each object has:
|
| 103 |
- "speaker": "Host" or "Guest"
|
|
|
|
| 106 |
|
| 107 |
Example:
|
| 108 |
[
|
| 109 |
+
{{"speaker": "Host", "text": "Welcome to PaperCast! Today we're diving into something really cool.", "emotion": "excited"}},
|
| 110 |
+
{{"speaker": "Guest", "text": "That's right, Alex. We're looking at a new way to handle large language models.", "emotion": "happy"}}
|
| 111 |
]
|
| 112 |
|
| 113 |
Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly but effectively.
|
| 114 |
+
IMPORTANT: Generate exactly {target_dialogue_count} dialogue items total. No more, no less.
|
| 115 |
"""
|
| 116 |
|
| 117 |
+
user_prompt = f"Here is the research paper text. Generate a podcast script with EXACTLY {target_dialogue_count} dialogue exchanges, summarizing the key findings, methodology, and implications.\n\n{paper_text[:10000]}..."
|
| 118 |
|
| 119 |
messages = [
|
| 120 |
{"role": "system", "content": system_prompt},
|
|
|
|
| 125 |
f"Generating script with {self.provider_mode} (model: {self.model_name})..."
|
| 126 |
)
|
| 127 |
|
| 128 |
+
# Call LLM with appropriate parameters
|
| 129 |
+
# OpenAI's newer models use max_completion_tokens instead of max_tokens
|
| 130 |
+
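# JSON mode (response_format) is requested for every OpenAI model.
# NOTE(review): not every model accepts it (e.g. legacy gpt-4, o1-preview/o1-mini), and json_object mode yields a top-level object while the prompt asks for a list - confirm.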
|
| 131 |
+
if self.provider_mode == "openai":
|
| 132 |
+
# Check if this is a reasoning model (o1, o3, o4, or gpt-5 series; any name containing "chat", e.g. gpt-5-chat, is excluded)
|
| 133 |
+
# Reasoning models don't support temperature parameter
|
| 134 |
+
is_reasoning_model = any(
|
| 135 |
+
keyword in self.model_name.lower()
|
| 136 |
+
for keyword in ["o1", "o3", "o4", "gpt-5"]
|
| 137 |
+
) and "chat" not in self.model_name.lower()
|
| 138 |
+
|
| 139 |
+
# Common parameters for all OpenAI models
|
| 140 |
+
common_params = {
|
| 141 |
+
"model": self.model_name,
|
| 142 |
+
"messages": messages,
|
| 143 |
+
"response_format": {"type": "json_object"}, # JSON mode for all OpenAI models
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
if is_reasoning_model:
|
| 147 |
+
# Reasoning models: no temperature parameter
|
| 148 |
+
# Determine appropriate reasoning_effort based on model
|
| 149 |
+
model_lower = self.model_name.lower()
|
| 150 |
+
|
| 151 |
+
# GPT-5 series supports "minimal" (fastest)
|
| 152 |
+
# O-series only supports "low", "medium", "high"
|
| 153 |
+
# 2x multiplier: user's desired tokens + reasoning headroom
|
| 154 |
+
if "gpt-5" in model_lower:
|
| 155 |
+
# GPT-5, GPT-5-mini, GPT-5-nano, GPT-5.1 all support "minimal"
|
| 156 |
+
reasoning_effort_value = "minimal"
|
| 157 |
+
reasoning_tokens = self.max_tokens * 2 # 2x: desired output + minimal reasoning
|
| 158 |
+
effort_desc = "minimal (fastest for GPT-5 series)"
|
| 159 |
+
elif any(x in model_lower for x in ["o1-preview", "o1-mini"]):
|
| 160 |
+
# Old O-series don't support reasoning_effort parameter at all
|
| 161 |
+
reasoning_effort_value = None
|
| 162 |
+
reasoning_tokens = self.max_tokens * 2 # 2x for default reasoning
|
| 163 |
+
effort_desc = "default (no reasoning_effort support)"
|
| 164 |
+
else:
|
| 165 |
+
# O1, O3, O4 series support "low" as minimum
|
| 166 |
+
reasoning_effort_value = "low"
|
| 167 |
+
reasoning_tokens = self.max_tokens * 2 # 2x: desired output + low reasoning
|
| 168 |
+
effort_desc = "low (fastest for O-series)"
|
| 169 |
+
|
| 170 |
+
print(f" ℹ️ Reasoning model detected - temperature disabled, tokens increased to {reasoning_tokens}")
|
| 171 |
+
print(f" (Using {effort_desc} + JSON mode)")
|
| 172 |
+
|
| 173 |
+
# Add reasoning-specific parameters
|
| 174 |
+
common_params["max_completion_tokens"] = reasoning_tokens
|
| 175 |
+
if reasoning_effort_value:
|
| 176 |
+
common_params["reasoning_effort"] = reasoning_effort_value
|
| 177 |
+
|
| 178 |
+
response = self.client.chat.completions.create(**common_params)
|
| 179 |
+
else:
|
| 180 |
+
# Regular chat models: include temperature
|
| 181 |
+
print(f" ℹ️ Chat model with JSON mode enabled")
|
| 182 |
+
common_params["max_completion_tokens"] = self.max_tokens
|
| 183 |
+
common_params["temperature"] = TEMPERATURE
|
| 184 |
+
response = self.client.chat.completions.create(**common_params)
|
| 185 |
+
else:
|
| 186 |
+
# Own inference servers typically use max_tokens
|
| 187 |
+
response = self.client.chat.completions.create(
|
| 188 |
+
model=self.model_name,
|
| 189 |
+
messages=messages,
|
| 190 |
+
max_tokens=self.max_tokens,
|
| 191 |
+
temperature=TEMPERATURE,
|
| 192 |
+
)
|
| 193 |
generated_text = response.choices[0].message.content
|
| 194 |
|
| 195 |
+
# Debug: Print raw output info
|
| 196 |
+
print(f" 📄 Response length: {len(generated_text) if generated_text else 0} characters")
|
| 197 |
+
|
| 198 |
+
if not generated_text:
|
| 199 |
+
print("❌ Error: Model returned empty response")
|
| 200 |
+
print(f"Full response object: {response}")
|
| 201 |
+
return []
|
| 202 |
+
|
| 203 |
# Extract JSON from the response
|
| 204 |
try:
|
| 205 |
# Find the first '[' and last ']'
|
|
|
|
| 208 |
if start_index != -1 and end_index != -1:
|
| 209 |
json_str = generated_text[start_index:end_index]
|
| 210 |
script = json.loads(json_str)
|
| 211 |
+
print(f" ✅ Successfully parsed {len(script)} dialogue items")
|
| 212 |
return script
|
| 213 |
else:
|
| 214 |
+
print("❌ No JSON found in output.")
|
| 215 |
+
print(f"📝 Raw output preview (first 500 chars):\n{generated_text[:500]}")
|
| 216 |
+
if len(generated_text) > 500:
|
| 217 |
+
print(f"... (truncated, total length: {len(generated_text)})")
|
| 218 |
return []
|
| 219 |
except json.JSONDecodeError as e:
|
| 220 |
+
print(f"❌ Error parsing JSON: {e}")
|
| 221 |
+
|
| 222 |
+
# Show context around the error location
|
| 223 |
+
if hasattr(e, 'pos') and e.pos:
|
| 224 |
+
error_pos = e.pos
|
| 225 |
+
context_start = max(0, error_pos - 200)
|
| 226 |
+
context_end = min(len(generated_text), error_pos + 200)
|
| 227 |
+
print(f"\n📍 Error at position {error_pos}:")
|
| 228 |
+
print(f"Context:\n...{generated_text[context_start:context_end]}...")
|
| 229 |
+
|
| 230 |
+
print(f"\n📝 Full output (first 1000 chars):\n{generated_text[:1000]}")
|
| 231 |
+
if len(generated_text) > 1000:
|
| 232 |
+
print(f"\n... (truncated, total length: {len(generated_text)} chars)")
|
| 233 |
return []
|
| 234 |
|
| 235 |
|
|
|
|
| 238 |
|
| 239 |
|
| 240 |
def get_generator(
|
| 241 |
+
provider_mode="own_inference",
|
| 242 |
own_base_url=None,
|
| 243 |
own_api_key=None,
|
| 244 |
own_model=None,
|
|
|
|
| 250 |
Get a script generator instance with flexible provider support.
|
| 251 |
|
| 252 |
Args:
|
| 253 |
+
provider_mode: "own_inference" or "openai"
|
| 254 |
own_base_url: Base URL for own inference server
|
| 255 |
own_api_key: API key for own inference server
|
| 256 |
own_model: Model name for own inference server
|
|
|
|
| 263 |
"""
|
| 264 |
global _generator_instance
|
| 265 |
|
| 266 |
+
# Create instance based on provider mode
|
|
|
|
| 267 |
if provider_mode == "openai":
|
| 268 |
if not openai_key:
|
| 269 |
+
raise ValueError("OpenAI API key is required for OpenAI provider mode")
|
| 270 |
+
return ScriptGenerator(
|
| 271 |
+
provider_mode="openai",
|
| 272 |
+
openai_key=openai_key,
|
| 273 |
+
openai_model=openai_model,
|
| 274 |
+
max_tokens=max_tokens or MAX_TOKENS,
|
| 275 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
if provider_mode == "own_inference":
|
| 278 |
if not own_base_url:
|
| 279 |
+
raise ValueError("Base URL is required for own inference provider mode")
|
| 280 |
+
return ScriptGenerator(
|
| 281 |
+
provider_mode="own_inference",
|
| 282 |
+
own_base_url=own_base_url,
|
| 283 |
+
own_api_key=own_api_key,
|
| 284 |
+
own_model=own_model,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
max_tokens=max_tokens or MAX_TOKENS,
|
| 286 |
)
|
| 287 |
+
|
| 288 |
+
raise ValueError(f"Invalid provider_mode: {provider_mode}")
|
live.py
CHANGED
|
@@ -3,50 +3,65 @@ import subprocess
|
|
| 3 |
import datetime
|
| 4 |
|
| 5 |
# ---------------------------------------------------------------------------
|
| 6 |
-
#
|
| 7 |
-
# Örnek: curl -X POST http://api.example.com/update
|
| 8 |
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
CURL_COMMAND = """
|
| 10 |
-
curl --location '
|
| 11 |
--header 'Content-Type: application/json' \
|
| 12 |
-
--header 'Authorization:
|
| 13 |
--data '{
|
| 14 |
-
"model": "
|
| 15 |
"messages": [
|
| 16 |
{
|
| 17 |
"role": "user",
|
| 18 |
-
"content": "
|
| 19 |
}
|
| 20 |
]
|
| 21 |
}'
|
| 22 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# ---------------------------------------------------------------------------
|
| 24 |
|
| 25 |
def run_periodically():
|
| 26 |
-
print(f"
|
| 27 |
-
print(f"
|
| 28 |
print("-" * 50)
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
while True:
|
| 31 |
try:
|
| 32 |
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
| 33 |
-
print(f"[{current_time}]
|
| 34 |
-
|
| 35 |
-
# shell=True makes the command run the same way it would in a terminal
|
| 36 |
result = subprocess.run(CURL_COMMAND, shell=True, capture_output=True, text=True)
|
| 37 |
-
|
| 38 |
if result.returncode == 0:
|
| 39 |
-
print(f"
|
| 40 |
else:
|
| 41 |
-
print(f"
|
| 42 |
-
print(f"
|
| 43 |
-
|
| 44 |
except Exception as e:
|
| 45 |
-
print(f"
|
| 46 |
-
|
| 47 |
-
print("
|
| 48 |
print("-" * 50)
|
| 49 |
-
time.sleep(
|
| 50 |
|
| 51 |
if __name__ == "__main__":
|
| 52 |
run_periodically()
|
|
|
|
| 3 |
import datetime
|
| 4 |
|
| 5 |
# ---------------------------------------------------------------------------
|
| 6 |
+
# OPTIONAL: Keep-Alive Script for Inference Servers
|
|
|
|
| 7 |
# ---------------------------------------------------------------------------
|
| 8 |
+
# This script sends periodic requests to keep inference servers active.
|
| 9 |
+
# Configure your own endpoint and credentials below if needed.
|
| 10 |
+
#
|
| 11 |
+
# Note: This is optional and only useful if you're hosting your own
|
| 12 |
+
# inference server that goes to sleep after inactivity.
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
# Replace with your own inference endpoint and credentials
|
| 16 |
CURL_COMMAND = """
|
| 17 |
+
curl --location 'YOUR_INFERENCE_URL_HERE' \
|
| 18 |
--header 'Content-Type: application/json' \
|
| 19 |
+
--header 'Authorization: YOUR_AUTH_HEADER_HERE' \
|
| 20 |
--data '{
|
| 21 |
+
"model": "your-model-name",
|
| 22 |
"messages": [
|
| 23 |
{
|
| 24 |
"role": "user",
|
| 25 |
+
"content": "Hello, this is a keep-alive ping."
|
| 26 |
}
|
| 27 |
]
|
| 28 |
}'
|
| 29 |
"""
|
| 30 |
+
|
| 31 |
+
# How often to send requests (in seconds)
|
| 32 |
+
INTERVAL_SECONDS = 60
|
| 33 |
+
|
| 34 |
# ---------------------------------------------------------------------------
|
| 35 |
|
| 36 |
def run_periodically():
|
| 37 |
+
print(f"Keep-alive script started: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 38 |
+
print(f"Interval: {INTERVAL_SECONDS} seconds")
|
| 39 |
print("-" * 50)
|
| 40 |
+
|
| 41 |
+
if "YOUR_INFERENCE_URL_HERE" in CURL_COMMAND:
|
| 42 |
+
print("⚠️ WARNING: Please configure CURL_COMMAND with your actual endpoint!")
|
| 43 |
+
print("⚠️ Edit this file and replace the placeholder values.")
|
| 44 |
+
return
|
| 45 |
+
|
| 46 |
while True:
|
| 47 |
try:
|
| 48 |
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
| 49 |
+
print(f"[{current_time}] Sending keep-alive request...")
|
| 50 |
+
|
|
|
|
| 51 |
result = subprocess.run(CURL_COMMAND, shell=True, capture_output=True, text=True)
|
| 52 |
+
|
| 53 |
if result.returncode == 0:
|
| 54 |
+
print(f"✓ Success! Response (first 100 chars): {result.stdout[:100]}...")
|
| 55 |
else:
|
| 56 |
+
print(f"✗ Error code: {result.returncode}")
|
| 57 |
+
print(f"Error output: {result.stderr}")
|
| 58 |
+
|
| 59 |
except Exception as e:
|
| 60 |
+
print(f"Unexpected error: {e}")
|
| 61 |
+
|
| 62 |
+
print(f"Waiting {INTERVAL_SECONDS} seconds...")
|
| 63 |
print("-" * 50)
|
| 64 |
+
time.sleep(INTERVAL_SECONDS)
|
| 65 |
|
| 66 |
if __name__ == "__main__":
|
| 67 |
run_periodically()
|
output/history.json
CHANGED
|
@@ -54,5 +54,12 @@
|
|
| 54 |
"script_length": 11,
|
| 55 |
"timestamp": "2025-11-19 23:07:42",
|
| 56 |
"audio_filename": "podcast_20251119_230742.wav"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
}
|
| 58 |
]
|
|
|
|
| 54 |
"script_length": 11,
|
| 55 |
"timestamp": "2025-11-19 23:07:42",
|
| 56 |
"audio_filename": "podcast_20251119_230742.wav"
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"url": "https://arxiv.org/abs/2204.06125",
|
| 60 |
+
"audio_path": "/home/batuhan/lab/papercast/output/podcast_20251121_221210.wav",
|
| 61 |
+
"script_length": 21,
|
| 62 |
+
"timestamp": "2025-11-21 22:12:10",
|
| 63 |
+
"audio_filename": "podcast_20251121_221210.wav"
|
| 64 |
}
|
| 65 |
]
|
requirements.txt
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
beautifulsoup4
|
| 2 |
-
edge-tts
|
| 3 |
elevenlabs
|
| 4 |
gradio
|
| 5 |
mcp
|
|
|
|
| 1 |
beautifulsoup4
|
|
|
|
| 2 |
elevenlabs
|
| 3 |
gradio
|
| 4 |
mcp
|
synthesis/tts_engine.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
import os
|
| 3 |
from datetime import datetime
|
| 4 |
from io import BytesIO
|
| 5 |
|
| 6 |
-
import edge_tts
|
| 7 |
from elevenlabs import ElevenLabs, VoiceSettings
|
| 8 |
from pydub import AudioSegment
|
| 9 |
|
|
@@ -14,38 +12,7 @@ from utils.config import (
|
|
| 14 |
OUTPUT_DIR,
|
| 15 |
)
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
EDGE_TTS_VOICES = {
|
| 19 |
-
# English (US) - Male
|
| 20 |
-
"Guy (US Male - Casual)": "en-US-GuyNeural",
|
| 21 |
-
"Christopher (US Male - Authoritative)": "en-US-ChristopherNeural",
|
| 22 |
-
"Eric (US Male - Professional)": "en-US-EricNeural",
|
| 23 |
-
"Steffan (US Male - Energetic)": "en-US-SteffanNeural",
|
| 24 |
-
"Roger (US Male - Elderly)": "en-US-RogerNeural",
|
| 25 |
-
|
| 26 |
-
# English (US) - Female
|
| 27 |
-
"Jenny (US Female - Friendly)": "en-US-JennyNeural",
|
| 28 |
-
"Aria (US Female - Professional)": "en-US-AriaNeural",
|
| 29 |
-
"Michelle (US Female - Enthusiastic)": "en-US-MichelleNeural",
|
| 30 |
-
"Sara (US Female - News Anchor)": "en-US-SaraNeural",
|
| 31 |
-
"Ana (US Female - Child)": "en-US-AnaNeural",
|
| 32 |
-
|
| 33 |
-
# English (UK)
|
| 34 |
-
"Ryan (UK Male)": "en-GB-RyanNeural",
|
| 35 |
-
"Thomas (UK Male - Elderly)": "en-GB-ThomasNeural",
|
| 36 |
-
"Sonia (UK Female)": "en-GB-SoniaNeural",
|
| 37 |
-
"Libby (UK Female - Enthusiastic)": "en-GB-LibbyNeural",
|
| 38 |
-
|
| 39 |
-
# English (Australia)
|
| 40 |
-
"William (AU Male)": "en-AU-WilliamNeural",
|
| 41 |
-
"Natasha (AU Female)": "en-AU-NatashaNeural",
|
| 42 |
-
|
| 43 |
-
# English (India)
|
| 44 |
-
"Prabhat (IN Male)": "en-IN-PrabhatNeural",
|
| 45 |
-
"Neerja (IN Female)": "en-IN-NeerjaNeural",
|
| 46 |
-
}
|
| 47 |
-
|
| 48 |
-
# ElevenLabs Voice Options (popular voices)
|
| 49 |
ELEVENLABS_VOICES = {
|
| 50 |
# Male Voices
|
| 51 |
"Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
|
|
@@ -80,58 +47,45 @@ def generate_unique_filename():
|
|
| 80 |
|
| 81 |
|
| 82 |
class TTSEngine:
|
| 83 |
-
def __init__(self, tts_provider="
|
| 84 |
"""
|
| 85 |
-
Initialize TTS Engine with
|
| 86 |
|
| 87 |
Args:
|
| 88 |
-
tts_provider:
|
| 89 |
-
custom_api_key: API key for ElevenLabs (
|
| 90 |
-
host_voice: Voice ID
|
| 91 |
-
guest_voice: Voice ID
|
| 92 |
"""
|
| 93 |
-
self.mode =
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE
|
| 104 |
-
|
| 105 |
-
if custom_api_key:
|
| 106 |
-
print("✓ ElevenLabs TTS ready (custom API key)")
|
| 107 |
-
else:
|
| 108 |
-
print("✓ ElevenLabs TTS ready (demo API key)")
|
| 109 |
-
|
| 110 |
-
# Print selected voices
|
| 111 |
-
host_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id]
|
| 112 |
-
guest_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id]
|
| 113 |
-
print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
|
| 114 |
-
print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")
|
| 115 |
-
|
| 116 |
-
elif self.mode == "edge-tts":
|
| 117 |
-
print("Initializing Edge-TTS (Microsoft)...")
|
| 118 |
-
# Use custom voices or defaults
|
| 119 |
-
self.host_voice = host_voice if host_voice else "en-US-GuyNeural"
|
| 120 |
-
self.guest_voice = guest_voice if guest_voice else "en-US-JennyNeural"
|
| 121 |
-
print("✓ Edge-TTS ready (free, no API key required)")
|
| 122 |
-
|
| 123 |
-
# Print selected voices
|
| 124 |
-
host_name = [k for k, v in EDGE_TTS_VOICES.items() if v == self.host_voice]
|
| 125 |
-
guest_name = [k for k, v in EDGE_TTS_VOICES.items() if v == self.guest_voice]
|
| 126 |
-
print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
|
| 127 |
-
print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
else:
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
def synthesize_dialogue(self, script: list) -> str:
|
| 133 |
"""
|
| 134 |
-
Synthesize the script to audio using
|
| 135 |
|
| 136 |
Args:
|
| 137 |
script: List of dialogue items
|
|
@@ -139,12 +93,7 @@ class TTSEngine:
|
|
| 139 |
Returns:
|
| 140 |
str: Path to the generated audio file
|
| 141 |
"""
|
| 142 |
-
|
| 143 |
-
return self._synthesize_elevenlabs(script)
|
| 144 |
-
elif self.mode == "edge-tts":
|
| 145 |
-
return self._synthesize_edge_tts(script)
|
| 146 |
-
else:
|
| 147 |
-
raise ValueError(f"Unknown TTS mode: {self.mode}")
|
| 148 |
|
| 149 |
def _synthesize_elevenlabs(self, script: list) -> str:
|
| 150 |
"""Synthesize using ElevenLabs API"""
|
|
@@ -154,23 +103,30 @@ class TTSEngine:
|
|
| 154 |
for i, item in enumerate(script):
|
| 155 |
text = item["text"]
|
| 156 |
speaker = item["speaker"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
# Select voice based on speaker
|
| 159 |
voice_id = self.guest_voice_id if speaker == "Guest" else self.host_voice_id
|
| 160 |
|
| 161 |
try:
|
| 162 |
-
print(f"Synthesizing line {i + 1}/{len(script)} ({speaker})...")
|
| 163 |
|
| 164 |
-
# Generate audio using ElevenLabs
|
|
|
|
| 165 |
audio_generator = self.client.text_to_speech.convert(
|
| 166 |
voice_id=voice_id,
|
| 167 |
text=text,
|
| 168 |
-
model_id="
|
| 169 |
voice_settings=VoiceSettings(
|
| 170 |
-
stability=0.5
|
| 171 |
-
similarity_boost=0.75
|
| 172 |
-
style=0.5
|
| 173 |
-
use_speaker_boost=True,
|
| 174 |
),
|
| 175 |
)
|
| 176 |
|
|
@@ -207,123 +163,20 @@ class TTSEngine:
|
|
| 207 |
|
| 208 |
return output_path
|
| 209 |
|
| 210 |
-
def _synthesize_edge_tts(self, script: list) -> str:
|
| 211 |
-
"""Synthesize using Edge-TTS (Microsoft)"""
|
| 212 |
-
print("Synthesizing audio via Edge-TTS (Microsoft)...")
|
| 213 |
-
audio_segments = []
|
| 214 |
-
|
| 215 |
-
for i, item in enumerate(script):
|
| 216 |
-
text = item["text"]
|
| 217 |
-
speaker = item["speaker"]
|
| 218 |
-
|
| 219 |
-
# Select voice based on speaker
|
| 220 |
-
voice = self.guest_voice if speaker == "Guest" else self.host_voice
|
| 221 |
-
|
| 222 |
-
try:
|
| 223 |
-
print(f"Synthesizing line {i + 1}/{len(script)} ({speaker})...")
|
| 224 |
-
|
| 225 |
-
# Generate audio using Edge-TTS (synchronous wrapper for async)
|
| 226 |
-
audio_bytes = asyncio.run(self._edge_tts_synthesize(text, voice))
|
| 227 |
-
|
| 228 |
-
# Convert to AudioSegment
|
| 229 |
-
audio_segment = AudioSegment.from_mp3(BytesIO(audio_bytes))
|
| 230 |
-
|
| 231 |
-
# Trim silence from the end of the audio (Edge-TTS adds trailing silence)
|
| 232 |
-
# Detect silence threshold: -40 dBFS
|
| 233 |
-
audio_segment = self._trim_silence(audio_segment)
|
| 234 |
-
|
| 235 |
-
audio_segments.append(audio_segment)
|
| 236 |
-
|
| 237 |
-
# Add minimal silence between speakers (50ms for natural flow)
|
| 238 |
-
silence = AudioSegment.silent(duration=50)
|
| 239 |
-
audio_segments.append(silence)
|
| 240 |
-
|
| 241 |
-
print(f"✓ Synthesized line {i + 1}/{len(script)}")
|
| 242 |
-
|
| 243 |
-
except Exception as e:
|
| 244 |
-
print(f"Error synthesizing line '{text[:50]}...': {e}")
|
| 245 |
-
# Continue with next line even if one fails
|
| 246 |
-
|
| 247 |
-
if not audio_segments:
|
| 248 |
-
print("No audio generated")
|
| 249 |
-
return ""
|
| 250 |
-
|
| 251 |
-
# Combine all segments
|
| 252 |
-
print("Combining audio segments...")
|
| 253 |
-
combined = sum(audio_segments)
|
| 254 |
-
|
| 255 |
-
# Export as WAV with unique filename
|
| 256 |
-
filename = generate_unique_filename()
|
| 257 |
-
output_path = os.path.join(OUTPUT_DIR, filename)
|
| 258 |
-
combined.export(output_path, format="wav")
|
| 259 |
-
print(f"✓ Podcast saved to: {output_path}")
|
| 260 |
-
|
| 261 |
-
return output_path
|
| 262 |
-
|
| 263 |
-
async def _edge_tts_synthesize(self, text: str, voice: str) -> bytes:
|
| 264 |
-
"""
|
| 265 |
-
Async helper to synthesize text using Edge-TTS.
|
| 266 |
-
|
| 267 |
-
Args:
|
| 268 |
-
text: Text to synthesize
|
| 269 |
-
voice: Voice name to use
|
| 270 |
-
|
| 271 |
-
Returns:
|
| 272 |
-
bytes: Audio data in MP3 format
|
| 273 |
-
"""
|
| 274 |
-
communicate = edge_tts.Communicate(text, voice)
|
| 275 |
-
audio_data = b""
|
| 276 |
-
|
| 277 |
-
async for chunk in communicate.stream():
|
| 278 |
-
if chunk["type"] == "audio":
|
| 279 |
-
audio_data += chunk["data"]
|
| 280 |
-
|
| 281 |
-
return audio_data
|
| 282 |
-
|
| 283 |
-
def _trim_silence(self, audio_segment, silence_thresh=-40, chunk_size=10):
|
| 284 |
-
"""
|
| 285 |
-
Trim silence from the end of audio segment.
|
| 286 |
-
|
| 287 |
-
Args:
|
| 288 |
-
audio_segment: AudioSegment to trim
|
| 289 |
-
silence_thresh: Silence threshold in dBFS (default: -40)
|
| 290 |
-
chunk_size: Size of chunks to analyze in ms (default: 10)
|
| 291 |
-
|
| 292 |
-
Returns:
|
| 293 |
-
Trimmed AudioSegment
|
| 294 |
-
"""
|
| 295 |
-
# Start from the end and find where audio actually ends
|
| 296 |
-
trim_ms = 0
|
| 297 |
-
|
| 298 |
-
# Check from the end in chunks
|
| 299 |
-
for i in range(len(audio_segment) - chunk_size, 0, -chunk_size):
|
| 300 |
-
chunk = audio_segment[i:i + chunk_size]
|
| 301 |
-
if chunk.dBFS > silence_thresh:
|
| 302 |
-
# Found non-silent audio
|
| 303 |
-
trim_ms = i + chunk_size
|
| 304 |
-
break
|
| 305 |
-
|
| 306 |
-
# If we found non-silent audio, trim there
|
| 307 |
-
if trim_ms > 0:
|
| 308 |
-
return audio_segment[:trim_ms]
|
| 309 |
-
|
| 310 |
-
# Otherwise return original
|
| 311 |
-
return audio_segment
|
| 312 |
-
|
| 313 |
|
| 314 |
# Global instance
|
| 315 |
_tts_instance = None
|
| 316 |
|
| 317 |
|
| 318 |
-
def get_tts_engine(tts_provider="
|
| 319 |
"""
|
| 320 |
-
Get TTS engine instance with
|
| 321 |
|
| 322 |
Args:
|
| 323 |
-
tts_provider:
|
| 324 |
-
custom_api_key:
|
| 325 |
-
host_voice: Voice ID
|
| 326 |
-
guest_voice: Voice ID
|
| 327 |
|
| 328 |
Returns:
|
| 329 |
TTSEngine instance
|
|
@@ -331,7 +184,7 @@ def get_tts_engine(tts_provider="edge-tts", custom_api_key=None, host_voice=None
|
|
| 331 |
global _tts_instance
|
| 332 |
|
| 333 |
# Always create new instance if custom settings provided
|
| 334 |
-
if custom_api_key or tts_provider != "
|
| 335 |
return TTSEngine(
|
| 336 |
tts_provider=tts_provider,
|
| 337 |
custom_api_key=custom_api_key,
|
|
@@ -339,7 +192,7 @@ def get_tts_engine(tts_provider="edge-tts", custom_api_key=None, host_voice=None
|
|
| 339 |
guest_voice=guest_voice
|
| 340 |
)
|
| 341 |
|
| 342 |
-
# Otherwise, reuse global instance (for default
|
| 343 |
if _tts_instance is None:
|
| 344 |
-
_tts_instance = TTSEngine(tts_provider="
|
| 345 |
return _tts_instance
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from datetime import datetime
|
| 3 |
from io import BytesIO
|
| 4 |
|
|
|
|
| 5 |
from elevenlabs import ElevenLabs, VoiceSettings
|
| 6 |
from pydub import AudioSegment
|
| 7 |
|
|
|
|
| 12 |
OUTPUT_DIR,
|
| 13 |
)
|
| 14 |
|
| 15 |
+
# ElevenLabs Voice Options
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
ELEVENLABS_VOICES = {
|
| 17 |
# Male Voices
|
| 18 |
"Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
class TTSEngine:
|
| 50 |
+
def __init__(self, tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
|
| 51 |
"""
|
| 52 |
+
Initialize TTS Engine with ElevenLabs.
|
| 53 |
|
| 54 |
Args:
|
| 55 |
+
tts_provider: Ignored; ElevenLabs is always used (kept for compatibility)
|
| 56 |
+
custom_api_key: API key for ElevenLabs (optional; falls back to ELEVENLABS_API_KEY, and a ValueError is raised if neither is set)
|
| 57 |
+
host_voice: Voice ID for Host (optional, uses default if not provided)
|
| 58 |
+
guest_voice: Voice ID for Guest (optional, uses default if not provided)
|
| 59 |
"""
|
| 60 |
+
self.mode = "elevenlabs"
|
| 61 |
+
|
| 62 |
+
print("Initializing ElevenLabs TTS API...")
|
| 63 |
+
# Use custom key if provided, otherwise use default
|
| 64 |
+
api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
|
| 65 |
+
|
| 66 |
+
if not api_key:
|
| 67 |
+
raise ValueError("ElevenLabs API key is required")
|
| 68 |
+
|
| 69 |
+
self.client = ElevenLabs(api_key=api_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
+
# Use custom voices or defaults
|
| 72 |
+
self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
|
| 73 |
+
self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE
|
| 74 |
+
|
| 75 |
+
if custom_api_key:
|
| 76 |
+
print("✓ ElevenLabs TTS ready (custom API key)")
|
| 77 |
else:
|
| 78 |
+
print("✓ ElevenLabs TTS ready")
|
| 79 |
+
|
| 80 |
+
# Print selected voices
|
| 81 |
+
host_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id]
|
| 82 |
+
guest_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id]
|
| 83 |
+
print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
|
| 84 |
+
print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")
|
| 85 |
|
| 86 |
def synthesize_dialogue(self, script: list) -> str:
|
| 87 |
"""
|
| 88 |
+
Synthesize the script to audio using ElevenLabs.
|
| 89 |
|
| 90 |
Args:
|
| 91 |
script: List of dialogue items
|
|
|
|
| 93 |
Returns:
|
| 94 |
str: Path to the generated audio file
|
| 95 |
"""
|
| 96 |
+
return self._synthesize_elevenlabs(script)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
def _synthesize_elevenlabs(self, script: list) -> str:
|
| 99 |
"""Synthesize using ElevenLabs API"""
|
|
|
|
| 103 |
for i, item in enumerate(script):
|
| 104 |
text = item["text"]
|
| 105 |
speaker = item["speaker"]
|
| 106 |
+
emotion = item.get("emotion", "neutral")
|
| 107 |
+
|
| 108 |
+
# Note: ElevenLabs doesn't have a direct emotion parameter.
|
| 109 |
+
# Emotion is conveyed through the text content itself (exclamation marks, word choice, etc.)
|
| 110 |
+
# which the script generator already creates based on the emotion field.
|
| 111 |
+
# We log the emotion for debugging but don't modify the text (would be spoken out loud).
|
| 112 |
|
| 113 |
# Select voice based on speaker
|
| 114 |
voice_id = self.guest_voice_id if speaker == "Guest" else self.host_voice_id
|
| 115 |
|
| 116 |
try:
|
| 117 |
+
print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
|
| 118 |
|
| 119 |
+
# Generate audio using ElevenLabs with Turbo v2.5 for better quality and speed
|
| 120 |
+
# Turbo v2.5: High quality, low latency (~250-300ms), 50% cheaper than v2
|
| 121 |
audio_generator = self.client.text_to_speech.convert(
|
| 122 |
voice_id=voice_id,
|
| 123 |
text=text,
|
| 124 |
+
model_id="eleven_turbo_v2_5", # Upgraded from multilingual_v2 for better quality
|
| 125 |
voice_settings=VoiceSettings(
|
| 126 |
+
stability=0.4, # Lower = more expressiveness and variation (default: 0.5)
|
| 127 |
+
similarity_boost=0.8, # Higher = better voice consistency (default: 0.75)
|
| 128 |
+
style=0.6, # Higher = more dynamic, expressive delivery (default: 0.5)
|
| 129 |
+
use_speaker_boost=True, # Enhances similarity to original voice
|
| 130 |
),
|
| 131 |
)
|
| 132 |
|
|
|
|
| 163 |
|
| 164 |
return output_path
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
# Global instance
|
| 168 |
_tts_instance = None
|
| 169 |
|
| 170 |
|
| 171 |
+
def get_tts_engine(tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
|
| 172 |
"""
|
| 173 |
+
Get TTS engine instance with ElevenLabs.
|
| 174 |
|
| 175 |
Args:
|
| 176 |
+
tts_provider: Must be "elevenlabs" (kept for compatibility)
|
| 177 |
+
custom_api_key: ElevenLabs API key (optional; when omitted, the engine falls back to ELEVENLABS_API_KEY)
|
| 178 |
+
host_voice: Voice ID for Host (optional)
|
| 179 |
+
guest_voice: Voice ID for Guest (optional)
|
| 180 |
|
| 181 |
Returns:
|
| 182 |
TTSEngine instance
|
|
|
|
| 184 |
global _tts_instance
|
| 185 |
|
| 186 |
# Always create new instance if custom settings provided
|
| 187 |
+
if custom_api_key or tts_provider != "elevenlabs" or host_voice or guest_voice:
|
| 188 |
return TTSEngine(
|
| 189 |
tts_provider=tts_provider,
|
| 190 |
custom_api_key=custom_api_key,
|
|
|
|
| 192 |
guest_voice=guest_voice
|
| 193 |
)
|
| 194 |
|
| 195 |
+
# Otherwise, reuse global instance (for default ElevenLabs)
|
| 196 |
if _tts_instance is None:
|
| 197 |
+
_tts_instance = TTSEngine(tts_provider="elevenlabs")
|
| 198 |
return _tts_instance
|
utils/config.py
CHANGED
|
@@ -5,39 +5,23 @@ from dotenv import load_dotenv
|
|
| 5 |
# Load environment variables from .env.local
|
| 6 |
load_dotenv(os.path.join(os.path.dirname(os.path.dirname(__file__)), ".env.local"))
|
| 7 |
|
| 8 |
-
# Demo Mode Configuration - Load from environment variable
|
| 9 |
-
# Set DEMO_MODE=true in .env.local or HuggingFace Spaces secrets
|
| 10 |
-
DEMO_MODE = True
|
| 11 |
-
|
| 12 |
# Model Configurations
|
| 13 |
SCRIPT_GENERATION_MODEL = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# TTS API Settings (ElevenLabs)
|
| 20 |
-
|
| 21 |
-
# Load from .env.local
|
| 22 |
-
ELEVENLABS_API_KEY = os.getenv("DEMO_TTS_KEY")
|
| 23 |
|
| 24 |
# ElevenLabs Voice IDs (you can change these to different voices)
|
| 25 |
# Find more voices at: https://api.elevenlabs.io/v1/voices
|
| 26 |
ELEVENLABS_HOST_VOICE = "ErXwobaYiN019PkySvjV" # Antoni - male voice for Host
|
| 27 |
ELEVENLABS_GUEST_VOICE = "EXAVITQu4vr4xnSDxMaL" # Bella - female voice for Guest
|
| 28 |
|
| 29 |
-
# Demo Mode Settings (loaded from .env.local)
|
| 30 |
-
DEMO_INFERENCE_URL = INFERENCE_API_URL
|
| 31 |
-
DEMO_INFERENCE_KEY = INFERENCE_API_KEY
|
| 32 |
-
DEMO_MODEL = SCRIPT_GENERATION_MODEL
|
| 33 |
-
DEMO_TTS_KEY = ELEVENLABS_API_KEY
|
| 34 |
-
|
| 35 |
-
# Optional: Additional API keys for non-demo mode
|
| 36 |
-
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 37 |
-
CUSTOM_ELEVENLABS_KEY = os.getenv("CUSTOM_ELEVENLABS_KEY", "")
|
| 38 |
-
CUSTOM_INFERENCE_URL = os.getenv("CUSTOM_INFERENCE_URL", "")
|
| 39 |
-
CUSTOM_INFERENCE_KEY = os.getenv("CUSTOM_INFERENCE_KEY", "")
|
| 40 |
-
|
| 41 |
# Paths
|
| 42 |
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 43 |
TEMP_DIR = os.path.join(BASE_DIR, "temp")
|
|
|
|
| 5 |
# Load environment variables from .env.local
|
| 6 |
load_dotenv(os.path.join(os.path.dirname(os.path.dirname(__file__)), ".env.local"))
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# Model Configurations
|
| 9 |
SCRIPT_GENERATION_MODEL = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
|
| 10 |
|
| 11 |
+
# User API Keys (Bring Your Own Key - BYOK)
|
| 12 |
+
# Users provide these through the settings interface or environment variables
|
| 13 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 14 |
+
INFERENCE_API_URL = os.getenv("INFERENCE_API_URL", "")
|
| 15 |
+
INFERENCE_API_KEY = os.getenv("INFERENCE_API_KEY", "")
|
| 16 |
|
| 17 |
# TTS API Settings (ElevenLabs)
|
| 18 |
+
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# ElevenLabs Voice IDs (you can change these to different voices)
|
| 21 |
# Find more voices at: https://api.elevenlabs.io/v1/voices
|
| 22 |
ELEVENLABS_HOST_VOICE = "ErXwobaYiN019PkySvjV" # Antoni - male voice for Host
|
| 23 |
ELEVENLABS_GUEST_VOICE = "EXAVITQu4vr4xnSDxMaL" # Bella - female voice for Guest
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# Paths
|
| 26 |
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 27 |
TEMP_DIR = os.path.join(BASE_DIR, "temp")
|