batuhanozkose commited on
Commit
3ab234f
·
1 Parent(s): 472739a

update tts, llm engines

Browse files
agents/podcast_agent.py CHANGED
@@ -5,11 +5,6 @@ from processing.pdf_reader import extract_text_from_pdf
5
  from processing.url_fetcher import fetch_paper_from_url
6
  from synthesis.tts_engine import get_tts_engine
7
  from utils.config import (
8
- DEMO_INFERENCE_KEY,
9
- DEMO_INFERENCE_URL,
10
- DEMO_MODE,
11
- DEMO_MODEL,
12
- DEMO_TTS_KEY,
13
  MAX_CONTEXT_CHARS,
14
  )
15
  from utils.history import save_to_history
@@ -18,45 +13,49 @@ from utils.history import save_to_history
18
  class PodcastAgent:
19
  def __init__(
20
  self,
21
- provider_mode="demo",
22
  own_base_url=None,
23
  own_api_key=None,
24
  own_model=None,
25
  openai_key=None,
26
  openai_model=None,
27
- tts_provider="edge-tts",
28
  elevenlabs_key=None,
29
  host_voice=None,
30
  guest_voice=None,
31
  max_tokens=None,
 
32
  ):
33
- self.logs = []
34
-
35
- # If demo mode is enabled, override all settings with demo credentials
36
- if DEMO_MODE:
37
- self.provider_mode = "demo"
38
- self.own_base_url = DEMO_INFERENCE_URL
39
- self.own_api_key = DEMO_INFERENCE_KEY
40
- self.own_model = DEMO_MODEL
41
- self.openai_key = None
42
- self.openai_model = None
43
- self.tts_provider = "edge-tts" # Always use Edge-TTS in demo mode
44
- self.elevenlabs_key = None
45
- self.host_voice = host_voice
46
- self.guest_voice = guest_voice
47
- else:
48
- self.provider_mode = provider_mode # "own_inference" or "openai"
49
- self.own_base_url = own_base_url
50
- self.own_api_key = own_api_key
51
- self.own_model = own_model
52
- self.openai_key = openai_key
53
- self.openai_model = openai_model
54
- self.tts_provider = tts_provider
55
- self.elevenlabs_key = elevenlabs_key
56
- self.host_voice = host_voice
57
- self.guest_voice = guest_voice
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  self.max_tokens = max_tokens
 
60
 
61
  def log(self, message):
62
  timestamp = time.strftime("%H:%M:%S")
@@ -127,21 +126,15 @@ class PodcastAgent:
127
  openai_model=self.openai_model,
128
  max_tokens=self.max_tokens,
129
  )
130
- script = generator.generate_podcast_script(text)
131
  if not script:
132
  yield self.log("Error: Failed to generate script.")
133
  return None, self.logs
134
- yield self.log(f"Generated script with {len(script)} dialogue turns.")
135
 
136
  # Step 4: Synthesize Audio
137
  yield self.log("Thinking: The script looks good. Sending it to the TTS engine.")
138
- if self.tts_provider == "edge-tts":
139
- yield self.log("Using Edge-TTS (Microsoft, free)")
140
- elif self.tts_provider == "elevenlabs":
141
- if self.elevenlabs_key:
142
- yield self.log("Using custom ElevenLabs API key")
143
- else:
144
- yield self.log("Using demo ElevenLabs key")
145
  yield self.log("Tool Call: synthesize_podcast(...)")
146
  tts = get_tts_engine(
147
  tts_provider=self.tts_provider,
@@ -305,7 +298,7 @@ class PodcastAgent:
305
 
306
  # Add instruction for multi-paper script
307
  multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
308
- script = generator.generate_podcast_script(multi_paper_prompt)
309
 
310
  if not script:
311
  yield self.log("Error: Failed to generate script.")
@@ -319,13 +312,7 @@ class PodcastAgent:
319
  yield self.log(
320
  "\nThinking: The script looks good. Sending it to the TTS engine."
321
  )
322
- if self.tts_provider == "edge-tts":
323
- yield self.log("Using Edge-TTS (Microsoft, free)")
324
- elif self.tts_provider == "elevenlabs":
325
- if self.elevenlabs_key:
326
- yield self.log("Using custom ElevenLabs API key")
327
- else:
328
- yield self.log("Using demo ElevenLabs key")
329
  yield self.log("Tool Call: synthesize_podcast(...)")
330
  tts = get_tts_engine(
331
  tts_provider=self.tts_provider,
 
5
  from processing.url_fetcher import fetch_paper_from_url
6
  from synthesis.tts_engine import get_tts_engine
7
  from utils.config import (
 
 
 
 
 
8
  MAX_CONTEXT_CHARS,
9
  )
10
  from utils.history import save_to_history
 
13
  class PodcastAgent:
14
  def __init__(
15
  self,
16
+ provider_mode="own_inference",
17
  own_base_url=None,
18
  own_api_key=None,
19
  own_model=None,
20
  openai_key=None,
21
  openai_model=None,
22
+ tts_provider="elevenlabs",
23
  elevenlabs_key=None,
24
  host_voice=None,
25
  guest_voice=None,
26
  max_tokens=None,
27
+ target_dialogue_count=15,
28
  ):
29
+ """
30
+ Initialize PodcastAgent with user-provided settings (BYOK).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ Args:
33
+ provider_mode: "own_inference" or "openai"
34
+ own_base_url: Base URL for own inference server
35
+ own_api_key: API key for own inference server
36
+ own_model: Model name for own inference server
37
+ openai_key: OpenAI API key
38
+ openai_model: OpenAI model name
39
+ tts_provider: "elevenlabs" (ElevenLabs required)
40
+ elevenlabs_key: ElevenLabs API key (required)
41
+ host_voice: Voice ID for host
42
+ guest_voice: Voice ID for guest
43
+ max_tokens: Maximum tokens for generation
44
+ target_dialogue_count: Target number of dialogue exchanges (default: 15)
45
+ """
46
+ self.logs = []
47
+ self.provider_mode = provider_mode # "own_inference" or "openai"
48
+ self.own_base_url = own_base_url
49
+ self.own_api_key = own_api_key
50
+ self.own_model = own_model
51
+ self.openai_key = openai_key
52
+ self.openai_model = openai_model
53
+ self.tts_provider = tts_provider
54
+ self.elevenlabs_key = elevenlabs_key
55
+ self.host_voice = host_voice
56
+ self.guest_voice = guest_voice
57
  self.max_tokens = max_tokens
58
+ self.target_dialogue_count = target_dialogue_count
59
 
60
  def log(self, message):
61
  timestamp = time.strftime("%H:%M:%S")
 
126
  openai_model=self.openai_model,
127
  max_tokens=self.max_tokens,
128
  )
129
+ script = generator.generate_podcast_script(text, target_dialogue_count=self.target_dialogue_count)
130
  if not script:
131
  yield self.log("Error: Failed to generate script.")
132
  return None, self.logs
133
+ yield self.log(f"Generated script with {len(script)} dialogue turns (target: {self.target_dialogue_count}).")
134
 
135
  # Step 4: Synthesize Audio
136
  yield self.log("Thinking: The script looks good. Sending it to the TTS engine.")
137
+ yield self.log("Using ElevenLabs TTS")
 
 
 
 
 
 
138
  yield self.log("Tool Call: synthesize_podcast(...)")
139
  tts = get_tts_engine(
140
  tts_provider=self.tts_provider,
 
298
 
299
  # Add instruction for multi-paper script
300
  multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
301
+ script = generator.generate_podcast_script(multi_paper_prompt, target_dialogue_count=self.target_dialogue_count)
302
 
303
  if not script:
304
  yield self.log("Error: Failed to generate script.")
 
312
  yield self.log(
313
  "\nThinking: The script looks good. Sending it to the TTS engine."
314
  )
315
+ yield self.log("Using ElevenLabs TTS")
 
 
 
 
 
 
316
  yield self.log("Tool Call: synthesize_podcast(...)")
317
  tts = get_tts_engine(
318
  tts_provider=self.tts_provider,
app.py CHANGED
@@ -4,13 +4,8 @@ from datetime import datetime
4
  import gradio as gr
5
 
6
  from agents.podcast_agent import PodcastAgent
7
- from synthesis.tts_engine import EDGE_TTS_VOICES, ELEVENLABS_VOICES
8
  from utils.config import (
9
- DEMO_INFERENCE_KEY,
10
- DEMO_INFERENCE_URL,
11
- DEMO_MODE,
12
- DEMO_MODEL,
13
- DEMO_TTS_KEY,
14
  OUTPUT_DIR,
15
  SCRIPT_GENERATION_MODEL,
16
  )
@@ -20,19 +15,39 @@ from utils.history import get_history_items, load_history
20
  os.makedirs(OUTPUT_DIR, exist_ok=True)
21
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def validate_settings_for_generation(
24
  llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key
25
  ):
26
  """
27
- Validate user settings for podcast generation in non-demo mode.
28
 
29
  Returns:
30
  tuple: (is_valid, error_message)
31
  """
32
- # Skip validation if in demo mode
33
- if DEMO_MODE:
34
- return True, ""
35
-
36
  errors = []
37
 
38
  # Validate LLM settings
@@ -52,13 +67,11 @@ def validate_settings_for_generation(
52
  elif not openai_key.startswith("sk-"):
53
  errors.append("❌ **OpenAI**: API key must start with 'sk-'")
54
 
55
- # Validate TTS settings
56
- if tts_provider == "elevenlabs":
57
- if not elevenlabs_key:
58
- errors.append("❌ **ElevenLabs**: API key is required")
59
- elif not elevenlabs_key.startswith("sk_"):
60
- errors.append("❌ **ElevenLabs**: API key must start with 'sk_'")
61
- # Edge-TTS doesn't require any validation (it's free)
62
 
63
  if errors:
64
  return False, "\n".join(errors)
@@ -189,16 +202,17 @@ def run_agent(
189
  user_podcast_length,
190
  progress=gr.Progress(),
191
  ):
192
- """Run podcast generation with optional user settings"""
193
 
194
  # Determine provider mode
195
- if DEMO_MODE:
196
- provider_mode = "demo"
197
- elif user_llm_choice == "Own Inference":
198
  provider_mode = "own_inference"
199
  else: # OpenAI
200
  provider_mode = "openai"
201
 
 
 
 
202
  agent = PodcastAgent(
203
  provider_mode=provider_mode,
204
  own_base_url=user_own_base_url if user_own_base_url else None,
@@ -206,34 +220,23 @@ def run_agent(
206
  own_model=user_own_model if user_own_model else None,
207
  openai_key=user_openai_key if user_openai_key else None,
208
  openai_model=user_openai_model if user_openai_model else None,
209
- tts_provider=user_tts_provider if user_tts_provider else "edge-tts",
210
  elevenlabs_key=user_elevenlabs_key if user_elevenlabs_key else None,
211
  host_voice=user_host_voice if user_host_voice else None,
212
  guest_voice=user_guest_voice if user_guest_voice else None,
213
- max_tokens=user_podcast_length if user_podcast_length else 4096,
 
214
  )
215
  logs_history = ""
216
 
217
  # Log settings being used
218
  settings_log = "Settings: "
219
- if provider_mode == "demo":
220
- settings_log += "LLM: Demo Inference | TTS: Edge-TTS (Microsoft) | "
221
- elif provider_mode == "own_inference":
222
  settings_log += f"LLM: Own Inference | "
223
- if user_tts_provider == "edge-tts":
224
- settings_log += "TTS: Edge-TTS (Microsoft) | "
225
- elif user_elevenlabs_key:
226
- settings_log += "TTS: Custom ElevenLabs | "
227
- else:
228
- settings_log += "TTS: ElevenLabs (no key provided) | "
229
  else: # openai
230
  settings_log += f"LLM: OpenAI ({user_openai_model or 'gpt-4o-mini'}) | "
231
- if user_tts_provider == "edge-tts":
232
- settings_log += "TTS: Edge-TTS (Microsoft) | "
233
- elif user_elevenlabs_key:
234
- settings_log += "TTS: Custom ElevenLabs | "
235
- else:
236
- settings_log += "TTS: ElevenLabs (no key provided) | "
237
 
238
  settings_log += (
239
  f"Length: {user_podcast_length if user_podcast_length else 4096} tokens"
@@ -385,34 +388,23 @@ def main():
385
  )
386
 
387
  with gr.Blocks(title="PaperCast", theme=theme) as demo:
388
- # Session state for settings
389
- if DEMO_MODE:
390
- user_llm_choice = gr.State(value="demo")
391
- user_own_base_url = gr.State(value=DEMO_INFERENCE_URL)
392
- user_own_api_key = gr.State(value=DEMO_INFERENCE_KEY)
393
- user_own_model = gr.State(value=DEMO_MODEL)
394
- user_openai_key = gr.State(value="")
395
- user_openai_model = gr.State(value="")
396
- user_tts_provider = gr.State(value="edge-tts")
397
- user_elevenlabs_key = gr.State(value="")
398
- user_host_voice = gr.State(value="en-US-GuyNeural")
399
- user_guest_voice = gr.State(value="en-US-JennyNeural")
400
- else:
401
- user_llm_choice = gr.State(value="Own Inference")
402
- user_own_base_url = gr.State(value="")
403
- user_own_api_key = gr.State(value="")
404
- user_own_model = gr.State(value="")
405
- user_openai_key = gr.State(value="")
406
- user_openai_model = gr.State(value="")
407
- user_tts_provider = gr.State(value="edge-tts")
408
- user_elevenlabs_key = gr.State(value="")
409
- user_host_voice = gr.State(value="en-US-GuyNeural")
410
- user_guest_voice = gr.State(value="en-US-JennyNeural")
411
  user_podcast_length = gr.State(value=4096)
412
- settings_valid = gr.State(value=DEMO_MODE) # Settings are valid in demo mode
413
 
414
- # Initialize generate button state based on demo mode
415
- generate_btn_state = gr.State(value=DEMO_MODE)
416
 
417
  with gr.Row():
418
  gr.HTML("""
@@ -718,27 +710,22 @@ Configure your PaperCast experience with your own API keys and preferences.
718
  )
719
 
720
  with gr.Group():
721
- if DEMO_MODE:
722
- gr.Markdown(
723
- "**🔧 Demo Mode Active** - Using built-in inference and TTS services"
724
- )
725
- else:
726
- llm_choice = gr.Radio(
727
- choices=[
728
- "Own Inference",
729
- "OpenAI",
730
- ],
731
- value="Own Inference",
732
- label="Language Model Provider",
733
- info="Choose your language model provider for script generation",
734
- )
735
 
736
  # Own Inference inputs (base URL + API key)
737
  own_inference_base_url = gr.Textbox(
738
  label="Base URL",
739
  placeholder="https://your-server.com/v1",
740
  info="OpenAI-compatible endpoint",
741
- visible=not DEMO_MODE,
742
  )
743
 
744
  own_inference_api_key = gr.Textbox(
@@ -746,14 +733,14 @@ Configure your PaperCast experience with your own API keys and preferences.
746
  placeholder="Optional - leave empty if not required",
747
  type="password",
748
  info="API key for your inference server (if required)",
749
- visible=not DEMO_MODE,
750
  )
751
 
752
  own_inference_model = gr.Textbox(
753
  label="Model Name",
754
  placeholder="e.g., llama-3.1-8b, mistral-7b",
755
  info="Model name on your server",
756
- visible=not DEMO_MODE,
757
  )
758
 
759
  # OpenAI inputs
@@ -762,7 +749,7 @@ Configure your PaperCast experience with your own API keys and preferences.
762
  placeholder="sk-...",
763
  type="password",
764
  info="Required when using OpenAI",
765
- visible=False, # Hidden by default, shown only when OpenAI is selected
766
  )
767
 
768
  openai_model_input = gr.Textbox(
@@ -770,149 +757,92 @@ Configure your PaperCast experience with your own API keys and preferences.
770
  placeholder="gpt-4o-mini",
771
  value="gpt-4o-mini",
772
  info="Model name (e.g., gpt-4o-mini, gpt-4, gpt-3.5-turbo)",
773
- visible=False, # Hidden by default, shown only when OpenAI is selected
774
  )
775
 
776
  gr.Markdown("---")
777
 
778
  gr.Markdown("## 🔊 Text-to-Speech (TTS)")
779
- if DEMO_MODE:
780
- gr.Markdown(
781
- "**🔧 Demo Mode Active** - Using Edge-TTS (Microsoft, free)"
782
- )
783
- else:
784
- gr.Markdown(
785
- "Choose your TTS provider for audio generation"
786
- )
787
 
788
  with gr.Group():
789
- tts_provider_choice = gr.Radio(
790
- choices=[
791
- "Edge-TTS (Free, Microsoft)",
792
- "ElevenLabs (Paid, Better Quality)",
793
- ],
794
- value="Edge-TTS (Free, Microsoft)",
795
- label="TTS Provider",
796
- info="Edge-TTS is free and works without API key. ElevenLabs offers better voice quality.",
797
- visible=not DEMO_MODE,
798
- )
799
-
800
  elevenlabs_key_input = gr.Textbox(
801
  label="ElevenLabs API Key",
802
- placeholder="sk_... (required for ElevenLabs)",
803
  type="password",
804
- info="Get your key at: elevenlabs.io",
805
- visible=False, # Hidden by default since Edge-TTS is default
 
806
  )
807
 
808
  gr.Markdown("### 🎭 Voice Selection")
809
- if DEMO_MODE:
810
- gr.Markdown("*Choose voices for your podcast (Demo mode uses Edge-TTS)*")
811
-
812
- # Edge-TTS voice selections
813
- with gr.Group(visible=True if DEMO_MODE else not DEMO_MODE) as edge_voice_group:
814
- edge_host_voice = gr.Dropdown(
815
- choices=list(EDGE_TTS_VOICES.keys()),
816
- value="Guy (US Male - Casual)",
817
- label="Host Voice (Edge-TTS)",
818
  info="Select voice for the podcast host",
819
  )
820
- edge_guest_voice = gr.Dropdown(
821
- choices=list(EDGE_TTS_VOICES.keys()),
822
- value="Jenny (US Female - Friendly)",
823
- label="Guest Voice (Edge-TTS)",
824
  info="Select voice for the expert guest",
825
  )
826
 
827
- # ElevenLabs voice selections (hidden by default, hidden in demo mode)
828
- if not DEMO_MODE:
829
- with gr.Group(visible=False) as elevenlabs_voice_group:
830
- elevenlabs_host_voice = gr.Dropdown(
831
- choices=list(ELEVENLABS_VOICES.keys()),
832
- value="Antoni (Male - Well-rounded)",
833
- label="Host Voice (ElevenLabs)",
834
- info="Select voice for the podcast host",
835
- )
836
- elevenlabs_guest_voice = gr.Dropdown(
837
- choices=list(ELEVENLABS_VOICES.keys()),
838
- value="Bella (Female - Soft)",
839
- label="Guest Voice (ElevenLabs)",
840
- info="Select voice for the expert guest",
841
- )
842
- else:
843
- # Create dummy components for demo mode so we can reference them
844
- elevenlabs_voice_group = None
845
- elevenlabs_host_voice = gr.State(value="Antoni (Male - Well-rounded)")
846
- elevenlabs_guest_voice = gr.State(value="Bella (Female - Soft)")
847
-
848
- # Toggle visibility based on LLM choice (only when not in demo mode)
849
- if not DEMO_MODE:
850
-
851
- def toggle_llm_inputs(choice):
852
- if choice == "Own Inference":
853
- return {
854
- own_inference_base_url: gr.update(visible=True),
855
- own_inference_api_key: gr.update(visible=True),
856
- own_inference_model: gr.update(visible=True),
857
- openai_key_input: gr.update(visible=False),
858
- openai_model_input: gr.update(visible=False),
859
- }
860
- elif choice == "OpenAI":
861
- return {
862
- own_inference_base_url: gr.update(
863
- visible=False
864
- ),
865
- own_inference_api_key: gr.update(visible=False),
866
- own_inference_model: gr.update(visible=False),
867
- openai_key_input: gr.update(visible=True),
868
- openai_model_input: gr.update(visible=True),
869
- }
870
-
871
- llm_choice.change(
872
- fn=toggle_llm_inputs,
873
- inputs=[llm_choice],
874
- outputs=[
875
- own_inference_base_url,
876
- own_inference_api_key,
877
- own_inference_model,
878
- openai_key_input,
879
- openai_model_input,
880
- ],
881
- )
882
-
883
- # Toggle visibility based on TTS provider choice
884
- def toggle_tts_inputs(choice):
885
- if choice == "Edge-TTS (Free, Microsoft)":
886
- return {
887
- elevenlabs_key_input: gr.update(visible=False),
888
- edge_voice_group: gr.update(visible=True),
889
- elevenlabs_voice_group: gr.update(visible=False),
890
- }
891
- else: # ElevenLabs
892
- return {
893
- elevenlabs_key_input: gr.update(visible=True),
894
- edge_voice_group: gr.update(visible=False),
895
- elevenlabs_voice_group: gr.update(visible=True),
896
- }
897
-
898
- tts_provider_choice.change(
899
- fn=toggle_tts_inputs,
900
- inputs=[tts_provider_choice],
901
- outputs=[elevenlabs_key_input, edge_voice_group, elevenlabs_voice_group],
902
- )
903
 
904
  gr.Markdown("---")
905
 
906
  gr.Markdown("## 🎚️ Podcast Settings")
907
 
908
  with gr.Group():
909
- podcast_length = gr.Slider(
910
- minimum=1000,
911
- maximum=8000,
912
- value=4096,
913
- step=500,
914
- label="Podcast Length (Max Tokens)",
915
- info="Higher values = longer podcasts",
 
 
 
 
 
916
  )
917
 
918
  gr.Markdown("---")
@@ -929,38 +859,27 @@ Configure your PaperCast experience with your own API keys and preferences.
929
  own_model,
930
  openai_key,
931
  openai_model,
932
- tts_provider,
933
  elevenlabs_key,
934
- edge_host,
935
- edge_guest,
936
  elevenlabs_host,
937
  elevenlabs_guest,
938
  length,
939
  ):
940
  status = "✅ **Settings Saved!**\n\n"
941
 
942
- # Convert TTS provider choice to internal format
943
- if tts_provider == "Edge-TTS (Free, Microsoft)":
944
- tts_provider_internal = "edge-tts"
945
- else:
946
- tts_provider_internal = "elevenlabs"
947
-
948
- # Validate settings first (only in non-demo mode)
949
  is_valid, validation_message = (
950
  validate_settings_for_generation(
951
  llm_choice,
952
  own_base_url,
953
  own_api_key,
954
  openai_key,
955
- tts_provider_internal,
956
  elevenlabs_key,
957
  )
958
  )
959
 
960
  # LLM Settings
961
- if DEMO_MODE:
962
- status += "- LLM: Demo Inference ✓\n"
963
- elif llm_choice == "Own Inference":
964
  if own_base_url:
965
  status += f"- LLM: Own Inference ✓\n"
966
  status += f" - URL: {own_base_url[:50]}...\n"
@@ -974,114 +893,78 @@ Configure your PaperCast experience with your own API keys and preferences.
974
  status += "- ⚠️ LLM: OpenAI selected but no API key provided\n"
975
 
976
  # TTS Settings
977
- if DEMO_MODE:
978
- status += "- TTS: Edge-TTS (Microsoft, free) ✓\n"
979
  else:
980
- if tts_provider_internal == "edge-tts":
981
- status += "- TTS: Edge-TTS (Microsoft, free) ✓\n"
982
- elif elevenlabs_key:
983
- status += "- TTS: ElevenLabs (Custom key) ✓\n"
984
- else:
985
- status += "- ⚠️ TTS: ElevenLabs key required\n"
986
 
987
  # Add validation result
988
- if not DEMO_MODE:
989
- if is_valid:
990
- status += "\n✅ **All settings are valid!**\n"
991
- status += "🎉 Generate button is now enabled.\n"
992
- else:
993
- status += "\n⚠️ **Settings incomplete!**\n"
994
- status += "🚫 Generate button remains disabled.\n"
995
- status += f"\nRequired fixes:\n{validation_message}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
996
 
997
- status += f"\n- Podcast Length: {int(length)} tokens\n"
998
  status += (
999
  "\n*Settings will be used for next podcast generation.*"
1000
  )
1001
 
1002
- # Determine which voices to use based on TTS provider
1003
- if tts_provider_internal == "edge-tts":
1004
- host_voice = EDGE_TTS_VOICES.get(edge_host, "en-US-GuyNeural")
1005
- guest_voice = EDGE_TTS_VOICES.get(edge_guest, "en-US-JennyNeural")
1006
- else: # elevenlabs
1007
- host_voice = ELEVENLABS_VOICES.get(elevenlabs_host, "ErXwobaYiN019PkySvjV")
1008
- guest_voice = ELEVENLABS_VOICES.get(elevenlabs_guest, "EXAVITQu4vr4xnSDxMaL")
1009
 
 
1010
  return (
1011
  status,
1012
- llm_choice if not DEMO_MODE else "demo",
1013
- own_base_url if not DEMO_MODE else DEMO_INFERENCE_URL,
1014
- own_api_key if not DEMO_MODE else DEMO_INFERENCE_KEY,
1015
- own_model if not DEMO_MODE else DEMO_MODEL,
1016
  openai_key,
1017
  openai_model,
1018
- tts_provider_internal if not DEMO_MODE else "edge-tts",
1019
- elevenlabs_key if not DEMO_MODE else "",
1020
- host_voice if not DEMO_MODE else "en-US-GuyNeural",
1021
- guest_voice if not DEMO_MODE else "en-US-JennyNeural",
1022
- int(length),
1023
  is_valid,
1024
  )
1025
 
1026
- if DEMO_MODE:
1027
- # In demo mode, settings are pre-configured but voices can be customized
1028
- def save_demo_settings(edge_host, edge_guest, length):
1029
- host_voice = EDGE_TTS_VOICES.get(edge_host, "en-US-GuyNeural")
1030
- guest_voice = EDGE_TTS_VOICES.get(edge_guest, "en-US-JennyNeural")
1031
-
1032
- return (
1033
- f"✅ **Settings Saved!**\n\n- LLM: Demo Inference ✓\n- TTS: Edge-TTS (Microsoft, free) ✓\n- Host Voice: {edge_host}\n- Guest Voice: {edge_guest}\n\n*Demo mode is active with built-in services.*",
1034
- "demo",
1035
- DEMO_INFERENCE_URL,
1036
- DEMO_INFERENCE_KEY,
1037
- DEMO_MODEL,
1038
- "",
1039
- "",
1040
- "edge-tts",
1041
- "",
1042
- host_voice,
1043
- guest_voice,
1044
- int(length),
1045
- True, # settings_valid = True in demo mode
1046
- )
1047
-
1048
- save_settings_btn.click(
1049
- fn=save_demo_settings,
1050
- inputs=[edge_host_voice, edge_guest_voice, podcast_length],
1051
- outputs=[
1052
- settings_status,
1053
- user_llm_choice,
1054
- user_own_base_url,
1055
- user_own_api_key,
1056
- user_own_model,
1057
- user_openai_key,
1058
- user_openai_model,
1059
- user_tts_provider,
1060
- user_elevenlabs_key,
1061
- user_host_voice,
1062
- user_guest_voice,
1063
- user_podcast_length,
1064
- settings_valid,
1065
- ],
1066
- )
1067
- else:
1068
- save_settings_btn.click(
1069
- fn=save_settings,
1070
- inputs=[
1071
- llm_choice,
1072
- own_inference_base_url,
1073
- own_inference_api_key,
1074
- own_inference_model,
1075
- openai_key_input,
1076
- openai_model_input,
1077
- tts_provider_choice,
1078
- elevenlabs_key_input,
1079
- edge_host_voice,
1080
- edge_guest_voice,
1081
- elevenlabs_host_voice,
1082
- elevenlabs_guest_voice,
1083
- podcast_length,
1084
- ],
1085
  outputs=[
1086
  settings_status,
1087
  user_llm_choice,
@@ -1113,73 +996,149 @@ Configure your PaperCast experience with your own API keys and preferences.
1113
 
1114
  # About PaperCast
1115
 
1116
- **PaperCast** is an AI-powered application that transforms complex research papers into engaging, accessible audio podcasts.
1117
- Making scientific knowledge more accessible, one paper at a time.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1118
 
1119
  ---
1120
 
1121
  ## 🎯 How It Works
1122
 
1123
- Our intelligent agent orchestrates a multi-step pipeline to create your podcast:
1124
 
1125
- 1. **📥 Input** - Provide a paper URL (arXiv, medRxiv) or upload any PDF
1126
- 2. **📄 Extraction** - AI extracts and analyzes the paper content
1127
- 3. **🎬 Script Generation** - Creates natural dialogue between Host and Expert personas
1128
- 4. **🎤 Voice Synthesis** - Generates high-quality audio with distinct voices
1129
- 5. **✅ Delivery** - Your podcast is ready to listen and download
1130
 
1131
  ---
1132
 
1133
  ## 🌟 Key Features
1134
 
1135
- **Multiple Sources**: Support for arXiv, medRxiv, and direct PDF uploads
1136
- **Natural Dialogue**: Engaging conversation between Host and Expert characters
1137
- **High-Quality Audio**: Professional voice synthesis powered by ElevenLabs
1138
- **Smart Processing**: AI understands paper structure and creates contextual discussions
1139
- **History Tracking**: Keep track of all your generated podcasts
 
 
1140
 
1141
  ---
1142
 
1143
  ## 🔧 Technology Stack
1144
 
1145
  **LLM**: {SCRIPT_GENERATION_MODEL}
1146
- **TTS**: Edge-TTS (Microsoft, Free) / ElevenLabs API (Optional)
 
 
1147
  **Infrastructure**: ☁️ Remote Inference
1148
- **Framework**: Gradio 6
1149
- **PDF Processing**: PyMuPDF
1150
 
1151
  ---
1152
 
1153
  ## 🎓 Built For
1154
 
1155
  **MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)
 
1156
 
1157
- This project demonstrates autonomous agent capabilities through intelligent orchestration
1158
- of multiple AI tools to transform static research papers into dynamic audio content.
 
 
 
1159
 
1160
  ---
1161
 
1162
  ## 📝 About the Agent
1163
 
1164
- PaperCast uses an autonomous agent that:
1165
 
1166
- **Plans** conversation flow based on paper structure
1167
- **Reasons** about which concepts need simplification
1168
- **Executes** multi-step processing pipeline
1169
- **Adapts** dialogue based on paper complexity
 
1170
 
1171
  ---
1172
 
1173
  ## 💡 Use Cases
1174
 
1175
- 🎧 Listen to papers during commute or exercise
1176
- 📚 Quick overview of research before deep reading
1177
- 🌍 Make research accessible to broader audiences
1178
- 🔬 Stay updated with latest papers in your field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1179
 
1180
  ---
1181
 
1182
- Made with ❤️ using AI, Gradio, and ElevenLabs
1183
 
1184
  </div>
1185
  """)
 
4
  import gradio as gr
5
 
6
  from agents.podcast_agent import PodcastAgent
7
+ from synthesis.tts_engine import ELEVENLABS_VOICES
8
  from utils.config import (
 
 
 
 
 
9
  OUTPUT_DIR,
10
  SCRIPT_GENERATION_MODEL,
11
  )
 
15
  os.makedirs(OUTPUT_DIR, exist_ok=True)
16
 
17
 
18
+ # Podcast length presets: maps UI choice to (target_exchanges, max_tokens)
19
+ PODCAST_LENGTH_PRESETS = {
20
+ "⚡ Very Short (6-8 exchanges, ~2-3 min)": (7, 2000),
21
+ "📝 Short (10-12 exchanges, ~3-4 min)": (11, 3000),
22
+ "📄 Medium (14-16 exchanges, ~5-6 min)": (15, 4000),
23
+ "📚 Medium-Long (18-20 exchanges, ~7-8 min)": (19, 5000),
24
+ "📖 Long (22-25 exchanges, ~9-11 min)": (23, 6000),
25
+ "📕 Very Long (28-32 exchanges, ~12-15 min)": (30, 8000),
26
+ }
27
+
28
+
29
+ def get_podcast_length_params(length_choice):
30
+ """
31
+ Convert podcast length choice to parameters.
32
+
33
+ Returns:
34
+ tuple: (target_dialogue_count, max_tokens)
35
+ """
36
+ return PODCAST_LENGTH_PRESETS.get(
37
+ length_choice,
38
+ (15, 4000) # Default to Medium
39
+ )
40
+
41
+
42
  def validate_settings_for_generation(
43
  llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key
44
  ):
45
  """
46
+ Validate user settings for podcast generation (BYOK - Bring Your Own Key).
47
 
48
  Returns:
49
  tuple: (is_valid, error_message)
50
  """
 
 
 
 
51
  errors = []
52
 
53
  # Validate LLM settings
 
67
  elif not openai_key.startswith("sk-"):
68
  errors.append("❌ **OpenAI**: API key must start with 'sk-'")
69
 
70
+ # Validate TTS settings (ElevenLabs required)
71
+ if not elevenlabs_key:
72
+ errors.append("❌ **ElevenLabs TTS**: API key is required")
73
+ elif not elevenlabs_key.startswith("sk_"):
74
+ errors.append("❌ **ElevenLabs TTS**: API key must start with 'sk_'")
 
 
75
 
76
  if errors:
77
  return False, "\n".join(errors)
 
202
  user_podcast_length,
203
  progress=gr.Progress(),
204
  ):
205
+ """Run podcast generation with user settings (BYOK)"""
206
 
207
  # Determine provider mode
208
+ if user_llm_choice == "Own Inference":
 
 
209
  provider_mode = "own_inference"
210
  else: # OpenAI
211
  provider_mode = "openai"
212
 
213
+ # Parse podcast length settings
214
+ target_exchanges, max_tokens = get_podcast_length_params(user_podcast_length)
215
+
216
  agent = PodcastAgent(
217
  provider_mode=provider_mode,
218
  own_base_url=user_own_base_url if user_own_base_url else None,
 
220
  own_model=user_own_model if user_own_model else None,
221
  openai_key=user_openai_key if user_openai_key else None,
222
  openai_model=user_openai_model if user_openai_model else None,
223
+ tts_provider=user_tts_provider if user_tts_provider else "elevenlabs",
224
  elevenlabs_key=user_elevenlabs_key if user_elevenlabs_key else None,
225
  host_voice=user_host_voice if user_host_voice else None,
226
  guest_voice=user_guest_voice if user_guest_voice else None,
227
+ max_tokens=max_tokens,
228
+ target_dialogue_count=target_exchanges,
229
  )
230
  logs_history = ""
231
 
232
  # Log settings being used
233
  settings_log = "Settings: "
234
+ if provider_mode == "own_inference":
 
 
235
  settings_log += f"LLM: Own Inference | "
236
+ settings_log += "TTS: ElevenLabs | "
 
 
 
 
 
237
  else: # openai
238
  settings_log += f"LLM: OpenAI ({user_openai_model or 'gpt-4o-mini'}) | "
239
+ settings_log += "TTS: ElevenLabs | "
 
 
 
 
 
240
 
241
  settings_log += (
242
  f"Length: {user_podcast_length if user_podcast_length else 4096} tokens"
 
388
  )
389
 
390
  with gr.Blocks(title="PaperCast", theme=theme) as demo:
391
+ # Session state for settings (BYOK - Bring Your Own Key)
392
+ # NOTE: Settings are session-only for security (multi-user HF Spaces)
393
+ user_llm_choice = gr.State(value="Own Inference")
394
+ user_own_base_url = gr.State(value="")
395
+ user_own_api_key = gr.State(value="")
396
+ user_own_model = gr.State(value="")
397
+ user_openai_key = gr.State(value="")
398
+ user_openai_model = gr.State(value="")
399
+ user_tts_provider = gr.State(value="elevenlabs")
400
+ user_elevenlabs_key = gr.State(value="")
401
+ user_host_voice = gr.State(value="ErXwobaYiN019PkySvjV") # Antoni
402
+ user_guest_voice = gr.State(value="EXAVITQu4vr4xnSDxMaL") # Bella
 
 
 
 
 
 
 
 
 
 
 
403
  user_podcast_length = gr.State(value=4096)
404
+ settings_valid = gr.State(value=False) # Users must configure settings
405
 
406
+ # Initialize generate button state
407
+ generate_btn_state = gr.State(value=False)
408
 
409
  with gr.Row():
410
  gr.HTML("""
 
710
  )
711
 
712
  with gr.Group():
713
+ llm_choice = gr.Radio(
714
+ choices=[
715
+ "Own Inference",
716
+ "OpenAI",
717
+ ],
718
+ value="Own Inference",
719
+ label="Language Model Provider",
720
+ info="Choose your language model provider for script generation",
721
+ )
 
 
 
 
 
722
 
723
  # Own Inference inputs (base URL + API key)
724
  own_inference_base_url = gr.Textbox(
725
  label="Base URL",
726
  placeholder="https://your-server.com/v1",
727
  info="OpenAI-compatible endpoint",
728
+ visible=True,
729
  )
730
 
731
  own_inference_api_key = gr.Textbox(
 
733
  placeholder="Optional - leave empty if not required",
734
  type="password",
735
  info="API key for your inference server (if required)",
736
+ visible=True,
737
  )
738
 
739
  own_inference_model = gr.Textbox(
740
  label="Model Name",
741
  placeholder="e.g., llama-3.1-8b, mistral-7b",
742
  info="Model name on your server",
743
+ visible=True,
744
  )
745
 
746
  # OpenAI inputs
 
749
  placeholder="sk-...",
750
  type="password",
751
  info="Required when using OpenAI",
752
+ visible=False, # Hidden by default
753
  )
754
 
755
  openai_model_input = gr.Textbox(
 
757
  placeholder="gpt-4o-mini",
758
  value="gpt-4o-mini",
759
  info="Model name (e.g., gpt-4o-mini, gpt-4, gpt-3.5-turbo)",
760
+ visible=False, # Hidden by default
761
  )
762
 
763
  gr.Markdown("---")
764
 
765
  gr.Markdown("## 🔊 Text-to-Speech (TTS)")
766
+ gr.Markdown(
767
+ "Powered by ElevenLabs - Premium AI voice synthesis"
768
+ )
 
 
 
 
 
769
 
770
  with gr.Group():
 
 
 
 
 
 
 
 
 
 
 
771
  elevenlabs_key_input = gr.Textbox(
772
  label="ElevenLabs API Key",
773
+ placeholder="sk_...",
774
  type="password",
775
+ info="Get your key at: elevenlabs.io (Required)",
776
+ value="",
777
+ visible=True,
778
  )
779
 
780
  gr.Markdown("### 🎭 Voice Selection")
781
+
782
+ with gr.Group(visible=True) as elevenlabs_voice_group:
783
+ elevenlabs_host_voice = gr.Dropdown(
784
+ choices=list(ELEVENLABS_VOICES.keys()),
785
+ value="Antoni (Male - Well-rounded)",
786
+ label="Host Voice",
 
 
 
787
  info="Select voice for the podcast host",
788
  )
789
+ elevenlabs_guest_voice = gr.Dropdown(
790
+ choices=list(ELEVENLABS_VOICES.keys()),
791
+ value="Bella (Female - Soft)",
792
+ label="Guest Voice",
793
  info="Select voice for the expert guest",
794
  )
795
 
796
# Toggle visibility based on LLM choice
def toggle_llm_inputs(choice):
    """Show credential fields for the selected LLM provider.

    Args:
        choice: Radio value, "Own Inference" or "OpenAI".

    Returns:
        dict: Maps each provider-specific component to a ``gr.update``
        toggling its visibility. Own-inference fields are shown only for
        "Own Inference"; OpenAI fields are shown for any other value, so
        the handler always returns a valid mapping (the previous
        if/elif version implicitly returned None for an unexpected
        choice, which raises inside Gradio's event loop).
    """
    show_own = choice == "Own Inference"
    return {
        own_inference_base_url: gr.update(visible=show_own),
        own_inference_api_key: gr.update(visible=show_own),
        own_inference_model: gr.update(visible=show_own),
        openai_key_input: gr.update(visible=not show_own),
        openai_model_input: gr.update(visible=not show_own),
    }

# Re-render the five credential inputs whenever the provider radio changes.
llm_choice.change(
    fn=toggle_llm_inputs,
    inputs=[llm_choice],
    outputs=[
        own_inference_base_url,
        own_inference_api_key,
        own_inference_model,
        openai_key_input,
        openai_model_input,
    ],
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
828
 
829
  gr.Markdown("---")
830
 
831
  gr.Markdown("## 🎚️ Podcast Settings")
832
 
833
  with gr.Group():
834
+ podcast_length = gr.Radio(
835
+ choices=[
836
+ "⚡ Very Short (6-8 exchanges, ~2-3 min)",
837
+ "📝 Short (10-12 exchanges, ~3-4 min)",
838
+ "📄 Medium (14-16 exchanges, ~5-6 min)",
839
+ "📚 Medium-Long (18-20 exchanges, ~7-8 min)",
840
+ "📖 Long (22-25 exchanges, ~9-11 min)",
841
+ "📕 Very Long (28-32 exchanges, ~12-15 min)",
842
+ ],
843
+ value="📄 Medium (14-16 exchanges, ~5-6 min)",
844
+ label="Podcast Length",
845
+ info="Select desired podcast duration based on dialogue exchanges",
846
  )
847
 
848
  gr.Markdown("---")
 
859
  own_model,
860
  openai_key,
861
  openai_model,
 
862
  elevenlabs_key,
 
 
863
  elevenlabs_host,
864
  elevenlabs_guest,
865
  length,
866
  ):
867
  status = "✅ **Settings Saved!**\n\n"
868
 
869
+ # Validate settings
 
 
 
 
 
 
870
  is_valid, validation_message = (
871
  validate_settings_for_generation(
872
  llm_choice,
873
  own_base_url,
874
  own_api_key,
875
  openai_key,
876
+ "elevenlabs", # Always ElevenLabs
877
  elevenlabs_key,
878
  )
879
  )
880
 
881
  # LLM Settings
882
+ if llm_choice == "Own Inference":
 
 
883
  if own_base_url:
884
  status += f"- LLM: Own Inference ✓\n"
885
  status += f" - URL: {own_base_url[:50]}...\n"
 
893
  status += "- ⚠️ LLM: OpenAI selected but no API key provided\n"
894
 
895
  # TTS Settings
896
+ if elevenlabs_key:
897
+ status += "- TTS: ElevenLabs ✓\n"
898
  else:
899
+ status += "- ⚠️ TTS: ElevenLabs API key required\n"
 
 
 
 
 
900
 
901
  # Add validation result
902
+ if is_valid:
903
+ status += "\n✅ **All settings are valid!**\n"
904
+ status += "🎉 Generate button is now enabled.\n"
905
+ else:
906
+ status += "\n⚠️ **Settings incomplete!**\n"
907
+ status += "🚫 Generate button remains disabled.\n"
908
+ status += f"\nRequired fixes:\n{validation_message}"
909
+
910
+ # Parse podcast length
911
+ target_exchanges, max_tokens = get_podcast_length_params(length)
912
+ status += f"\n- Podcast Length: {length}\n"
913
+ status += f" - Target: {target_exchanges} dialogue exchanges\n"
914
+ status += f" - Max tokens: {max_tokens}\n"
915
+
916
+ # Add reasoning model info if using OpenAI reasoning models
917
+ if llm_choice == "OpenAI" and openai_model:
918
+ model_lower = openai_model.lower()
919
+ # Check if it's a reasoning model
920
+ is_reasoning = any(
921
+ keyword in model_lower
922
+ for keyword in ["gpt-5", "o1", "o3", "o4"]
923
+ ) and "chat" not in model_lower
924
+
925
+ if is_reasoning:
926
+ total_tokens = max_tokens * 2
927
+ status += f" - ⚡ Reasoning model: {max_tokens} × 2 = {total_tokens} max tokens\n"
928
 
 
929
  status += (
930
  "\n*Settings will be used for next podcast generation.*"
931
  )
932
 
933
+ # Get ElevenLabs voices
934
+ host_voice = ELEVENLABS_VOICES.get(elevenlabs_host, "ErXwobaYiN019PkySvjV")
935
+ guest_voice = ELEVENLABS_VOICES.get(elevenlabs_guest, "EXAVITQu4vr4xnSDxMaL")
 
 
 
 
936
 
937
+ # Settings are stored in gr.State() for session-only (no disk persistence for security)
938
  return (
939
  status,
940
+ llm_choice,
941
+ own_base_url,
942
+ own_api_key,
943
+ own_model,
944
  openai_key,
945
  openai_model,
946
+ "elevenlabs", # Always ElevenLabs
947
+ elevenlabs_key,
948
+ host_voice,
949
+ guest_voice,
950
+ length, # Now stores the full choice string
951
  is_valid,
952
  )
953
 
954
+ save_settings_btn.click(
955
+ fn=save_settings,
956
+ inputs=[
957
+ llm_choice,
958
+ own_inference_base_url,
959
+ own_inference_api_key,
960
+ own_inference_model,
961
+ openai_key_input,
962
+ openai_model_input,
963
+ elevenlabs_key_input,
964
+ elevenlabs_host_voice,
965
+ elevenlabs_guest_voice,
966
+ podcast_length,
967
+ ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
968
  outputs=[
969
  settings_status,
970
  user_llm_choice,
 
996
 
997
  # About PaperCast
998
 
999
+ **The world's first interactive, multi-modal, AI-powered academic podcast studio.**
1000
+
1001
+ Transform any research paper into engaging audio conversations with our proprietary frameworks powered by MCP tools, Gradio 6, and ElevenLabs.
1002
+
1003
+ ---
1004
+
1005
+ ## 🚀 Revolutionary Frameworks
1006
+
1007
+ We built 4 original frameworks that redefine how people consume research:
1008
+
1009
+ ### **PPF** — Podcast Persona Framework
1010
+ Choose from 5 unique conversation styles:
1011
+ - 🤝 **Friendly Explainer** - Casual discussion between friends (default)
1012
+ - ⚔️ **Academic Debate** - Constructive challenges and defenses
1013
+ - 🔥 **Savage Roast** - Brutal critique meets stubborn defense
1014
+ - 🎓 **Pedagogical** - Professor teaching curious student
1015
+ - 🌐 **Interdisciplinary Clash** - Domain expert vs. complete outsider
1016
+
1017
+ ### **PAD** — Paper Auto-Discovery
1018
+ - 🔍 Natural language search: "diffusion survey 2025" or "Grok reasoning"
1019
+ - 📚 Semantic Scholar + arXiv API integration
1020
+ - 🎯 Zero-friction paper discovery
1021
+
1022
+ ### **PVF** — Paper Visual Framework *(Coming Soon)*
1023
+ - 📄 Synchronized PDF viewer with audio playback
1024
+ - 🎯 Auto-scroll to figures/tables when mentioned
1025
+ - ⏱️ Clickable timestamps in transcript
1026
+
1027
+ ### **CPM** — Counterfactual Paper Mode *(Coming Soon)*
1028
+ - 🤔 "What if this paper was written by Yann LeCun?"
1029
+ - 🕰️ "What if GPT-4 never existed?"
1030
+ - 🌀 Alternate reality interpretations
1031
 
1032
  ---
1033
 
1034
  ## 🎯 How It Works
1035
 
1036
+ Our intelligent agent orchestrates a multi-step pipeline:
1037
 
1038
+ 1. **📥 Input** - URL, PDF upload, or free-text search
1039
+ 2. **📄 Extraction** - Marker-pdf MCP extracts clean markdown with LaTeX
1040
+ 3. **🎬 Script Generation** - Claude creates persona-aware dialogue
1041
+ 4. **🎤 Voice Synthesis** - ElevenLabs generates premium audio
1042
+ 5. **✅ Delivery** - Listen, download, share
1043
 
1044
  ---
1045
 
1046
  ## 🌟 Key Features
1047
 
1048
+ 🎭 **5 Persona Modes** - From friendly to savage
1049
+ 🔍 **Smart Paper Search** - Semantic Scholar + arXiv
1050
+ 🎙️ **Premium Audio** - ElevenLabs TTS (required)
1051
+ 📝 **Multi-format Export** - TXT, SRT, VTT, DOCX *(coming)*
1052
+ 🧠 **Agent Intelligence** - MCP-powered autonomous reasoning
1053
+ 📚 **History Tracking** - All podcasts saved locally
1054
+ ⚡ **Multi-paper Processing** - Batch generation support
1055
 
1056
  ---
1057
 
1058
  ## 🔧 Technology Stack
1059
 
1060
  **LLM**: {SCRIPT_GENERATION_MODEL}
1061
+ **TTS**: ElevenLabs (Premium AI Voice Synthesis)
1062
+ **PDF Processing**: Marker-pdf MCP Server
1063
+ **Search**: Semantic Scholar Graph API + arXiv API
1064
  **Infrastructure**: ☁️ Remote Inference
1065
+ **Framework**: Gradio 6 with MCP Integration
1066
+ **Agent**: Claude with Model Context Protocol
1067
 
1068
  ---
1069
 
1070
  ## 🎓 Built For
1071
 
1072
  **MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)
1073
+ *Tag: `mcp-in-action-track-consumer`*
1074
 
1075
+ This project demonstrates:
1076
+ - 🤖 Autonomous agent planning and reasoning
1077
+ - 🔧 MCP tools as cognitive extensions
1078
+ - 🎨 Innovative UI/UX with Gradio 6
1079
+ - 🚀 Real-world impact on research accessibility
1080
 
1081
  ---
1082
 
1083
  ## 📝 About the Agent
1084
 
1085
+ PaperCast's autonomous agent:
1086
 
1087
+ - **Plans** - Analyzes paper structure and selects optimal conversation strategy
1088
+ - **Reasons** - Determines which concepts need simplification based on persona
1089
+ - **Executes** - Orchestrates MCP tools (Marker, Semantic Scholar, arXiv)
1090
+ - **Adapts** - Adjusts dialogue complexity and style per persona mode
1091
+ - **Discovers** - Intelligently searches and retrieves relevant papers
1092
 
1093
  ---
1094
 
1095
  ## 💡 Use Cases
1096
 
1097
+ 🎧 **Commute Learning** - Listen during travel or exercise
1098
+ 📚 **Quick Previews** - Overview before deep reading
1099
+ 🌍 **Accessibility** - Make research understandable for everyone
1100
+ 🔬 **Stay Current** - Keep up with latest papers effortlessly
1101
+ 🎭 **Entertainment** - Savage Roast mode for fun paper critiques
1102
+ 🤔 **What-If Scenarios** - Explore counterfactual interpretations
1103
+
1104
+ ---
1105
+
1106
+ ## 🏆 What Makes Us Different
1107
+
1108
+ **Not just another summarizer** - We invented the Podcast Persona Framework (PPF)
1109
+ **Visual sync** - Paper Visual Framework (PVF) connects audio to figures
1110
+ **Smart discovery** - Paper Auto-Discovery (PAD) finds papers via natural language
1111
+ **Counterfactuals** - Counterfactual Paper Mode (CPM) explores alternate realities
1112
+ **MCP Native** - Built from ground up with Model Context Protocol
1113
+
1114
+ ---
1115
+
1116
+ ## 🙏 Special Thanks
1117
+
1118
+ This project was made possible by the incredible support from:
1119
+
1120
+ <div style="display: flex; justify-content: center; align-items: center; gap: 80px; margin: 50px 0; flex-wrap: wrap;">
1121
+ <div style="text-align: center;">
1122
+ <a href="https://modal.com" target="_blank">
1123
+ <img src="https://images.prismic.io/contrary-research/aDnorSdWJ-7kSv6V_ModalLabs_Cover.png?auto=format,compress" alt="Modal" style="height: 140px; width: auto; display: block; margin: 0 auto;">
1124
+ </a>
1125
+ </div>
1126
+ <div style="text-align: center;">
1127
+ <a href="https://elevenlabs.io" target="_blank">
1128
+ <img src="https://eleven-public-cdn.elevenlabs.io/payloadcms/9trrmnj2sj8-logo-logo.svg" alt="ElevenLabs" style="height: 100px; width: auto; display: block; margin: 0 auto;">
1129
+ </a>
1130
+ </div>
1131
+ </div>
1132
+
1133
+ **Why we chose these partners:**
1134
+
1135
+ 🚀 **Modal** - Serverless AI infrastructure that gives us instant access to powerful GPUs (A100, H100) with sub-second cold starts. Their platform handles automatic scaling, letting us process papers efficiently without managing infrastructure. Perfect for variable workloads and rapid iteration.
1136
+
1137
+ 🎙️ **ElevenLabs** - We use their **Turbo v2.5** model for studio-quality voice synthesis. This model delivers incredibly natural, emotionally expressive voices with low latency (~250-300ms) and 50% lower cost. The voice quality makes our podcasts truly engaging and professional.
1138
 
1139
  ---
1140
 
1141
+ Made with ❤️ using Anthropic, OpenAI, Modal, ElevenLabs, Gradio, and MCP
1142
 
1143
  </div>
1144
  """)
generation/script_generator.py CHANGED
@@ -1,14 +1,9 @@
1
- import base64
2
  import json
3
 
4
  import httpx
5
  from openai import OpenAI
6
 
7
  from utils.config import (
8
- DEMO_INFERENCE_KEY,
9
- DEMO_INFERENCE_URL,
10
- DEMO_MODE,
11
- DEMO_MODEL,
12
  MAX_TOKENS,
13
  SCRIPT_GENERATION_MODEL,
14
  TEMPERATURE,
@@ -18,7 +13,7 @@ from utils.config import (
18
  class ScriptGenerator:
19
  def __init__(
20
  self,
21
- provider_mode="demo",
22
  own_base_url=None,
23
  own_api_key=None,
24
  own_model=None,
@@ -30,7 +25,7 @@ class ScriptGenerator:
30
  Initialize ScriptGenerator with flexible provider support.
31
 
32
  Args:
33
- provider_mode: "demo", "own_inference", or "openai"
34
  own_base_url: Base URL for own inference server
35
  own_api_key: API key for own inference server
36
  own_model: Model name for own inference server
@@ -41,20 +36,7 @@ class ScriptGenerator:
41
  self.provider_mode = provider_mode
42
  self.max_tokens = max_tokens or MAX_TOKENS
43
 
44
- if provider_mode == "demo":
45
- # Demo mode - use hardcoded credentials
46
- print(f"Using Demo Inference: {DEMO_INFERENCE_URL}")
47
- username, password = DEMO_INFERENCE_KEY.split(":", 1)
48
- http_client = httpx.Client(auth=(username, password))
49
- self.client = OpenAI(
50
- base_url=DEMO_INFERENCE_URL,
51
- api_key="dummy",
52
- http_client=http_client,
53
- )
54
- self.model_name = DEMO_MODEL
55
- print("✓ Demo inference client initialized")
56
-
57
- elif provider_mode == "own_inference":
58
  # Own inference server
59
  print(f"Connecting to own inference API: {own_base_url}")
60
 
@@ -94,18 +76,19 @@ class ScriptGenerator:
94
  else:
95
  raise ValueError(f"Invalid provider_mode: {provider_mode}")
96
 
97
- def generate_podcast_script(self, paper_text: str) -> list:
98
  """
99
  Generates a podcast script from the given paper text.
100
 
101
  Args:
102
  paper_text (str): The text content of the research paper.
 
103
 
104
  Returns:
105
  list: A list of dictionaries representing the dialogue.
106
  """
107
 
108
- system_prompt = """You are an expert podcast producer. Your goal is to convert technical research papers into engaging, accessible podcast dialogues between two hosts:
109
  - Host (Alex): Enthusiastic, asks clarifying questions, guides the conversation.
110
  - Guest (Jamie): Expert researcher, explains concepts simply but accurately.
111
 
@@ -113,6 +96,8 @@ CRITICAL RULES:
113
  1. The Host MUST ALWAYS start with "Welcome to PaperCast!" - This is the show's branding and must never be skipped.
114
  2. NEVER read URLs, links, or web addresses out loud in the dialogue. Skip them completely. They sound awkward in audio format.
115
  3. NEVER mention arxiv IDs, DOIs, or reference numbers. Focus on the content, not the metadata.
 
 
116
 
117
  Output the script in a valid JSON format. The JSON should be a list of objects, where each object has:
118
  - "speaker": "Host" or "Guest"
@@ -121,14 +106,15 @@ Output the script in a valid JSON format. The JSON should be a list of objects,
121
 
122
  Example:
123
  [
124
- {"speaker": "Host", "text": "Welcome to PaperCast! Today we're diving into something really cool.", "emotion": "excited"},
125
- {"speaker": "Guest", "text": "That's right, Alex. We're looking at a new way to handle large language models.", "emotion": "happy"}
126
  ]
127
 
128
  Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly but effectively.
 
129
  """
130
 
131
- user_prompt = f"Here is the research paper text. Generate a podcast script summarizing the key findings, methodology, and implications.\n\n{paper_text[:10000]}..."
132
 
133
  messages = [
134
  {"role": "system", "content": system_prompt},
@@ -139,15 +125,81 @@ Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly
139
  f"Generating script with {self.provider_mode} (model: {self.model_name})..."
140
  )
141
 
142
- # Call LLM
143
- response = self.client.chat.completions.create(
144
- model=self.model_name,
145
- messages=messages,
146
- max_tokens=self.max_tokens,
147
- temperature=TEMPERATURE,
148
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  generated_text = response.choices[0].message.content
150
 
 
 
 
 
 
 
 
 
151
  # Extract JSON from the response
152
  try:
153
  # Find the first '[' and last ']'
@@ -156,13 +208,28 @@ Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly
156
  if start_index != -1 and end_index != -1:
157
  json_str = generated_text[start_index:end_index]
158
  script = json.loads(json_str)
 
159
  return script
160
  else:
161
- print("No JSON found in output.")
 
 
 
162
  return []
163
  except json.JSONDecodeError as e:
164
- print(f"Error parsing JSON: {e}")
165
- print(f"Raw output: {generated_text}")
 
 
 
 
 
 
 
 
 
 
 
166
  return []
167
 
168
 
@@ -171,7 +238,7 @@ _generator_instance = None
171
 
172
 
173
  def get_generator(
174
- provider_mode="demo",
175
  own_base_url=None,
176
  own_api_key=None,
177
  own_model=None,
@@ -183,7 +250,7 @@ def get_generator(
183
  Get a script generator instance with flexible provider support.
184
 
185
  Args:
186
- provider_mode: "demo", "own_inference", or "openai"
187
  own_base_url: Base URL for own inference server
188
  own_api_key: API key for own inference server
189
  own_model: Model name for own inference server
@@ -196,41 +263,26 @@ def get_generator(
196
  """
197
  global _generator_instance
198
 
199
- # Always create new instance for OpenAI or own_inference with custom settings
200
- # Reuse demo instance if same config
201
  if provider_mode == "openai":
202
  if not openai_key:
203
- print(
204
- "Warning: OpenAI selected but no API key provided. Falling back to demo mode."
205
- )
206
- provider_mode = "demo"
207
- else:
208
- return ScriptGenerator(
209
- provider_mode="openai",
210
- openai_key=openai_key,
211
- openai_model=openai_model,
212
- max_tokens=max_tokens or MAX_TOKENS,
213
- )
214
 
215
  if provider_mode == "own_inference":
216
  if not own_base_url:
217
- print(
218
- "Warning: Own Inference selected but no base URL provided. Falling back to demo mode."
219
- )
220
- provider_mode = "demo"
221
- else:
222
- return ScriptGenerator(
223
- provider_mode="own_inference",
224
- own_base_url=own_base_url,
225
- own_api_key=own_api_key,
226
- own_model=own_model,
227
- max_tokens=max_tokens or MAX_TOKENS,
228
- )
229
-
230
- # Demo mode - reuse global instance
231
- if _generator_instance is None or provider_mode == "demo":
232
- _generator_instance = ScriptGenerator(
233
- provider_mode="demo",
234
  max_tokens=max_tokens or MAX_TOKENS,
235
  )
236
- return _generator_instance
 
 
 
1
  import json
2
 
3
  import httpx
4
  from openai import OpenAI
5
 
6
  from utils.config import (
 
 
 
 
7
  MAX_TOKENS,
8
  SCRIPT_GENERATION_MODEL,
9
  TEMPERATURE,
 
13
  class ScriptGenerator:
14
  def __init__(
15
  self,
16
+ provider_mode="own_inference",
17
  own_base_url=None,
18
  own_api_key=None,
19
  own_model=None,
 
25
  Initialize ScriptGenerator with flexible provider support.
26
 
27
  Args:
28
+ provider_mode: "own_inference" or "openai"
29
  own_base_url: Base URL for own inference server
30
  own_api_key: API key for own inference server
31
  own_model: Model name for own inference server
 
36
  self.provider_mode = provider_mode
37
  self.max_tokens = max_tokens or MAX_TOKENS
38
 
39
+ if provider_mode == "own_inference":
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # Own inference server
41
  print(f"Connecting to own inference API: {own_base_url}")
42
 
 
76
  else:
77
  raise ValueError(f"Invalid provider_mode: {provider_mode}")
78
 
79
+ def generate_podcast_script(self, paper_text: str, target_dialogue_count: int = 15) -> list:
80
  """
81
  Generates a podcast script from the given paper text.
82
 
83
  Args:
84
  paper_text (str): The text content of the research paper.
85
+ target_dialogue_count (int): Target number of dialogue exchanges (default: 15)
86
 
87
  Returns:
88
  list: A list of dictionaries representing the dialogue.
89
  """
90
 
91
+ system_prompt = f"""You are an expert podcast producer. Your goal is to convert technical research papers into engaging, accessible podcast dialogues between two hosts:
92
  - Host (Alex): Enthusiastic, asks clarifying questions, guides the conversation.
93
  - Guest (Jamie): Expert researcher, explains concepts simply but accurately.
94
 
 
96
  1. The Host MUST ALWAYS start with "Welcome to PaperCast!" - This is the show's branding and must never be skipped.
97
  2. NEVER read URLs, links, or web addresses out loud in the dialogue. Skip them completely. They sound awkward in audio format.
98
  3. NEVER mention arxiv IDs, DOIs, or reference numbers. Focus on the content, not the metadata.
99
+ 4. Generate EXACTLY {target_dialogue_count} dialogue exchanges (back-and-forth between Host and Guest). Do not exceed this count.
100
+ 5. Each exchange should be substantive but concise. Keep individual dialogue turns focused and conversational.
101
 
102
  Output the script in a valid JSON format. The JSON should be a list of objects, where each object has:
103
  - "speaker": "Host" or "Guest"
 
106
 
107
  Example:
108
  [
109
+ {{"speaker": "Host", "text": "Welcome to PaperCast! Today we're diving into something really cool.", "emotion": "excited"}},
110
+ {{"speaker": "Guest", "text": "That's right, Alex. We're looking at a new way to handle large language models.", "emotion": "happy"}}
111
  ]
112
 
113
  Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly but effectively.
114
+ IMPORTANT: Generate exactly {target_dialogue_count} dialogue items total. No more, no less.
115
  """
116
 
117
+ user_prompt = f"Here is the research paper text. Generate a podcast script with EXACTLY {target_dialogue_count} dialogue exchanges, summarizing the key findings, methodology, and implications.\n\n{paper_text[:10000]}..."
118
 
119
  messages = [
120
  {"role": "system", "content": system_prompt},
 
125
  f"Generating script with {self.provider_mode} (model: {self.model_name})..."
126
  )
127
 
128
+ # Call LLM with appropriate parameters
129
+ # OpenAI's newer models use max_completion_tokens instead of max_tokens
130
+ # All OpenAI models support JSON mode (response_format)
131
+ if self.provider_mode == "openai":
132
+ # Check if this is a reasoning model (o1, o3, gpt-5 series except gpt-5-chat)
133
+ # Reasoning models don't support temperature parameter
134
+ is_reasoning_model = any(
135
+ keyword in self.model_name.lower()
136
+ for keyword in ["o1", "o3", "o4", "gpt-5"]
137
+ ) and "chat" not in self.model_name.lower()
138
+
139
+ # Common parameters for all OpenAI models
140
+ common_params = {
141
+ "model": self.model_name,
142
+ "messages": messages,
143
+ "response_format": {"type": "json_object"}, # JSON mode for all OpenAI models
144
+ }
145
+
146
+ if is_reasoning_model:
147
+ # Reasoning models: no temperature parameter
148
+ # Determine appropriate reasoning_effort based on model
149
+ model_lower = self.model_name.lower()
150
+
151
+ # GPT-5 series supports "minimal" (fastest)
152
+ # O-series only supports "low", "medium", "high"
153
+ # 2x multiplier: user's desired tokens + reasoning headroom
154
+ if "gpt-5" in model_lower:
155
+ # GPT-5, GPT-5-mini, GPT-5-nano, GPT-5.1 all support "minimal"
156
+ reasoning_effort_value = "minimal"
157
+ reasoning_tokens = self.max_tokens * 2 # 2x: desired output + minimal reasoning
158
+ effort_desc = "minimal (fastest for GPT-5 series)"
159
+ elif any(x in model_lower for x in ["o1-preview", "o1-mini"]):
160
+ # Old O-series don't support reasoning_effort parameter at all
161
+ reasoning_effort_value = None
162
+ reasoning_tokens = self.max_tokens * 2 # 2x for default reasoning
163
+ effort_desc = "default (no reasoning_effort support)"
164
+ else:
165
+ # O1, O3, O4 series support "low" as minimum
166
+ reasoning_effort_value = "low"
167
+ reasoning_tokens = self.max_tokens * 2 # 2x: desired output + low reasoning
168
+ effort_desc = "low (fastest for O-series)"
169
+
170
+ print(f" ℹ️ Reasoning model detected - temperature disabled, tokens increased to {reasoning_tokens}")
171
+ print(f" (Using {effort_desc} + JSON mode)")
172
+
173
+ # Add reasoning-specific parameters
174
+ common_params["max_completion_tokens"] = reasoning_tokens
175
+ if reasoning_effort_value:
176
+ common_params["reasoning_effort"] = reasoning_effort_value
177
+
178
+ response = self.client.chat.completions.create(**common_params)
179
+ else:
180
+ # Regular chat models: include temperature
181
+ print(f" ℹ️ Chat model with JSON mode enabled")
182
+ common_params["max_completion_tokens"] = self.max_tokens
183
+ common_params["temperature"] = TEMPERATURE
184
+ response = self.client.chat.completions.create(**common_params)
185
+ else:
186
+ # Own inference servers typically use max_tokens
187
+ response = self.client.chat.completions.create(
188
+ model=self.model_name,
189
+ messages=messages,
190
+ max_tokens=self.max_tokens,
191
+ temperature=TEMPERATURE,
192
+ )
193
  generated_text = response.choices[0].message.content
194
 
195
+ # Debug: Print raw output info
196
+ print(f" 📄 Response length: {len(generated_text) if generated_text else 0} characters")
197
+
198
+ if not generated_text:
199
+ print("❌ Error: Model returned empty response")
200
+ print(f"Full response object: {response}")
201
+ return []
202
+
203
  # Extract JSON from the response
204
  try:
205
  # Find the first '[' and last ']'
 
208
  if start_index != -1 and end_index != -1:
209
  json_str = generated_text[start_index:end_index]
210
  script = json.loads(json_str)
211
+ print(f" ✅ Successfully parsed {len(script)} dialogue items")
212
  return script
213
  else:
214
+ print("No JSON found in output.")
215
+ print(f"📝 Raw output preview (first 500 chars):\n{generated_text[:500]}")
216
+ if len(generated_text) > 500:
217
+ print(f"... (truncated, total length: {len(generated_text)})")
218
  return []
219
  except json.JSONDecodeError as e:
220
+ print(f"Error parsing JSON: {e}")
221
+
222
+ # Show context around the error location
223
+ if hasattr(e, 'pos') and e.pos:
224
+ error_pos = e.pos
225
+ context_start = max(0, error_pos - 200)
226
+ context_end = min(len(generated_text), error_pos + 200)
227
+ print(f"\n📍 Error at position {error_pos}:")
228
+ print(f"Context:\n...{generated_text[context_start:context_end]}...")
229
+
230
+ print(f"\n📝 Full output (first 1000 chars):\n{generated_text[:1000]}")
231
+ if len(generated_text) > 1000:
232
+ print(f"\n... (truncated, total length: {len(generated_text)} chars)")
233
  return []
234
 
235
 
 
238
 
239
 
240
  def get_generator(
241
+ provider_mode="own_inference",
242
  own_base_url=None,
243
  own_api_key=None,
244
  own_model=None,
 
250
  Get a script generator instance with flexible provider support.
251
 
252
  Args:
253
+ provider_mode: "own_inference" or "openai"
254
  own_base_url: Base URL for own inference server
255
  own_api_key: API key for own inference server
256
  own_model: Model name for own inference server
 
263
  """
264
  global _generator_instance
265
 
266
+ # Create instance based on provider mode
 
267
  if provider_mode == "openai":
268
  if not openai_key:
269
+ raise ValueError("OpenAI API key is required for OpenAI provider mode")
270
+ return ScriptGenerator(
271
+ provider_mode="openai",
272
+ openai_key=openai_key,
273
+ openai_model=openai_model,
274
+ max_tokens=max_tokens or MAX_TOKENS,
275
+ )
 
 
 
 
276
 
277
  if provider_mode == "own_inference":
278
  if not own_base_url:
279
+ raise ValueError("Base URL is required for own inference provider mode")
280
+ return ScriptGenerator(
281
+ provider_mode="own_inference",
282
+ own_base_url=own_base_url,
283
+ own_api_key=own_api_key,
284
+ own_model=own_model,
 
 
 
 
 
 
 
 
 
 
 
285
  max_tokens=max_tokens or MAX_TOKENS,
286
  )
287
+
288
+ raise ValueError(f"Invalid provider_mode: {provider_mode}")
live.py CHANGED
@@ -3,50 +3,65 @@ import subprocess
3
  import datetime
4
 
5
  # ---------------------------------------------------------------------------
6
- # Lütfen curl komutunuzu tırnak işaretleri arasına yapıştırın.
7
- # Örnek: curl -X POST http://api.example.com/update
8
  # ---------------------------------------------------------------------------
 
 
 
 
 
 
 
 
9
  CURL_COMMAND = """
10
- curl --location 'https://8000-dep-01kady4n8bfqjjatmpqtzhdcp9-d.cloudspaces.litng.ai/v1/chat/completions' \
11
  --header 'Content-Type: application/json' \
12
- --header 'Authorization: Basic YmF0dTpCYXR1aGFuMTIz' \
13
  --data '{
14
- "model": "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit",
15
  "messages": [
16
  {
17
  "role": "user",
18
- "content": "You are a helpful assistant. How manny letters in strawberry?"
19
  }
20
  ]
21
  }'
22
  """
 
 
 
 
23
  # ---------------------------------------------------------------------------
24
 
25
  def run_periodically():
26
- print(f"Script başlatıldı: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
27
- print(f"Komut: {CURL_COMMAND.strip()}")
28
  print("-" * 50)
29
-
 
 
 
 
 
30
  while True:
31
  try:
32
  current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
33
- print(f"[{current_time}] İstek gönderiliyor...")
34
-
35
- # shell=True, komutun terminaldeki gibi çalışmasını sağlar
36
  result = subprocess.run(CURL_COMMAND, shell=True, capture_output=True, text=True)
37
-
38
  if result.returncode == 0:
39
- print(f"Başarılı! Çıktı (ilk 100 karakter): {result.stdout[:100]}...")
40
  else:
41
- print(f"Hata kodu: {result.returncode}")
42
- print(f"Hata çıktısı: {result.stderr}")
43
-
44
  except Exception as e:
45
- print(f"Beklenmedik bir hata oluştu: {e}")
46
-
47
- print("60 saniye bekleniyor...")
48
  print("-" * 50)
49
- time.sleep(60)
50
 
51
  if __name__ == "__main__":
52
  run_periodically()
 
3
import datetime

# ---------------------------------------------------------------------------
# OPTIONAL keep-alive helper for self-hosted inference servers.
# ---------------------------------------------------------------------------
# Some hosting platforms suspend an inference server after a period of
# inactivity; running this script pings the endpoint on a fixed interval so
# it stays warm. Fill in your own endpoint and credentials below — while the
# placeholders are left in place, the script refuses to start.
# ---------------------------------------------------------------------------

# Shell command executed on every tick; replace the placeholder URL, auth
# header, and model name with your own values.
CURL_COMMAND = """
curl --location 'YOUR_INFERENCE_URL_HERE' \
--header 'Content-Type: application/json' \
--header 'Authorization: YOUR_AUTH_HEADER_HERE' \
--data '{
    "model": "your-model-name",
    "messages": [
        {
            "role": "user",
            "content": "Hello, this is a keep-alive ping."
        }
    ]
}'
"""

# Seconds to wait between consecutive keep-alive requests.
INTERVAL_SECONDS = 60

# ---------------------------------------------------------------------------
36
def run_periodically():
    """Ping the configured endpoint forever, once per INTERVAL_SECONDS.

    Refuses to run while CURL_COMMAND still contains the placeholder URL.
    Each iteration shells out to curl and logs success or failure; an
    exception in one iteration never stops the loop.
    """
    ts_format = '%Y-%m-%d %H:%M:%S'
    print(f"Keep-alive script started: {datetime.datetime.now().strftime(ts_format)}")
    print(f"Interval: {INTERVAL_SECONDS} seconds")
    print("-" * 50)

    # Guard against running with the template values still in place.
    if "YOUR_INFERENCE_URL_HERE" in CURL_COMMAND:
        print("⚠️ WARNING: Please configure CURL_COMMAND with your actual endpoint!")
        print("⚠️ Edit this file and replace the placeholder values.")
        return

    while True:
        try:
            current_time = datetime.datetime.now().strftime(ts_format)
            print(f"[{current_time}] Sending keep-alive request...")

            # shell=True so the multi-line curl command runs exactly as it
            # would in a terminal (trusted, locally-configured string only —
            # never substitute untrusted input into CURL_COMMAND).
            result = subprocess.run(CURL_COMMAND, shell=True, capture_output=True, text=True)

            if result.returncode == 0:
                print(f" Success! Response (first 100 chars): {result.stdout[:100]}...")
            else:
                print(f" Error code: {result.returncode}")
                print(f"Error output: {result.stderr}")

        except Exception as e:
            # Keep the loop alive on unexpected failures; just report them.
            print(f"Unexpected error: {e}")

        print(f"Waiting {INTERVAL_SECONDS} seconds...")
        print("-" * 50)
        time.sleep(INTERVAL_SECONDS)


if __name__ == "__main__":
    run_periodically()
output/history.json CHANGED
@@ -54,5 +54,12 @@
54
  "script_length": 11,
55
  "timestamp": "2025-11-19 23:07:42",
56
  "audio_filename": "podcast_20251119_230742.wav"
 
 
 
 
 
 
 
57
  }
58
  ]
 
54
  "script_length": 11,
55
  "timestamp": "2025-11-19 23:07:42",
56
  "audio_filename": "podcast_20251119_230742.wav"
57
+ },
58
+ {
59
+ "url": "https://arxiv.org/abs/2204.06125",
60
+ "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251121_221210.wav",
61
+ "script_length": 21,
62
+ "timestamp": "2025-11-21 22:12:10",
63
+ "audio_filename": "podcast_20251121_221210.wav"
64
  }
65
  ]
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  beautifulsoup4
2
- edge-tts
3
  elevenlabs
4
  gradio
5
  mcp
 
1
  beautifulsoup4
 
2
  elevenlabs
3
  gradio
4
  mcp
synthesis/tts_engine.py CHANGED
@@ -1,9 +1,7 @@
1
- import asyncio
2
  import os
3
  from datetime import datetime
4
  from io import BytesIO
5
 
6
- import edge_tts
7
  from elevenlabs import ElevenLabs, VoiceSettings
8
  from pydub import AudioSegment
9
 
@@ -14,38 +12,7 @@ from utils.config import (
14
  OUTPUT_DIR,
15
  )
16
 
17
- # Edge-TTS Voice Options
18
- EDGE_TTS_VOICES = {
19
- # English (US) - Male
20
- "Guy (US Male - Casual)": "en-US-GuyNeural",
21
- "Christopher (US Male - Authoritative)": "en-US-ChristopherNeural",
22
- "Eric (US Male - Professional)": "en-US-EricNeural",
23
- "Steffan (US Male - Energetic)": "en-US-SteffanNeural",
24
- "Roger (US Male - Elderly)": "en-US-RogerNeural",
25
-
26
- # English (US) - Female
27
- "Jenny (US Female - Friendly)": "en-US-JennyNeural",
28
- "Aria (US Female - Professional)": "en-US-AriaNeural",
29
- "Michelle (US Female - Enthusiastic)": "en-US-MichelleNeural",
30
- "Sara (US Female - News Anchor)": "en-US-SaraNeural",
31
- "Ana (US Female - Child)": "en-US-AnaNeural",
32
-
33
- # English (UK)
34
- "Ryan (UK Male)": "en-GB-RyanNeural",
35
- "Thomas (UK Male - Elderly)": "en-GB-ThomasNeural",
36
- "Sonia (UK Female)": "en-GB-SoniaNeural",
37
- "Libby (UK Female - Enthusiastic)": "en-GB-LibbyNeural",
38
-
39
- # English (Australia)
40
- "William (AU Male)": "en-AU-WilliamNeural",
41
- "Natasha (AU Female)": "en-AU-NatashaNeural",
42
-
43
- # English (India)
44
- "Prabhat (IN Male)": "en-IN-PrabhatNeural",
45
- "Neerja (IN Female)": "en-IN-NeerjaNeural",
46
- }
47
-
48
- # ElevenLabs Voice Options (popular voices)
49
  ELEVENLABS_VOICES = {
50
  # Male Voices
51
  "Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
@@ -80,58 +47,45 @@ def generate_unique_filename():
80
 
81
 
82
  class TTSEngine:
83
- def __init__(self, tts_provider="edge-tts", custom_api_key=None, host_voice=None, guest_voice=None):
84
  """
85
- Initialize TTS Engine with specified provider.
86
 
87
  Args:
88
- tts_provider: "edge-tts" or "elevenlabs"
89
- custom_api_key: API key for ElevenLabs (only used if provider is "elevenlabs")
90
- host_voice: Voice ID/name for Host (optional, uses default if not provided)
91
- guest_voice: Voice ID/name for Guest (optional, uses default if not provided)
92
  """
93
- self.mode = tts_provider.lower()
94
-
95
- if self.mode == "elevenlabs":
96
- print("Initializing ElevenLabs TTS API...")
97
- # Use custom key if provided, otherwise use default
98
- api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
99
- self.client = ElevenLabs(api_key=api_key)
100
-
101
- # Use custom voices or defaults
102
- self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
103
- self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE
104
-
105
- if custom_api_key:
106
- print("✓ ElevenLabs TTS ready (custom API key)")
107
- else:
108
- print("✓ ElevenLabs TTS ready (demo API key)")
109
-
110
- # Print selected voices
111
- host_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id]
112
- guest_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id]
113
- print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
114
- print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")
115
-
116
- elif self.mode == "edge-tts":
117
- print("Initializing Edge-TTS (Microsoft)...")
118
- # Use custom voices or defaults
119
- self.host_voice = host_voice if host_voice else "en-US-GuyNeural"
120
- self.guest_voice = guest_voice if guest_voice else "en-US-JennyNeural"
121
- print("✓ Edge-TTS ready (free, no API key required)")
122
-
123
- # Print selected voices
124
- host_name = [k for k, v in EDGE_TTS_VOICES.items() if v == self.host_voice]
125
- guest_name = [k for k, v in EDGE_TTS_VOICES.items() if v == self.guest_voice]
126
- print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
127
- print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")
128
 
 
 
 
 
 
 
129
  else:
130
- raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'edge-tts' or 'elevenlabs'")
 
 
 
 
 
 
131
 
132
  def synthesize_dialogue(self, script: list) -> str:
133
  """
134
- Synthesize the script to audio using selected TTS provider.
135
 
136
  Args:
137
  script: List of dialogue items
@@ -139,12 +93,7 @@ class TTSEngine:
139
  Returns:
140
  str: Path to the generated audio file
141
  """
142
- if self.mode == "elevenlabs":
143
- return self._synthesize_elevenlabs(script)
144
- elif self.mode == "edge-tts":
145
- return self._synthesize_edge_tts(script)
146
- else:
147
- raise ValueError(f"Unknown TTS mode: {self.mode}")
148
 
149
  def _synthesize_elevenlabs(self, script: list) -> str:
150
  """Synthesize using ElevenLabs API"""
@@ -154,23 +103,30 @@ class TTSEngine:
154
  for i, item in enumerate(script):
155
  text = item["text"]
156
  speaker = item["speaker"]
 
 
 
 
 
 
157
 
158
  # Select voice based on speaker
159
  voice_id = self.guest_voice_id if speaker == "Guest" else self.host_voice_id
160
 
161
  try:
162
- print(f"Synthesizing line {i + 1}/{len(script)} ({speaker})...")
163
 
164
- # Generate audio using ElevenLabs
 
165
  audio_generator = self.client.text_to_speech.convert(
166
  voice_id=voice_id,
167
  text=text,
168
- model_id="eleven_multilingual_v2",
169
  voice_settings=VoiceSettings(
170
- stability=0.5,
171
- similarity_boost=0.75,
172
- style=0.5,
173
- use_speaker_boost=True,
174
  ),
175
  )
176
 
@@ -207,123 +163,20 @@ class TTSEngine:
207
 
208
  return output_path
209
 
210
- def _synthesize_edge_tts(self, script: list) -> str:
211
- """Synthesize using Edge-TTS (Microsoft)"""
212
- print("Synthesizing audio via Edge-TTS (Microsoft)...")
213
- audio_segments = []
214
-
215
- for i, item in enumerate(script):
216
- text = item["text"]
217
- speaker = item["speaker"]
218
-
219
- # Select voice based on speaker
220
- voice = self.guest_voice if speaker == "Guest" else self.host_voice
221
-
222
- try:
223
- print(f"Synthesizing line {i + 1}/{len(script)} ({speaker})...")
224
-
225
- # Generate audio using Edge-TTS (synchronous wrapper for async)
226
- audio_bytes = asyncio.run(self._edge_tts_synthesize(text, voice))
227
-
228
- # Convert to AudioSegment
229
- audio_segment = AudioSegment.from_mp3(BytesIO(audio_bytes))
230
-
231
- # Trim silence from the end of the audio (Edge-TTS adds trailing silence)
232
- # Detect silence threshold: -40 dBFS
233
- audio_segment = self._trim_silence(audio_segment)
234
-
235
- audio_segments.append(audio_segment)
236
-
237
- # Add minimal silence between speakers (50ms for natural flow)
238
- silence = AudioSegment.silent(duration=50)
239
- audio_segments.append(silence)
240
-
241
- print(f"✓ Synthesized line {i + 1}/{len(script)}")
242
-
243
- except Exception as e:
244
- print(f"Error synthesizing line '{text[:50]}...': {e}")
245
- # Continue with next line even if one fails
246
-
247
- if not audio_segments:
248
- print("No audio generated")
249
- return ""
250
-
251
- # Combine all segments
252
- print("Combining audio segments...")
253
- combined = sum(audio_segments)
254
-
255
- # Export as WAV with unique filename
256
- filename = generate_unique_filename()
257
- output_path = os.path.join(OUTPUT_DIR, filename)
258
- combined.export(output_path, format="wav")
259
- print(f"✓ Podcast saved to: {output_path}")
260
-
261
- return output_path
262
-
263
- async def _edge_tts_synthesize(self, text: str, voice: str) -> bytes:
264
- """
265
- Async helper to synthesize text using Edge-TTS.
266
-
267
- Args:
268
- text: Text to synthesize
269
- voice: Voice name to use
270
-
271
- Returns:
272
- bytes: Audio data in MP3 format
273
- """
274
- communicate = edge_tts.Communicate(text, voice)
275
- audio_data = b""
276
-
277
- async for chunk in communicate.stream():
278
- if chunk["type"] == "audio":
279
- audio_data += chunk["data"]
280
-
281
- return audio_data
282
-
283
- def _trim_silence(self, audio_segment, silence_thresh=-40, chunk_size=10):
284
- """
285
- Trim silence from the end of audio segment.
286
-
287
- Args:
288
- audio_segment: AudioSegment to trim
289
- silence_thresh: Silence threshold in dBFS (default: -40)
290
- chunk_size: Size of chunks to analyze in ms (default: 10)
291
-
292
- Returns:
293
- Trimmed AudioSegment
294
- """
295
- # Start from the end and find where audio actually ends
296
- trim_ms = 0
297
-
298
- # Check from the end in chunks
299
- for i in range(len(audio_segment) - chunk_size, 0, -chunk_size):
300
- chunk = audio_segment[i:i + chunk_size]
301
- if chunk.dBFS > silence_thresh:
302
- # Found non-silent audio
303
- trim_ms = i + chunk_size
304
- break
305
-
306
- # If we found non-silent audio, trim there
307
- if trim_ms > 0:
308
- return audio_segment[:trim_ms]
309
-
310
- # Otherwise return original
311
- return audio_segment
312
-
313
 
314
  # Global instance
315
  _tts_instance = None
316
 
317
 
318
- def get_tts_engine(tts_provider="edge-tts", custom_api_key=None, host_voice=None, guest_voice=None):
319
  """
320
- Get TTS engine instance with optional provider, API key, and voices.
321
 
322
  Args:
323
- tts_provider: "edge-tts" or "elevenlabs" (default: "edge-tts")
324
- custom_api_key: Optional custom ElevenLabs API key (only used for ElevenLabs)
325
- host_voice: Voice ID/name for Host (optional)
326
- guest_voice: Voice ID/name for Guest (optional)
327
 
328
  Returns:
329
  TTSEngine instance
@@ -331,7 +184,7 @@ def get_tts_engine(tts_provider="edge-tts", custom_api_key=None, host_voice=None
331
  global _tts_instance
332
 
333
  # Always create new instance if custom settings provided
334
- if custom_api_key or tts_provider != "edge-tts" or host_voice or guest_voice:
335
  return TTSEngine(
336
  tts_provider=tts_provider,
337
  custom_api_key=custom_api_key,
@@ -339,7 +192,7 @@ def get_tts_engine(tts_provider="edge-tts", custom_api_key=None, host_voice=None
339
  guest_voice=guest_voice
340
  )
341
 
342
- # Otherwise, reuse global instance (for default Edge-TTS)
343
  if _tts_instance is None:
344
- _tts_instance = TTSEngine(tts_provider="edge-tts")
345
  return _tts_instance
 
 
1
  import os
2
  from datetime import datetime
3
  from io import BytesIO
4
 
 
5
  from elevenlabs import ElevenLabs, VoiceSettings
6
  from pydub import AudioSegment
7
 
 
12
  OUTPUT_DIR,
13
  )
14
 
15
+ # ElevenLabs Voice Options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  ELEVENLABS_VOICES = {
17
  # Male Voices
18
  "Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
 
47
 
48
 
49
  class TTSEngine:
50
+ def __init__(self, tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
51
  """
52
+ Initialize TTS Engine with ElevenLabs.
53
 
54
  Args:
55
+ tts_provider: Must be "elevenlabs" (kept for compatibility)
56
+ custom_api_key: API key for ElevenLabs (required)
57
+ host_voice: Voice ID for Host (optional, uses default if not provided)
58
+ guest_voice: Voice ID for Guest (optional, uses default if not provided)
59
  """
60
+ self.mode = "elevenlabs"
61
+
62
+ print("Initializing ElevenLabs TTS API...")
63
+ # Use custom key if provided, otherwise use default
64
+ api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
65
+
66
+ if not api_key:
67
+ raise ValueError("ElevenLabs API key is required")
68
+
69
+ self.client = ElevenLabs(api_key=api_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ # Use custom voices or defaults
72
+ self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
73
+ self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE
74
+
75
+ if custom_api_key:
76
+ print("✓ ElevenLabs TTS ready (custom API key)")
77
  else:
78
+ print(" ElevenLabs TTS ready")
79
+
80
+ # Print selected voices
81
+ host_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id]
82
+ guest_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id]
83
+ print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
84
+ print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")
85
 
86
  def synthesize_dialogue(self, script: list) -> str:
87
  """
88
+ Synthesize the script to audio using ElevenLabs.
89
 
90
  Args:
91
  script: List of dialogue items
 
93
  Returns:
94
  str: Path to the generated audio file
95
  """
96
+ return self._synthesize_elevenlabs(script)
 
 
 
 
 
97
 
98
  def _synthesize_elevenlabs(self, script: list) -> str:
99
  """Synthesize using ElevenLabs API"""
 
103
  for i, item in enumerate(script):
104
  text = item["text"]
105
  speaker = item["speaker"]
106
+ emotion = item.get("emotion", "neutral")
107
+
108
+ # Note: ElevenLabs doesn't have a direct emotion parameter.
109
+ # Emotion is conveyed through the text content itself (exclamation marks, word choice, etc.)
110
+ # which the script generator already creates based on the emotion field.
111
+ # We log the emotion for debugging but don't modify the text (would be spoken out loud).
112
 
113
  # Select voice based on speaker
114
  voice_id = self.guest_voice_id if speaker == "Guest" else self.host_voice_id
115
 
116
  try:
117
+ print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
118
 
119
+ # Generate audio using ElevenLabs with Turbo v2.5 for better quality and speed
120
+ # Turbo v2.5: High quality, low latency (~250-300ms), 50% cheaper than v2
121
  audio_generator = self.client.text_to_speech.convert(
122
  voice_id=voice_id,
123
  text=text,
124
+ model_id="eleven_turbo_v2_5", # Upgraded from multilingual_v2 for better quality
125
  voice_settings=VoiceSettings(
126
+ stability=0.4, # Lower = more expressiveness and variation (default: 0.5)
127
+ similarity_boost=0.8, # Higher = better voice consistency (default: 0.75)
128
+ style=0.6, # Higher = more dynamic, expressive delivery (default: 0.5)
129
+ use_speaker_boost=True, # Enhances similarity to original voice
130
  ),
131
  )
132
 
 
163
 
164
  return output_path
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  # Global instance
168
  _tts_instance = None
169
 
170
 
171
+ def get_tts_engine(tts_provider="elevenlabs", custom_api_key=None, host_voice=None, guest_voice=None):
172
  """
173
+ Get TTS engine instance with ElevenLabs.
174
 
175
  Args:
176
+ tts_provider: Must be "elevenlabs" (kept for compatibility)
177
+ custom_api_key: ElevenLabs API key (required)
178
+ host_voice: Voice ID for Host (optional)
179
+ guest_voice: Voice ID for Guest (optional)
180
 
181
  Returns:
182
  TTSEngine instance
 
184
  global _tts_instance
185
 
186
  # Always create new instance if custom settings provided
187
+ if custom_api_key or tts_provider != "elevenlabs" or host_voice or guest_voice:
188
  return TTSEngine(
189
  tts_provider=tts_provider,
190
  custom_api_key=custom_api_key,
 
192
  guest_voice=guest_voice
193
  )
194
 
195
+ # Otherwise, reuse global instance (for default ElevenLabs)
196
  if _tts_instance is None:
197
+ _tts_instance = TTSEngine(tts_provider="elevenlabs")
198
  return _tts_instance
utils/config.py CHANGED
@@ -5,39 +5,23 @@ from dotenv import load_dotenv
5
  # Load environment variables from .env.local
6
  load_dotenv(os.path.join(os.path.dirname(os.path.dirname(__file__)), ".env.local"))
7
 
8
- # Demo Mode Configuration - Load from environment variable
9
- # Set DEMO_MODE=true in .env.local or HuggingFace Spaces secrets
10
- DEMO_MODE = True
11
-
12
  # Model Configurations
13
  SCRIPT_GENERATION_MODEL = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
14
 
15
- # LLM API Inference Settings (Cloud GPU) - Load from .env.local
16
- INFERENCE_API_URL = os.getenv("DEMO_INFERENCE_URL")
17
- INFERENCE_API_KEY = os.getenv("DEMO_INFERENCE_KEY")
 
 
18
 
19
  # TTS API Settings (ElevenLabs)
20
-
21
- # Load from .env.local
22
- ELEVENLABS_API_KEY = os.getenv("DEMO_TTS_KEY")
23
 
24
  # ElevenLabs Voice IDs (you can change these to different voices)
25
  # Find more voices at: https://api.elevenlabs.io/v1/voices
26
  ELEVENLABS_HOST_VOICE = "ErXwobaYiN019PkySvjV" # Antoni - male voice for Host
27
  ELEVENLABS_GUEST_VOICE = "EXAVITQu4vr4xnSDxMaL" # Bella - female voice for Guest
28
 
29
- # Demo Mode Settings (loaded from .env.local)
30
- DEMO_INFERENCE_URL = INFERENCE_API_URL
31
- DEMO_INFERENCE_KEY = INFERENCE_API_KEY
32
- DEMO_MODEL = SCRIPT_GENERATION_MODEL
33
- DEMO_TTS_KEY = ELEVENLABS_API_KEY
34
-
35
- # Optional: Additional API keys for non-demo mode
36
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
37
- CUSTOM_ELEVENLABS_KEY = os.getenv("CUSTOM_ELEVENLABS_KEY", "")
38
- CUSTOM_INFERENCE_URL = os.getenv("CUSTOM_INFERENCE_URL", "")
39
- CUSTOM_INFERENCE_KEY = os.getenv("CUSTOM_INFERENCE_KEY", "")
40
-
41
  # Paths
42
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
43
  TEMP_DIR = os.path.join(BASE_DIR, "temp")
 
5
  # Load environment variables from .env.local
6
  load_dotenv(os.path.join(os.path.dirname(os.path.dirname(__file__)), ".env.local"))
7
 
 
 
 
 
8
  # Model Configurations
9
  SCRIPT_GENERATION_MODEL = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
10
 
11
+ # User API Keys (Bring Your Own Key - BYOK)
12
+ # Users provide these through the settings interface or environment variables
13
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
14
+ INFERENCE_API_URL = os.getenv("INFERENCE_API_URL", "")
15
+ INFERENCE_API_KEY = os.getenv("INFERENCE_API_KEY", "")
16
 
17
  # TTS API Settings (ElevenLabs)
18
+ ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
 
 
19
 
20
  # ElevenLabs Voice IDs (you can change these to different voices)
21
  # Find more voices at: https://api.elevenlabs.io/v1/voices
22
  ELEVENLABS_HOST_VOICE = "ErXwobaYiN019PkySvjV" # Antoni - male voice for Host
23
  ELEVENLABS_GUEST_VOICE = "EXAVITQu4vr4xnSDxMaL" # Bella - female voice for Guest
24
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # Paths
26
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
27
  TEMP_DIR = os.path.join(BASE_DIR, "temp")