Spaces: Running
batuhanozkose committed
Commit · 472739a · 0 Parent(s)
feat: Implement initial PaperCast application with core modules, documentation, a periodic curl script, and a Gradio certificate.
- .gitignore +53 -0
- .gradio/certificate.pem +31 -0
- CLAUDE.md +177 -0
- PAPERCAST_PROJECT_BRIEF.md +332 -0
- README.md +95 -0
- agents/__init__.py +1 -0
- agents/podcast_agent.py +349 -0
- app.py +1203 -0
- generation/__init__.py +1 -0
- generation/script_generator.py +236 -0
- live.py +52 -0
- mcp_servers/__init__.py +1 -0
- mcp_servers/paper_tools_server.py +32 -0
- output/history.json +58 -0
- plan.md +90 -0
- processing/__init__.py +1 -0
- processing/pdf_reader.py +21 -0
- processing/url_fetcher.py +56 -0
- requirements.txt +12 -0
- synthesis/__init__.py +1 -0
- synthesis/tts_engine.py +345 -0
- todo.md +105 -0
- utils/__init__.py +1 -0
- utils/config.py +56 -0
- utils/history.py +58 -0
.gitignore
ADDED
@@ -0,0 +1,53 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/
.venv

# Environment Variables
.env
.env.local

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Project Specific
*.pdf
*.mp3
*.wav
cache/
outputs/
temp/

# HuggingFace
.cache/
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
CLAUDE.md
ADDED
@@ -0,0 +1,177 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

PaperCast is an AI agent application that transforms research papers into engaging podcast-style audio conversations. It takes arXiv URLs or PDF uploads as input, analyzes the paper, generates a natural dialogue between a host and expert, and produces downloadable audio with distinct voices.

**Target Platform:** HuggingFace Spaces (Gradio 6 application)
**Hackathon:** MCP 1st Birthday - Track 2 (MCP in Action - Consumer)
**Required Tag:** `mcp-in-action-track-consumer`

## Development Commands

### Environment Setup
```bash
pip install -r requirements.txt
```

### Running Locally
```bash
python app.py
# Or: gradio app.py
```

### Testing on HuggingFace Spaces
The application must be deployed to HuggingFace Spaces under the `MCP-1st-Birthday` organization.

## Architecture Overview

### Core Pipeline Flow
1. **Input Processing**: Accept arXiv URL or PDF upload
2. **Paper Extraction**: Extract text content from PDF
3. **Agent Analysis**: Identify paper structure (abstract, methodology, findings, conclusions)
4. **Script Generation**: Create natural dialogue between Host and Guest characters
5. **Audio Synthesis**: Generate audio with distinct voices for each speaker
6. **Output Delivery**: Provide transcript and audio file for download
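A minimal happy-path sketch of this pipeline, using the module APIs this repository defines; error handling, logging, and demo-mode configuration are omitted, and the remaining `get_generator`/`get_tts_engine` keyword arguments are assumed to default sensibly (see `agents/podcast_agent.py` for the fully configured orchestration):

```python
# Happy-path sketch of the six pipeline stages; assumptions noted above.
from processing.url_fetcher import fetch_paper_from_url
from processing.pdf_reader import extract_text_from_pdf
from generation.script_generator import get_generator
from synthesis.tts_engine import get_tts_engine

def generate_podcast(url: str) -> str:
    pdf_path = fetch_paper_from_url(url)             # 1. input processing
    text = extract_text_from_pdf(pdf_path)           # 2. paper extraction
    generator = get_generator(provider_mode="demo")  # 3-4. analysis + dialogue script
    script = generator.generate_podcast_script(text)
    tts = get_tts_engine(tts_provider="edge-tts")    # 5. audio synthesis
    return tts.synthesize_dialogue(script)           # 6. path to the audio file
```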
### Agent Behaviors (Critical for Track 2)
The application MUST demonstrate autonomous agent capabilities:
- **Planning**: Analyze paper structure and determine conversation flow strategy
- **Reasoning**: Identify which concepts need simplification, determine appropriate depth
- **Execution**: Orchestrate multi-step pipeline (fetch → extract → analyze → generate → synthesize)
- **Context Management**: Maintain coherence across the dialogue, referencing earlier points

### MCP Integration Requirements
Must use MCP (Model Context Protocol) servers as tools. Potential use cases:
- Web fetching for URL-based paper retrieval
- PDF processing and text extraction
- Document parsing and structured analysis
- Vector database operations if implementing RAG
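One lightweight integration path (a sketch only; not necessarily how `mcp_servers/paper_tools_server.py` is implemented) is Gradio's built-in MCP support, assuming the `mcp_server=True` launch flag described in the Gradio MCP guide listed under Reference Materials:

```python
# Sketch: expose a paper-fetching tool over MCP via Gradio's built-in
# support. The `mcp_server=True` flag is an assumption per the Gradio
# MCP guide; the tool body reuses this repository's processing modules.
import gradio as gr

from processing.pdf_reader import extract_text_from_pdf
from processing.url_fetcher import fetch_paper_from_url

def fetch_paper_text(url: str) -> str:
    """Download a paper PDF from `url` and return its extracted text."""
    return extract_text_from_pdf(fetch_paper_from_url(url))

demo = gr.Interface(fn=fetch_paper_text, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch(mcp_server=True)  # serves the web UI plus an MCP endpoint
```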
### Character Design
- **Host**: Enthusiastic, asks clarifying questions, explains for a general audience, keeps the conversation flowing
- **Guest**: Technical expert/researcher persona, provides depth, answers questions with appropriate detail

## Key Technical Considerations

### PDF Processing
Academic PDFs have inconsistent formatting. Robust error handling is essential:
- Handle multi-column layouts
- Extract references and citations appropriately
- Deal with equations, figures, and tables
- Support various paper formats (arXiv, PubMed, conference papers)

### LLM Dialogue Generation
- Use system prompts to establish distinct character personalities
- Maintain conversation continuity (reference previous points)
- Balance technical accuracy with accessibility
- Target appropriate script length (aim for 5-15 minute podcasts)
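One way to encode the two personalities in a single system prompt (illustrative only; the real prompt lives in `generation/script_generator.py` and may differ):

```python
# Illustrative persona prompt and message builder; not the repo's actual prompt.
SYSTEM_PROMPT = """You are writing a podcast script about a research paper.
Produce alternating turns between two speakers:
- HOST: enthusiastic, asks clarifying questions, explains for a general audience.
- GUEST: the paper's expert, answers with technical depth and concrete examples.
Reference earlier points to keep the conversation continuous.
Target roughly 1,000-2,000 words (a 5-15 minute episode).
Format every turn as 'HOST:' or 'GUEST:' followed by the line."""

def build_messages(paper_text: str) -> list[dict]:
    """Assemble chat messages for an OpenAI-style chat-completions call."""
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Here is the paper text:\n\n{paper_text}"},
    ]
```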
### Text-to-Speech
Critical for user experience:
- Must have clearly distinct voices for Host vs Guest
- Audio quality must be intelligible
- Processing time should be reasonable (target: under 5 minutes total)
- Consider voice emotion/intonation for natural conversation
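A sketch of per-speaker synthesis with the free `edge-tts` package (the engine the agent uses in demo mode); the voice names are examples from Microsoft's public catalog, not necessarily the repository defaults:

```python
# Synthesize one dialogue turn per speaker with a distinct voice.
# edge-tts is async; the voice mapping below is an example only.
import asyncio

import edge_tts

VOICES = {"HOST": "en-US-AriaNeural", "GUEST": "en-US-GuyNeural"}

async def synthesize_turn(speaker: str, text: str, out_path: str) -> None:
    communicate = edge_tts.Communicate(text, VOICES[speaker])
    await communicate.save(out_path)  # writes an MP3 segment to disk

asyncio.run(synthesize_turn("HOST", "Welcome to PaperCast!", "turn_000.mp3"))
```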
### Performance & UX
- Processing can take 2-5 minutes - show clear progress indicators
- Consider async operations for long-running tasks
- Implement graceful error handling (invalid URLs, corrupted PDFs, API failures)
- Optional: Allow script preview before audio generation
- Cache generated podcasts to avoid reprocessing (see the sketch below)
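A minimal caching sketch keyed by a hash of the extracted paper text; the `cache/` directory name matches the `.gitignore` entry, but the exact layout is an assumption:

```python
# Cache generated podcasts under cache/<sha256-of-text>.mp3 so the same
# paper is never reprocessed. Directory layout is an assumption.
import hashlib
from pathlib import Path

CACHE_DIR = Path("cache")

def cached_audio_path(paper_text: str) -> Path:
    key = hashlib.sha256(paper_text.encode("utf-8")).hexdigest()
    return CACHE_DIR / f"{key}.mp3"

def get_or_generate(paper_text: str, generate) -> Path:
    """Return the cached podcast, or call `generate(text) -> path` and store it."""
    CACHE_DIR.mkdir(exist_ok=True)
    target = cached_audio_path(paper_text)
    if not target.exists():
        audio = generate(paper_text)  # expensive LLM + TTS pipeline
        Path(audio).rename(target)    # move the result into the cache
    return target
```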
### Free/Open Source Priority
Budget is limited - prioritize freely available solutions:
- HuggingFace hosted models where possible
- Open source libraries (PyMuPDF, pdfplumber, etc.)
- Free tier APIs within rate limits
- Self-hosted components on HF Spaces infrastructure

## Gradio 6 Interface Requirements

The UI should be simple and intuitive:
- Input section: URL input field + PDF upload (mutually exclusive or combined)
- Processing section: Clear status messages and progress indicators
- Output section:
  - Audio player for immediate listening
  - Download buttons for audio file and transcript
  - Display transcript with speaker labels
- Error messages should be user-friendly

## Submission Requirements Checklist

Required for a valid submission:
- [ ] Working Gradio app deployed to a HuggingFace Space
- [ ] Published under the `MCP-1st-Birthday` organization (not a personal profile)
- [ ] README.md includes the `mcp-in-action-track-consumer` tag
- [ ] Demo video (1-5 minutes) showing the project in action
- [ ] Social media post link (X/LinkedIn) in README
- [ ] Clear documentation of purpose, usage, and technical approach
- [ ] All dependencies in requirements.txt
- [ ] Team member HuggingFace usernames in README

## Judging Criteria Priority

When making design decisions, optimize for:
1. **Completeness**: All deliverables submitted
2. **Design/UI-UX**: Intuitive, polished interface
3. **Functionality**: Effective use of Gradio 6, MCPs, and agent capabilities
4. **Creativity**: Innovative approach to the problem
5. **Documentation**: Clear README and demo video
6. **Real-world impact**: Practical usefulness

## Critical Implementation Notes

### Agent vs API Chaining
This must demonstrate true agent behavior, not just API chaining:
- Show decision-making (e.g., determining which sections to emphasize)
- Demonstrate adaptive behavior (e.g., different strategies for different paper types)
- Use MCP servers as tools the agent reasons about, not just sequential calls

### Natural Dialogue Generation
Avoid a robotic Q&A format:
- Use conversational connectors ("That's fascinating...", "Building on that point...")
- Include natural reactions and acknowledgments
- Vary sentence structure and length
- Use analogies and examples appropriate for a general audience
- Host should ask genuine questions that guide the conversation

### Testing Strategy
Test with diverse paper types:
- Different fields (CS, biology, physics, social sciences)
- Various lengths (short letters vs full papers)
- Different repositories (arXiv, bioRxiv, PubMed)
- Papers with heavy math vs conceptual papers

## File Organization (Recommended)

```
papercast/
├── app.py             # Main Gradio application
├── requirements.txt   # Python dependencies
├── README.md          # Project documentation (must include track tag)
├── agents/            # Agent logic and orchestration
├── mcp_servers/       # MCP server integrations
├── processing/        # PDF extraction and text processing
├── generation/        # Script and dialogue generation
├── synthesis/         # Text-to-speech audio generation
└── utils/             # Helper functions
```

## Known Constraints

- Deadline: November 30, 2025, 11:59 PM UTC
- Must be original work created November 14-30, 2025
- HuggingFace Spaces free tier (GPU available)
- Processing time target: under 5 minutes per paper
- All work must demonstrate MCP integration

## Reference Materials

- Project brief: `PAPERCAST_PROJECT_BRIEF.md`
- Gradio 6 docs: https://www.gradio.app/
- MCP documentation: https://huggingface.co/blog/gradio-mcp
- Hackathon page: https://huggingface.co/MCP-1st-Birthday
PAPERCAST_PROJECT_BRIEF.md
ADDED
@@ -0,0 +1,332 @@
# PaperCast - Project Brief

## Hackathon Context

### Event Details
- **Name:** MCP's 1st Birthday Hackathon
- **Organizers:** Anthropic & Gradio
- **Duration:** November 14-30, 2025 (17 days, 3 weekends)
- **Total Prize Pool:** $21,000 USD + API Credits
- **Total Registrations:** 6100+
- **Platform:** HuggingFace Spaces

### Our Track: Track 2 - MCP in Action (Agents)

**Track Description:** Create complete AI agent Gradio applications that showcase autonomous reasoning, planning, and execution using MCP tools.

**Category:** Consumer Applications
- **Tag Required:** `mcp-in-action-track-consumer`
- **Prize Pool Per Category:**
  - 🥇 First Place: $2,500 USD
  - 🥈 Second Place: $1,000 USD
  - 🥉 Third Place: $500 USD

### Judging Criteria (Priority Order)
1. **Completeness:** HF Space + Social media post + Documentation + Demo Video
2. **Design/Polished UI-UX:** How intuitive and easy-to-use the app is
3. **Functionality:** Effective use of Gradio 6, MCPs, Agentic capabilities
4. **Creativity:** Innovation in idea and implementation
5. **Documentation:** Clear communication in README and demo video
6. **Real-world impact:** Potential for practical usefulness

### Technical Requirements
- Must be published as a HuggingFace Space under the `MCP-1st-Birthday` organization
- Must be a Gradio application
- Must demonstrate autonomous agent behavior (planning, reasoning, execution)
- Must use MCP servers as tools
- Bonus points for: RAG, Context Engineering, advanced agent features
- All work must be original and created during Nov 14-30

### Submission Requirements
1. Working Gradio app deployed on a HuggingFace Space
2. Track tag in README.md: `mcp-in-action-track-consumer`
3. Demo video (1-5 minutes) showing the project in action
4. Social media post link (X/LinkedIn) about the project
5. Clear documentation of purpose, usage, and technical approach

### Available Credits (For Registered Participants)
- OpenAI: $25 for all participants
- HuggingFace: $25 for all participants
- Modal: $250 for all participants
- Nebius Token Factory: $50 for all participants
- ElevenLabs: $44 membership credits (for 5000 participants)
- SambaNova: $25 (for 1500 participants)

**Note:** Credits are provided to support hackathon development, but availability timing may vary. Build with freely available alternatives as the primary approach.

---

## Project Vision: PaperCast

### The Problem
Research papers are incredibly valuable but present significant accessibility challenges:
- Dense, technical language requiring domain expertise
- Time-consuming to read (typically 30-60+ minutes per paper)
- Difficult to consume during daily activities (commute, exercise, chores)
- Creates a barrier between cutting-edge research and broader audiences

### Our Solution
**PaperCast:** An AI agent that transforms research papers into engaging podcast-style conversations between a host and an expert, making complex research accessible through audio.

### Core Value Proposition
- **Input:** arXiv/PubMed URL or PDF upload
- **Process:** AI analyzes and generates natural dialogue between two speakers
- **Output:** Downloadable podcast audio file + transcript
- **Benefit:** Consume research during any activity, in accessible language

### Target Users
1. **Researchers/Academics:** Stay current with the literature during commutes
2. **Students:** Understand papers more easily through a conversational format
3. **Industry Professionals:** Keep up with relevant research without a large time investment
4. **Science Enthusiasts:** Access cutting-edge findings in a digestible format

---

## Functional Requirements

### Input Methods (Dual Support)
1. **URL Input:** Accept links from research repositories (see the sketch after this list)
   - arXiv (e.g., `https://arxiv.org/abs/2401.12345`)
   - PubMed, bioRxiv, other common repositories
   - Extract PDF from URL

2. **PDF Upload:** Direct file upload
   - Support standard academic paper PDFs
   - Handle various formatting styles
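A sketch of the URL-to-PDF step for arXiv specifically (the `abs` → `pdf` rewrite is arXiv's standard URL scheme; the helper name is illustrative, and the repository's real logic lives in `processing/url_fetcher.py`):

```python
# Turn an arXiv abstract URL into its direct PDF URL and download it.
# Illustrative helper only; see processing/url_fetcher.py for the real one.
import requests

def fetch_arxiv_pdf(abs_url: str, out_path: str = "paper.pdf") -> str:
    # https://arxiv.org/abs/2401.12345 -> https://arxiv.org/pdf/2401.12345
    pdf_url = abs_url.replace("/abs/", "/pdf/")
    response = requests.get(pdf_url, timeout=60)
    response.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(response.content)
    return out_path
```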
### Core Processing Pipeline
1. **Paper Extraction:** Extract text content from the PDF or fetched document
2. **Analysis:** Identify key components (abstract, methodology, findings, conclusions)
3. **Script Generation:** Create natural dialogue between two speakers:
   - **Host Character:** Enthusiastic, asks clarifying questions, explains for a general audience
   - **Guest Character:** The expert/researcher, provides technical depth
   - Natural conversation flow with context awareness
   - Appropriate analogies and examples for accessibility
4. **Audio Synthesis:** Convert dialogue to audio with distinct voices for each speaker
5. **Output Delivery:** Provide both transcript and audio file

### Agentic Behaviors to Demonstrate
- **Planning:** Analyze paper structure and determine conversation flow
- **Reasoning:** Identify which concepts need simplification or elaboration
- **Execution:** Orchestrate multiple steps (fetch → extract → analyze → generate → synthesize)
- **Context Management:** Maintain coherence across the dialogue

### User Experience Requirements
- Simple, clean interface (Gradio 6)
- Clear loading states during processing (can take 2-5 minutes)
- Preview of the generated script before audio synthesis (optional)
- Audio player for immediate listening
- Download options for both audio and transcript
- Error handling for invalid URLs or corrupted PDFs

---

## Technical Constraints & Considerations

### Platform & Framework
- **Primary Framework:** Gradio 6 (latest version)
- **Deployment:** HuggingFace Spaces (free tier with GPU)
- **Language:** Python

### MCP Integration
Must use MCP (Model Context Protocol) servers as tools. Potential MCP server use cases:
- Web fetching for URL-based paper retrieval
- PDF processing and text extraction
- Vector database operations for RAG
- Document parsing and analysis

### Architecture Considerations
- The process can be computationally expensive (LLM calls, TTS generation)
- Consider async operations and progress indicators
- Graceful degradation if services are unavailable
- Caching strategies to avoid reprocessing the same papers

### Free/Open Source Priority
Since the budget is limited, prioritize freely available solutions:
- Open source models and libraries
- Free tier APIs (within rate limits)
- HuggingFace ecosystem tools
- Self-hosted components where feasible

**Strategy:** Build core functionality with free tools, then optionally enhance with hackathon credits if/when available.

---

## Success Metrics

### Minimum Viable Product (MVP)
- Accept arXiv URL or PDF upload ✓
- Extract paper text ✓
- Generate a coherent dialogue script ✓
- Produce audio with 2 distinct speakers ✓
- Deployed and functional on an HF Space ✓

### Enhanced Version (If Time Permits)
- Multiple paper repository support
- Customizable podcast length (5 min vs 15 min versions)
- Voice selection or style options
- Background music/intro/outro
- Batch processing for multiple papers
- Save history of generated podcasts

### Demo Quality Goals
- Generate a podcast in under 5 minutes
- Script should be natural and engaging (not robotic)
- Audio should be clearly intelligible
- Voices should be distinctly different
- Technical concepts appropriately explained

---

## Deliverables Checklist

### Code & Deployment
- [ ] Working Gradio application
- [ ] Deployed to a HuggingFace Space under the MCP-1st-Birthday org
- [ ] All dependencies in requirements.txt
- [ ] Clear code organization and comments

### Documentation (README.md)
- [ ] Project title and description
- [ ] Track tag: `mcp-in-action-track-consumer`
- [ ] How-to-use instructions
- [ ] Technical architecture overview
- [ ] Team member(s) HuggingFace usernames
- [ ] Demo video link (embedded)
- [ ] Social media post link
- [ ] Acknowledgment of tools/APIs used

### Demo Video (1-5 minutes)
- [ ] Problem introduction (30 sec)
- [ ] Solution overview (30 sec)
- [ ] Live demonstration (2-3 min)
  - Show URL/PDF input
  - Processing visualization
  - Script preview
  - Audio playback (30-60 sec sample)
- [ ] Technical highlights (30 sec)
- [ ] Impact statement (30 sec)

### Social Media Post
- [ ] Published on X (Twitter) or LinkedIn
- [ ] Includes project description
- [ ] Links to the HuggingFace Space
- [ ] Relevant hashtags (#GradioHackathon #MCP)
- [ ] Demo video or GIF if possible

---

## Timeline Recommendation

### Week 1 (Nov 14-21): Foundation
- Set up project structure
- Implement PDF/URL input handling
- Build text extraction pipeline
- Initial dialogue generation experiments

### Week 2 (Nov 22-27): Core Features
- Refine script generation quality
- Implement audio synthesis
- Build Gradio interface
- Integrate MCP servers
- Testing and iteration

### Week 3 (Nov 28-30): Polish & Submit
- Nov 28: UI refinement, error handling
- Nov 29: Demo video creation, documentation
- Nov 30: Social media post, final testing, submission

---

## Strategic Notes

### Differentiation from Competitors
- Most participants will likely build generic chatbots or simple tools
- PaperCast is unique: specific use case, multimodal output, clear value
- The "podcast" angle is memorable and demo-able
- Strong real-world applicability (education/research)

### Competitive Advantages
1. **Clear use case:** Not just "another AI chat app"
2. **Multimodal:** Text → conversational audio (less competition in this category)
3. **Viral potential:** Researchers will want to share their papers as podcasts
4. **Demo appeal:** Judges can literally listen to the output

### Risk Mitigation
- **TTS Quality:** Critical for user experience - explore multiple options
- **Script Coherence:** May need iterative prompt engineering
- **Processing Time:** Set realistic expectations, show progress
- **PDF Parsing:** Academic PDFs have inconsistent formatting - robust error handling needed

### Bonus Opportunities
- **Modal Innovation Award:** If we use Modal for compute ($2,500)
- **Google Gemini Award:** If we use the Gemini API ($15K in credits)
- **Blaxel Award:** If we use Blaxel in the submission ($2,500)
- **Community Choice:** Maximize social engagement

---

## Resources & Links

### Essential Links
- **Hackathon Page:** https://huggingface.co/MCP-1st-Birthday
- **Discord:** https://discord.gg/fveShqytyh (Channel: #agents-mcp-hackathon-winter25🏆)
- **Gradio 6 Docs:** https://www.gradio.app/
- **MCP Documentation:** https://huggingface.co/blog/gradio-mcp
- **Submission Deadline:** November 30, 2025, 11:59 PM UTC

### Inspirational Examples (June 2025 Hackathon)
Look at previous submissions for quality benchmarks and presentation style.

---

## Critical Reminders

1. **Track Tag is MANDATORY:** `mcp-in-action-track-consumer` in README.md
2. **Organization Requirement:** Must publish under MCP-1st-Birthday, not a personal profile
3. **Social Media is REQUIRED:** The submission is invalid without it
4. **Demo Video is REQUIRED:** Judges won't evaluate without seeing it
5. **Original Work Only:** Everything must be built Nov 14-30, 2025
6. **MCP Integration Required:** Must demonstrate MCP server usage
7. **Agent Behavior Required:** Must show planning, reasoning, execution

---

## Open Questions for Implementation

These are decisions that should be made during development based on experimentation and available resources:

1. **LLM Selection:** Which model for dialogue generation? (Consider: quality, cost, speed, availability)
2. **TTS System:** Which text-to-speech solution? (Consider: voice quality, speaker diversity, processing time, cost)
3. **PDF Processing:** Which library/approach? (PyMuPDF, pdfplumber, etc.)
4. **MCP Architecture:** Which specific MCP servers to integrate, and how?
5. **RAG Strategy:** Do we need vector embeddings? Which embedding model?
6. **Script Length:** Target word count for optimal podcast length?
7. **Caching:** Should we cache generated podcasts? How?
8. **Voice Personalities:** How to prompt for consistent host/guest characteristics?

---

## Success Definition

**We'll know we succeeded when:**
- A user can input a real arXiv paper and get a listenable podcast in under 5 minutes
- The dialogue sounds natural, not like a robotic Q&A
- Technical concepts are explained accessibly
- The demo video makes judges go "wow, I want to use this"
- The project demonstrates clear agent capabilities (not just API chaining)
- We're proud to share it publicly

**This is more than a hackathon submission - it's a tool that could genuinely help people access knowledge more easily.**

---

## Final Note

This document provides context and requirements, but implementation decisions are yours to make. Focus on:
- Building something that works reliably
- Creating an experience that delights users
- Demonstrating thoughtful agent design
- Shipping on time with polish

Good luck! 🎙️🚀
README.md
ADDED
@@ -0,0 +1,95 @@
# PaperCast 🎙️

Transform research papers into engaging podcast-style conversations.

**Track:** `mcp-in-action-track-consumer`

## Overview

PaperCast is an AI agent application that converts academic research papers into accessible, engaging podcast-style audio conversations between a host and an expert. Simply provide an arXiv URL or upload a PDF, and PaperCast will generate a natural dialogue that explains the research in an approachable way.

## Features

- 📄 **Multiple Input Methods**: Accept arXiv URLs or direct PDF uploads
- 🤖 **Autonomous Agent**: Intelligent analysis and conversation planning
- 🎭 **Natural Dialogue**: Two distinct speakers (Host and Guest) with conversational flow
- 🔊 **High-Quality Audio**: Clear, distinct voices for each speaker
- 📝 **Complete Transcripts**: Download both audio and text versions
- ⚡ **Fast Processing**: Generate podcasts in under 5 minutes

## How It Works

1. **Input**: Provide an arXiv URL or upload a research paper PDF
2. **Analysis**: AI agent analyzes paper structure and identifies key concepts
3. **Script Generation**: Creates natural dialogue between host and expert
4. **Audio Synthesis**: Converts script to audio with distinct voices
5. **Output**: Download podcast audio and transcript

## Technical Stack

- **Framework**: Gradio 6
- **AI Agent**: Autonomous reasoning with MCP integration
- **LLM**: Phi-4-mini-instruct / VibeThinker-1.5B
- **TTS**: Supertone/Maya for distinct voices
- **PDF Processing**: PyMuPDF, pdfplumber
- **Platform**: HuggingFace Spaces

## Installation

```bash
pip install -r requirements.txt
```

## Usage

```bash
python app.py
```

Then open your browser to the provided URL (typically `http://localhost:7860`).

## Project Structure

```
papercast/
├── app.py             # Main Gradio application
├── requirements.txt   # Python dependencies
├── README.md          # This file
├── agents/            # Agent logic and orchestration
├── mcp_servers/       # MCP server integrations
├── processing/        # PDF extraction and text processing
├── generation/        # Script and dialogue generation
├── synthesis/         # Text-to-speech audio generation
└── utils/             # Helper functions
```

## Team

- [Team Member HF Username]

## Demo

[Demo video link will be added here]

## Social Media

[Social media post link will be added here]

## Acknowledgments

Built for the MCP 1st Birthday Hackathon (Track 2: MCP in Action - Consumer).

Special thanks to:
- Anthropic & Gradio for organizing the hackathon
- HuggingFace for hosting infrastructure
- Open source communities for TTS and LLM models

## License

[To be determined]

---

**Hackathon:** MCP's 1st Birthday
**Category:** Consumer Applications
**Organization:** MCP-1st-Birthday
agents/__init__.py
ADDED
@@ -0,0 +1 @@
"""Agent logic and orchestration for PaperCast"""
agents/podcast_agent.py
ADDED
@@ -0,0 +1,349 @@
import time

from generation.script_generator import get_generator
from processing.pdf_reader import extract_text_from_pdf
from processing.url_fetcher import fetch_paper_from_url
from synthesis.tts_engine import get_tts_engine
from utils.config import (
    DEMO_INFERENCE_KEY,
    DEMO_INFERENCE_URL,
    DEMO_MODE,
    DEMO_MODEL,
    DEMO_TTS_KEY,
    MAX_CONTEXT_CHARS,
)
from utils.history import save_to_history


class PodcastAgent:
    def __init__(
        self,
        provider_mode="demo",
        own_base_url=None,
        own_api_key=None,
        own_model=None,
        openai_key=None,
        openai_model=None,
        tts_provider="edge-tts",
        elevenlabs_key=None,
        host_voice=None,
        guest_voice=None,
        max_tokens=None,
    ):
        self.logs = []

        # If demo mode is enabled, override all settings with demo credentials
        if DEMO_MODE:
            self.provider_mode = "demo"
            self.own_base_url = DEMO_INFERENCE_URL
            self.own_api_key = DEMO_INFERENCE_KEY
            self.own_model = DEMO_MODEL
            self.openai_key = None
            self.openai_model = None
            self.tts_provider = "edge-tts"  # Always use Edge-TTS in demo mode
            self.elevenlabs_key = None
            self.host_voice = host_voice
            self.guest_voice = guest_voice
        else:
            self.provider_mode = provider_mode  # "own_inference" or "openai"
            self.own_base_url = own_base_url
            self.own_api_key = own_api_key
            self.own_model = own_model
            self.openai_key = openai_key
            self.openai_model = openai_model
            self.tts_provider = tts_provider
            self.elevenlabs_key = elevenlabs_key
            self.host_voice = host_voice
            self.guest_voice = guest_voice

        self.max_tokens = max_tokens

    def log(self, message):
        timestamp = time.strftime("%H:%M:%S")
        entry = f"[{timestamp}] {message}"
        print(entry)
        self.logs.append(entry)
        return entry

    def process(self, url: str = None, pdf_file=None):
        """
        Orchestrates the conversion from URL or uploaded PDF to Podcast.

        Args:
            url: Paper URL (arXiv or medRxiv)
            pdf_file: Uploaded PDF file object
        """
        # Determine source
        if pdf_file:
            yield self.log(
                f"Received uploaded PDF: {pdf_file.name if hasattr(pdf_file, 'name') else 'file'}"
            )
            pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
            source_ref = "Uploaded PDF"
        elif url:
            yield self.log(f"Received request for URL: {url}")

            # Step 1: Fetch Paper
            yield self.log("Thinking: I need to download the paper first.")
            yield self.log(f"Tool Call: fetch_paper({url})")
            pdf_path = fetch_paper_from_url(url)
            if not pdf_path:
                yield self.log("Error: Failed to download paper.")
                return None, "\n".join(self.logs)
            yield self.log(f"Paper downloaded to: {pdf_path}")
            source_ref = url
        else:
            yield self.log(
                "Error: No input provided. Please provide either a URL or upload a PDF."
            )
            return None, "\n".join(self.logs)

        # Step 2: Read PDF
        yield self.log("Thinking: Now I need to extract the text content.")
        yield self.log(f"Tool Call: read_pdf({pdf_path})")
        text = extract_text_from_pdf(pdf_path)
        if not text:
            yield self.log("Error: Failed to extract text.")
            return None, "\n".join(self.logs)
        yield self.log(f"Extracted {len(text)} characters.")

        # Step 3: Generate Script
        yield self.log(
            "Thinking: The text is ready. I will now generate a podcast script using the LLM."
        )
        if self.provider_mode == "demo":
            yield self.log("Using Demo Inference")
        elif self.provider_mode == "own_inference":
            yield self.log(f"Using Own Inference: {self.own_base_url}")
        else:
            yield self.log(f"Using OpenAI ({self.openai_model or 'gpt-4o-mini'})")
        yield self.log("Tool Call: generate_script(...)")
        generator = get_generator(
            provider_mode=self.provider_mode,
            own_base_url=self.own_base_url,
            own_api_key=self.own_api_key,
            own_model=self.own_model,
            openai_key=self.openai_key,
            openai_model=self.openai_model,
            max_tokens=self.max_tokens,
        )
        script = generator.generate_podcast_script(text)
        if not script:
            yield self.log("Error: Failed to generate script.")
            return None, "\n".join(self.logs)
        yield self.log(f"Generated script with {len(script)} dialogue turns.")

        # Step 4: Synthesize Audio
        yield self.log("Thinking: The script looks good. Sending it to the TTS engine.")
        if self.tts_provider == "edge-tts":
            yield self.log("Using Edge-TTS (Microsoft, free)")
        elif self.tts_provider == "elevenlabs":
            if self.elevenlabs_key:
                yield self.log("Using custom ElevenLabs API key")
            else:
                yield self.log("Using demo ElevenLabs key")
        yield self.log("Tool Call: synthesize_podcast(...)")
        tts = get_tts_engine(
            tts_provider=self.tts_provider,
            custom_api_key=self.elevenlabs_key if self.tts_provider == "elevenlabs" else None,
            host_voice=self.host_voice,
            guest_voice=self.guest_voice,
        )
        audio_path = tts.synthesize_dialogue(script)

        if not audio_path:
            yield self.log("Error: Failed to synthesize audio.")
            return None, "\n".join(self.logs)

        yield self.log(f"Podcast generated successfully at: {audio_path}")

        # Save to history
        save_to_history(source_ref, audio_path, len(script))
        yield self.log("✓ Saved to history")

        return audio_path, "\n".join(self.logs)

    def process_multiple(self, urls: list = None, pdf_files: list = None):
        """
        Orchestrates the conversion from multiple URLs or PDFs to a single comprehensive Podcast.

        Args:
            urls: List of paper URLs (arXiv or medRxiv)
            pdf_files: List of uploaded PDF file objects
        """
        all_texts = []
        source_refs = []
        total_chars = 0

        # Process URLs
        if urls:
            yield self.log(f"Received {len(urls)} URLs to process.")
            yield self.log(f"Context limit: {MAX_CONTEXT_CHARS:,} characters")

            for i, url in enumerate(urls, 1):
                yield self.log(f"\n=== Processing Paper {i}/{len(urls)} ===")
                yield self.log(f"URL: {url}")

                # Step 1: Fetch Paper
                yield self.log(f"Tool Call: fetch_paper({url})")
                pdf_path = fetch_paper_from_url(url)
                if not pdf_path:
                    yield self.log(f"Warning: Failed to download paper {i}, skipping.")
                    continue

                yield self.log(f"Paper {i} downloaded successfully.")

                # Step 2: Read PDF
                yield self.log(f"Tool Call: read_pdf({pdf_path})")
                text = extract_text_from_pdf(pdf_path)
                if not text:
                    yield self.log(
                        f"Warning: Failed to extract text from paper {i}, skipping."
                    )
                    continue

                text_length = len(text)
                yield self.log(f"Extracted {text_length:,} characters from paper {i}.")

                # Check context limit
                if total_chars + text_length > MAX_CONTEXT_CHARS:
                    yield self.log("⚠️ Context limit reached!")
                    yield self.log(
                        f"Current total: {total_chars:,} chars + Paper {i}: {text_length:,} chars = {total_chars + text_length:,} chars"
                    )
                    yield self.log(f"Maximum allowed: {MAX_CONTEXT_CHARS:,} chars")
                    yield self.log(
                        f"Stopping at {len(all_texts)} papers. Remaining papers will be skipped."
                    )
                    break

                all_texts.append(f"=== PAPER {i} ===\n{text}\n")
                source_refs.append(url)
                total_chars += text_length
                yield self.log(
                    f"✓ Paper {i} added. Total context: {total_chars:,} chars ({(total_chars / MAX_CONTEXT_CHARS) * 100:.1f}% of limit)"
                )

        # Process PDFs
        elif pdf_files:
            yield self.log(f"Received {len(pdf_files)} PDF files to process.")
            yield self.log(f"Context limit: {MAX_CONTEXT_CHARS:,} characters")

            for i, pdf_file in enumerate(pdf_files, 1):
                yield self.log(f"\n=== Processing PDF {i}/{len(pdf_files)} ===")
                pdf_name = pdf_file.name if hasattr(pdf_file, "name") else f"file_{i}"
                yield self.log(f"File: {pdf_name}")

                pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

                # Read PDF
                yield self.log(f"Tool Call: read_pdf({pdf_path})")
                text = extract_text_from_pdf(pdf_path)
                if not text:
                    yield self.log(
                        f"Warning: Failed to extract text from PDF {i}, skipping."
                    )
                    continue

                text_length = len(text)
                yield self.log(f"Extracted {text_length:,} characters from PDF {i}.")

                # Check context limit
                if total_chars + text_length > MAX_CONTEXT_CHARS:
                    yield self.log("⚠️ Context limit reached!")
                    yield self.log(
                        f"Current total: {total_chars:,} chars + PDF {i}: {text_length:,} chars = {total_chars + text_length:,} chars"
                    )
                    yield self.log(f"Maximum allowed: {MAX_CONTEXT_CHARS:,} chars")
                    yield self.log(
                        f"Stopping at {len(all_texts)} files. Remaining PDFs will be skipped."
                    )
                    break

                all_texts.append(f"=== PAPER {i} ===\n{text}\n")
                source_refs.append(f"Uploaded PDF {i}")
                total_chars += text_length
                yield self.log(
                    f"✓ PDF {i} added. Total context: {total_chars:,} chars ({(total_chars / MAX_CONTEXT_CHARS) * 100:.1f}% of limit)"
                )

        if not all_texts:
            yield self.log("Error: No papers were successfully processed.")
            return None, "\n".join(self.logs)

        # Combine all texts
        yield self.log(f"\n✓ Successfully processed {len(all_texts)} papers")
        yield self.log(
            f"Total context: {total_chars:,} characters ({(total_chars / MAX_CONTEXT_CHARS) * 100:.1f}% of limit)"
        )
        yield self.log(
            "Thinking: Now I'll combine all papers into a comprehensive podcast script."
        )

        combined_text = "\n\n".join(all_texts)

        # Step 3: Generate Comprehensive Script
        yield self.log(
            "\nThinking: Creating a comprehensive podcast script covering all papers."
        )
        if self.provider_mode == "demo":
            yield self.log("Using Demo Inference")
        elif self.provider_mode == "own_inference":
            yield self.log(f"Using Own Inference: {self.own_base_url}")
        else:
            yield self.log(f"Using OpenAI ({self.openai_model or 'gpt-4o-mini'})")
        yield self.log("Tool Call: generate_script(...)")
        generator = get_generator(
            provider_mode=self.provider_mode,
            own_base_url=self.own_base_url,
            own_api_key=self.own_api_key,
            own_model=self.own_model,
            openai_key=self.openai_key,
            openai_model=self.openai_model,
            max_tokens=self.max_tokens,
        )

        # Add instruction for multi-paper script
        multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
        script = generator.generate_podcast_script(multi_paper_prompt)

        if not script:
            yield self.log("Error: Failed to generate script.")
            return None, "\n".join(self.logs)

        yield self.log(
            f"Generated comprehensive script with {len(script)} dialogue turns."
        )

        # Step 4: Synthesize Audio
        yield self.log(
            "\nThinking: The script looks good. Sending it to the TTS engine."
        )
        if self.tts_provider == "edge-tts":
            yield self.log("Using Edge-TTS (Microsoft, free)")
        elif self.tts_provider == "elevenlabs":
            if self.elevenlabs_key:
                yield self.log("Using custom ElevenLabs API key")
            else:
                yield self.log("Using demo ElevenLabs key")
        yield self.log("Tool Call: synthesize_podcast(...)")
        tts = get_tts_engine(
            tts_provider=self.tts_provider,
            custom_api_key=self.elevenlabs_key if self.tts_provider == "elevenlabs" else None,
            host_voice=self.host_voice,
            guest_voice=self.guest_voice,
        )
        audio_path = tts.synthesize_dialogue(script)

        if not audio_path:
            yield self.log("Error: Failed to synthesize audio.")
            return None, "\n".join(self.logs)

        yield self.log(f"Podcast generated successfully at: {audio_path}")

        # Save to history
        source_ref = f"Multiple papers: {', '.join(source_refs[:3])}{'...' if len(source_refs) > 3 else ''}"
        save_to_history(source_ref, audio_path, len(script))
        yield self.log("✓ Saved to history")

        return audio_path, "\n".join(self.logs)
app.py
ADDED
|
@@ -0,0 +1,1203 @@
import os
from datetime import datetime

import gradio as gr

from agents.podcast_agent import PodcastAgent
from synthesis.tts_engine import EDGE_TTS_VOICES, ELEVENLABS_VOICES
from utils.config import (
    DEMO_INFERENCE_KEY,
    DEMO_INFERENCE_URL,
    DEMO_MODE,
    DEMO_MODEL,
    DEMO_TTS_KEY,
    OUTPUT_DIR,
    SCRIPT_GENERATION_MODEL,
)
from utils.history import get_history_items, load_history

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)


def validate_settings_for_generation(
    llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key
):
    """
    Validate user settings for podcast generation in non-demo mode.

    Returns:
        tuple: (is_valid, error_message)
    """
    # Skip validation if in demo mode
    if DEMO_MODE:
        return True, ""

    errors = []

    # Validate LLM settings
    if llm_choice == "Own Inference":
        if not own_base_url:
            errors.append("❌ **Own Inference**: Base URL is required")
        elif not (
            own_base_url.startswith("http://") or own_base_url.startswith("https://")
        ):
            errors.append(
                "❌ **Own Inference**: Base URL must start with http:// or https://"
            )

    elif llm_choice == "OpenAI":
        if not openai_key:
            errors.append("❌ **OpenAI**: API key is required")
        elif not openai_key.startswith("sk-"):
            errors.append("❌ **OpenAI**: API key must start with 'sk-'")

    # Validate TTS settings
    if tts_provider == "elevenlabs":
        if not elevenlabs_key:
            errors.append("❌ **ElevenLabs**: API key is required")
        elif not elevenlabs_key.startswith("sk_"):
            errors.append("❌ **ElevenLabs**: API key must start with 'sk_'")
    # Edge-TTS doesn't require any validation (it's free)

    if errors:
        return False, "\n".join(errors)

    return True, ""
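# Example (hypothetical values): a missing OpenAI key is reported, not raised:
#   validate_settings_for_generation("OpenAI", "", "", "", "edge-tts", "")
#   # -> (False, "❌ **OpenAI**: API key is required")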


def get_stats():
    """Get statistics"""
    history = load_history()
    total = len(history)
    return f"🚀 **Total Podcasts: {total}**"


def generate_progress_indicator(current_step):
    """Generate visual progress indicator"""
    steps = [
        {"name": "Fetching Paper", "icon": "📥"},
        {"name": "Extracting Text", "icon": "📄"},
        {"name": "Generating Script", "icon": "✍️"},
        {"name": "Synthesizing Audio", "icon": "🎙️"},
    ]

    progress_html = "<div style='padding: 15px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin: 10px 0;'>"
    progress_html += "<div style='display: flex; justify-content: space-between; align-items: center;'>"

    for i, step in enumerate(steps):
        step_num = i + 1
        if step_num < current_step:
            # Completed step
            status_color = "#4ade80"  # Green
            icon = "✅"
        elif step_num == current_step:
            # Current step
            status_color = "#fbbf24"  # Yellow
            icon = "⏳"
        else:
            # Pending step
            status_color = "#9ca3af"  # Gray
            icon = "⏸️"

        progress_html += f"""
<div style='text-align: center; flex: 1;'>
    <div style='font-size: 2em; margin-bottom: 5px;'>{icon}</div>
    <div style='color: white; font-weight: bold; font-size: 0.9em;'>{step['name']}</div>
    <div style='color: {status_color}; font-size: 0.8em; margin-top: 3px;'>Step {step_num}/4</div>
</div>
"""

    progress_html += "</div></div>"
    return progress_html
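# run_agent (below) drives this with steps 1-4 while work is in flight and 5
# once everything finished, which renders all four steps with the green check.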


def validated_generate_agent(
    url,
    pdf_file,
    advanced_mode,
    multi_urls,
    multi_pdfs,
    user_llm_choice,
    user_own_base_url,
    user_own_api_key,
    user_own_model,
    user_openai_key,
    user_openai_model,
    user_tts_provider,
    user_elevenlabs_key,
    user_host_voice,
    user_guest_voice,
    user_podcast_length,
    progress=gr.Progress(),
):
    """Validate settings and run podcast generation"""

    # Validate settings first
    is_valid, error_message = validate_settings_for_generation(
        user_llm_choice,
        user_own_base_url,
        user_own_api_key,
        user_openai_key,
        user_tts_provider,
        user_elevenlabs_key,
    )

    if not is_valid:
        # Yield error message
        yield "", f"⚠️ **Settings Required**\n\n{error_message}\n\nPlease complete your settings in the Settings tab before generating a podcast."
        return

    # If valid, run the actual generation
    yield from run_agent(
        url,
        pdf_file,
        advanced_mode,
        multi_urls,
        multi_pdfs,
        user_llm_choice,
        user_own_base_url,
        user_own_api_key,
        user_own_model,
        user_openai_key,
        user_openai_model,
        user_tts_provider,
        user_elevenlabs_key,
        user_host_voice,
        user_guest_voice,
        user_podcast_length,
        progress,
    )
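# Both functions are generators: `yield from` above streams run_agent's
# (progress_html, log_text) tuples straight through to the Gradio outputs.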


def run_agent(
    url,
    pdf_file,
    advanced_mode,
    multi_urls,
    multi_pdfs,
    user_llm_choice,
    user_own_base_url,
    user_own_api_key,
    user_own_model,
    user_openai_key,
    user_openai_model,
    user_tts_provider,
    user_elevenlabs_key,
    user_host_voice,
    user_guest_voice,
    user_podcast_length,
    progress=gr.Progress(),
):
    """Run podcast generation with optional user settings"""

    # Determine provider mode
    if DEMO_MODE:
        provider_mode = "demo"
    elif user_llm_choice == "Own Inference":
        provider_mode = "own_inference"
    else:  # OpenAI
        provider_mode = "openai"

    agent = PodcastAgent(
        provider_mode=provider_mode,
        own_base_url=user_own_base_url if user_own_base_url else None,
        own_api_key=user_own_api_key if user_own_api_key else None,
        own_model=user_own_model if user_own_model else None,
        openai_key=user_openai_key if user_openai_key else None,
        openai_model=user_openai_model if user_openai_model else None,
        tts_provider=user_tts_provider if user_tts_provider else "edge-tts",
        elevenlabs_key=user_elevenlabs_key if user_elevenlabs_key else None,
        host_voice=user_host_voice if user_host_voice else None,
        guest_voice=user_guest_voice if user_guest_voice else None,
        max_tokens=user_podcast_length if user_podcast_length else 4096,
    )
    logs_history = ""

    # Log settings being used
    settings_log = "Settings: "
    if provider_mode == "demo":
        settings_log += "LLM: Demo Inference | TTS: Edge-TTS (Microsoft) | "
    elif provider_mode == "own_inference":
        settings_log += "LLM: Own Inference | "
        if user_tts_provider == "edge-tts":
            settings_log += "TTS: Edge-TTS (Microsoft) | "
        elif user_elevenlabs_key:
            settings_log += "TTS: Custom ElevenLabs | "
        else:
            settings_log += "TTS: ElevenLabs (no key provided) | "
    else:  # openai
        settings_log += f"LLM: OpenAI ({user_openai_model or 'gpt-4o-mini'}) | "
        if user_tts_provider == "edge-tts":
            settings_log += "TTS: Edge-TTS (Microsoft) | "
        elif user_elevenlabs_key:
            settings_log += "TTS: Custom ElevenLabs | "
        else:
            settings_log += "TTS: ElevenLabs (no key provided) | "

    settings_log += (
        f"Length: {user_podcast_length if user_podcast_length else 4096} tokens"
    )

    # Initial state
    current_step = 0
    yield "", f"Starting process...\n{settings_log}\n"

    try:
        # Advanced mode: multiple sources
        if advanced_mode:
            if multi_urls and multi_urls.strip():
                # Multiple URLs
                urls = [u.strip() for u in multi_urls.strip().split("\n") if u.strip()]
                if not urls:
                    raise gr.Error("Please provide at least one paper URL")

                current_step = 1
                yield generate_progress_indicator(current_step), f"Processing {len(urls)} papers from URLs...\n"

                # Process multiple URLs
                for log_entry in agent.process_multiple(urls=urls):
                    if isinstance(log_entry, tuple):
                        audio_path, final_logs = log_entry
                        generate_transcript(audio_path, final_logs)
                        current_step = 5  # Completed
                        yield "", final_logs + f"\n\n✅ **Podcast Generated!**\n📁 Audio saved to: `{audio_path}`\n\n🎧 Check the **History** tab to listen."
                    else:
                        logs_history += log_entry + "\n"
                        # Update step based on log content
                        if "Extracted" in log_entry or "read_pdf" in log_entry:
                            current_step = 2
                        elif "generate_script" in log_entry or "Generated script" in log_entry:
                            current_step = 3
                        elif "synthesize_podcast" in log_entry or "Synthesizing" in log_entry:
                            current_step = 4
                        yield generate_progress_indicator(current_step), logs_history

            elif multi_pdfs:
                # Multiple PDFs
                if not isinstance(multi_pdfs, list):
                    multi_pdfs = [multi_pdfs]

                current_step = 2  # Skip fetching for PDFs
                yield generate_progress_indicator(current_step), f"Processing {len(multi_pdfs)} PDF files...\n"

                # Process multiple PDFs
                for log_entry in agent.process_multiple(pdf_files=multi_pdfs):
                    if isinstance(log_entry, tuple):
                        audio_path, final_logs = log_entry
                        generate_transcript(audio_path, final_logs)
                        current_step = 5  # Completed
                        yield "", final_logs + f"\n\n✅ **Podcast Generated!**\n📁 Audio saved to: `{audio_path}`\n\n🎧 Check the **History** tab to listen."
                    else:
                        logs_history += log_entry + "\n"
                        # Update step based on log content
                        if "generate_script" in log_entry or "Generated script" in log_entry:
                            current_step = 3
                        elif "synthesize_podcast" in log_entry or "Synthesizing" in log_entry:
                            current_step = 4
                        yield generate_progress_indicator(current_step), logs_history
            else:
                raise gr.Error("Please provide multiple URLs or upload multiple PDFs")

        # Simple mode: single source
        else:
            if not url and not pdf_file:
                raise gr.Error("Please provide a paper URL or upload a PDF file")

            # Determine starting step
            if url:
                current_step = 1  # Fetching
            else:
                current_step = 2  # Skip to extraction for uploaded PDF

            for log_entry in agent.process(url=url if url else None, pdf_file=pdf_file):
                if isinstance(log_entry, tuple):
                    audio_path, final_logs = log_entry
                    generate_transcript(audio_path, final_logs)
                    current_step = 5  # Completed
                    yield "", final_logs + f"\n\n✅ **Podcast Generated!**\n📁 Audio saved to: `{audio_path}`\n\n🎧 Check the **History** tab to listen."
                else:
                    logs_history += log_entry + "\n"
                    # Update step based on log content
                    if "fetch_paper" in log_entry or "downloaded" in log_entry:
                        current_step = 1
                    elif "Extracted" in log_entry or "read_pdf" in log_entry:
                        current_step = 2
                    elif "generate_script" in log_entry or "Generated script" in log_entry:
                        current_step = 3
                    elif "synthesize_podcast" in log_entry or "Synthesizing" in log_entry:
                        current_step = 4
                    yield generate_progress_indicator(current_step), logs_history

    except Exception as e:
        yield "", f"Error: {str(e)}"
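# NOTE: step detection above is keyword-based; the progress indicator only
# stays in sync while PodcastAgent's log messages keep markers like
# "fetch_paper", "read_pdf", "generate_script", and "synthesize_podcast".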


def generate_transcript(audio_path, logs):
    """Generate transcript file"""
    if not audio_path:
        return None
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    transcript_path = os.path.join(OUTPUT_DIR, f"{base_name}_transcript.txt")
    # Explicit UTF-8 so the emoji in the logs survive on any platform
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write("PAPERCAST TRANSCRIPT\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(logs)
    return transcript_path


def get_history_data():
    """Load history for dataframe"""
    items = get_history_items()
    if not items:
        return []

    data = []
    for item in items:
        data.append(
            [
                item.get("timestamp", "N/A"),
                item.get("url", "Uploaded PDF") if item.get("url") else "Uploaded PDF",
                item.get("audio_path", ""),
            ]
        )
    return data


def on_history_select(evt: gr.SelectData, data):
    """Handle history table selection"""
    try:
        # data is the dataframe value. evt.index[0] is the row index
        selected_row = data.iloc[evt.index[0]]
        audio_path = selected_row.iloc[2]  # 3rd column is audio_path
        if os.path.exists(audio_path):
            return audio_path
    except Exception:
        # Ignore malformed selections and just clear the player
        pass
    return None
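# Gradio hands the Dataframe value to the handler as a pandas DataFrame (hence
# .iloc), and evt.index is (row, column), so evt.index[0] is the clicked row.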


def main():
    theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="blue",
    )

    with gr.Blocks(title="PaperCast", theme=theme) as demo:
        # Session state for settings
        if DEMO_MODE:
            user_llm_choice = gr.State(value="demo")
            user_own_base_url = gr.State(value=DEMO_INFERENCE_URL)
            user_own_api_key = gr.State(value=DEMO_INFERENCE_KEY)
            user_own_model = gr.State(value=DEMO_MODEL)
            user_openai_key = gr.State(value="")
            user_openai_model = gr.State(value="")
            user_tts_provider = gr.State(value="edge-tts")
            user_elevenlabs_key = gr.State(value="")
            user_host_voice = gr.State(value="en-US-GuyNeural")
            user_guest_voice = gr.State(value="en-US-JennyNeural")
        else:
            user_llm_choice = gr.State(value="Own Inference")
            user_own_base_url = gr.State(value="")
            user_own_api_key = gr.State(value="")
            user_own_model = gr.State(value="")
            user_openai_key = gr.State(value="")
            user_openai_model = gr.State(value="")
            user_tts_provider = gr.State(value="edge-tts")
            user_elevenlabs_key = gr.State(value="")
            user_host_voice = gr.State(value="en-US-GuyNeural")
            user_guest_voice = gr.State(value="en-US-JennyNeural")
        user_podcast_length = gr.State(value=4096)
        settings_valid = gr.State(value=DEMO_MODE)  # Settings are valid in demo mode

        # Initialize generate button state based on demo mode
        generate_btn_state = gr.State(value=DEMO_MODE)

        with gr.Row():
            gr.HTML("""
<div style='text-align: center; padding: 35px 20px 25px 20px;'>
    <h1 style='font-size: 3.5em; margin-bottom: 5px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; font-weight: bold;'>
        🎙️ PaperCast
    </h1>
    <p style='font-size: 1.5em; color: #444; margin-top: 12px; margin-bottom: 8px; font-weight: 400; line-height: 1.6;'>
        Transform complex research papers into engaging podcast-style conversations
    </p>
    <p style='font-size: 1.1em; color: #888; margin-top: 0; font-weight: 300; font-style: italic;'>
        AI-powered audio that makes science accessible, enjoyable, and easy to understand
    </p>
</div>
""")
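        # Everything below lives in five tabs (Generate, History, Transcripts,
        # Settings, About); the session-scoped gr.State values above feed the
        # generate handler, so per-user settings never persist server-side.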

        with gr.Tabs():
            # ========== CREATE TAB ==========
            with gr.Tab("🎙️ Generate Podcast"):
                # Supported Platforms Banner (only in Create tab)
                with gr.Row():
                    gr.HTML("""
<div style='text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 12px; margin-bottom: 20px;'>
    <h3 style='color: white; margin-bottom: 15px;'>✨ Supported Platforms</h3>
    <div style='display: flex; justify-content: center; gap: 20px; flex-wrap: wrap;'>
        <div style='background: rgba(255,255,255,0.95); padding: 15px 25px; border-radius: 10px; min-width: 200px;'>
            <div style='font-size: 2em; margin-bottom: 5px;'>📄</div>
            <strong style='color: #667eea; font-size: 1.1em;'>arXiv</strong>
            <p style='margin: 5px 0 0 0; font-size: 0.9em; color: #666;'>Physics, CS, AI & Math<br/>2M+ papers</p>
        </div>
        <div style='background: rgba(255,255,255,0.95); padding: 15px 25px; border-radius: 10px; min-width: 200px;'>
            <div style='font-size: 2em; margin-bottom: 5px;'>🏥</div>
            <strong style='color: #667eea; font-size: 1.1em;'>medRxiv</strong>
            <p style='margin: 5px 0 0 0; font-size: 0.9em; color: #666;'>Medical & Health Sciences<br/>Latest research</p>
        </div>
        <div style='background: rgba(255,255,255,0.95); padding: 15px 25px; border-radius: 10px; min-width: 200px;'>
            <div style='font-size: 2em; margin-bottom: 5px;'>📎</div>
            <strong style='color: #667eea; font-size: 1.1em;'>Any PDF</strong>
            <p style='margin: 5px 0 0 0; font-size: 0.9em; color: #666;'>Direct Upload<br/>Any research paper</p>
        </div>
    </div>
</div>
""")

                # Example Papers Section
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### 📚 Example Papers")
                        gr.Markdown("*Click any example to auto-fill the URL field and try it out!*")

                with gr.Row():
                    with gr.Column(scale=1):
                        example_btn1 = gr.Button(
                            "🤖 Attention Is All You Need\n\nThe foundational Transformer paper",
                            size="sm",
                            variant="secondary",
                        )
                    with gr.Column(scale=1):
                        example_btn2 = gr.Button(
                            "🧠 GPT-4 Technical Report\n\nOpenAI's GPT-4 capabilities",
                            size="sm",
                            variant="secondary",
                        )

                with gr.Row():
                    with gr.Column(scale=1):
                        example_btn3 = gr.Button(
                            "🎨 DALL-E 2 Image Generation\n\nDiffusion models for images",
                            size="sm",
                            variant="secondary",
                        )
                    with gr.Column(scale=1):
                        example_btn4 = gr.Button(
                            "🔬 Deep Residual Learning\n\nRevolutionary ResNet architecture",
                            size="sm",
                            variant="secondary",
                        )

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### 1. Source Material")

                        # Simple Mode Inputs (default)
                        with gr.Group(visible=True) as simple_inputs:
                            simple_source_type = gr.Radio(
                                choices=["Paper URL", "Upload PDF"],
                                value="Paper URL",
                                label="Choose Source Type",
                            )

                            url_input = gr.Textbox(
                                label="Paper URL (arXiv, medRxiv)",
                                placeholder="https://arxiv.org/abs/...",
                                visible=True,
                            )

                            pdf_upload = gr.File(
                                label="Upload PDF", file_types=[".pdf"], visible=False
                            )

                        # Advanced Mode Inputs (hidden by default)
                        with gr.Group(visible=False) as advanced_inputs:
                            source_type = gr.Radio(
                                choices=["Multiple URLs", "Multiple PDFs"],
                                value="Multiple URLs",
                                label="Choose Source Type",
                            )

                            multi_url_input = gr.Textbox(
                                label="Paper URLs (one per line)",
                                placeholder="https://arxiv.org/abs/2301.12345\nhttps://arxiv.org/abs/2302.67890\nhttps://www.medrxiv.org/content/...",
                                lines=5,
                                visible=True,
                            )

                            multi_pdf_upload = gr.File(
                                label="Upload Multiple PDFs",
                                file_types=[".pdf"],
                                file_count="multiple",
                                visible=False,
                            )

                        generate_btn = gr.Button(
                            "Generate Podcast",
                            variant="primary",
                            size="lg",
                        )

                        # Advanced Mode Toggle (below button)
                        advanced_mode = gr.Checkbox(
                            label="🚀 Advanced Mode (Multiple Papers)",
                            value=False,
                            info="Enable to process multiple papers at once",
                        )

                        # Warning message for advanced mode
                        advanced_warning = gr.Markdown(
                            """
> ⚠️ **Experimental Feature Warning**
>
> This method processes multiple papers and uses extensive context.
> Accuracy cannot be guaranteed and results may be inconsistent.
> **Not recommended for production use.**
""",
                            visible=False,
                            elem_id="advanced-warning",
                        )

                        # Toggle visibility based on advanced mode
                        def toggle_mode(is_advanced):
                            return {
                                simple_inputs: gr.update(visible=not is_advanced),
                                advanced_inputs: gr.update(visible=is_advanced),
                                advanced_warning: gr.update(visible=is_advanced),
                            }

                        advanced_mode.change(
                            fn=toggle_mode,
                            inputs=[advanced_mode],
                            outputs=[simple_inputs, advanced_inputs, advanced_warning],
                        )
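                        # Returning a {component: gr.update(...)} dict lets one
                        # callback flip visibility on several components at once.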

                        # Toggle between URL and PDF in simple mode
                        def toggle_simple_source(source):
                            if source == "Paper URL":
                                return gr.update(visible=True), gr.update(visible=False)
                            else:
                                return gr.update(visible=False), gr.update(visible=True)

                        simple_source_type.change(
                            fn=toggle_simple_source,
                            inputs=[simple_source_type],
                            outputs=[url_input, pdf_upload],
                        )

                        # Toggle between URLs and PDFs in advanced mode
                        def toggle_advanced_source(source):
                            if source == "Multiple URLs":
                                return gr.update(visible=True), gr.update(visible=False)
                            else:
                                return gr.update(visible=False), gr.update(visible=True)

                        source_type.change(
                            fn=toggle_advanced_source,
                            inputs=[source_type],
                            outputs=[multi_url_input, multi_pdf_upload],
                        )

                        # Example paper button handlers
                        example_btn1.click(
                            fn=lambda: "https://arxiv.org/abs/1706.03762",
                            outputs=[url_input],
                        )
                        example_btn2.click(
                            fn=lambda: "https://arxiv.org/abs/2303.08774",
                            outputs=[url_input],
                        )
                        example_btn3.click(
                            fn=lambda: "https://arxiv.org/abs/2204.06125",
                            outputs=[url_input],
                        )
                        example_btn4.click(
                            fn=lambda: "https://arxiv.org/abs/1512.03385",
                            outputs=[url_input],
                        )

                    with gr.Column(scale=1):
                        gr.Markdown("### 2. Status & Output")

                        # Progress Indicator
                        progress_status = gr.Markdown(
                            value="",
                            label="Progress",
                            visible=False,
                        )

                        status_output = gr.Code(
                            label="Process Log",
                            language="markdown",
                            interactive=False,
                            lines=15,
                        )

                generate_btn.click(
                    fn=validated_generate_agent,
                    inputs=[
                        url_input,
                        pdf_upload,
                        advanced_mode,
                        multi_url_input,
                        multi_pdf_upload,
                        user_llm_choice,
                        user_own_base_url,
                        user_own_api_key,
                        user_own_model,
                        user_openai_key,
                        user_openai_model,
                        user_tts_provider,
                        user_elevenlabs_key,
                        user_host_voice,
                        user_guest_voice,
                        user_podcast_length,
                    ],
                    outputs=[progress_status, status_output],
                )
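                # The inputs list mirrors validated_generate_agent's signature
                # one-to-one; the gr.State values carry the saved settings in.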

            # ========== HISTORY TAB ==========
            with gr.Tab("📚 History"):
                gr.Markdown("### Past Podcasts")
                with gr.Row():
                    refresh_btn = gr.Button("Refresh History", size="sm")

                history_table = gr.Dataframe(
                    headers=["Date", "Source", "Audio Path"],
                    datatype=["str", "str", "str"],
                    value=get_history_data(),
                    interactive=False,
                    label="Click a row to play",
                )

                history_player = gr.Audio(label="Playback", type="filepath")

                def refresh_history():
                    return get_history_data()

                refresh_btn.click(fn=refresh_history, outputs=[history_table])

                # Handle selection
                history_table.select(
                    fn=on_history_select,
                    inputs=[history_table],
                    outputs=[history_player],
                )
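                # .select fires with the clicked cell's coordinates; the handler
                # resolves the row to its stored audio path for the player above.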

            # ========== TRANSCRIPTS TAB ==========
            with gr.Tab("📝 Transcripts"):
                gr.Markdown("### Transcript Viewer")
                gr.Markdown(
                    "*Coming soon: View and download transcripts from history.*"
                )

            # ========== SETTINGS TAB ==========
            with gr.Tab("⚙️ Settings"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pass
                    with gr.Column(scale=3):
                        gr.Markdown("""
<div style="text-align: center;">

# ⚙️ Settings

Configure your PaperCast experience with your own API keys and preferences.

---

</div>
""")

                        gr.Markdown("## 🤖 LLM Selection")
                        gr.Markdown(
                            "Choose which language model provider to use for script generation."
                        )

                        with gr.Group():
                            if DEMO_MODE:
                                gr.Markdown(
                                    "**🔧 Demo Mode Active** - Using built-in inference and TTS services"
                                )
                            else:
                                llm_choice = gr.Radio(
                                    choices=[
                                        "Own Inference",
                                        "OpenAI",
                                    ],
                                    value="Own Inference",
                                    label="Language Model Provider",
                                    info="Choose your language model provider for script generation",
                                )

                            # Own Inference inputs (base URL + API key)
                            own_inference_base_url = gr.Textbox(
                                label="Base URL",
                                placeholder="https://your-server.com/v1",
                                info="OpenAI-compatible endpoint",
                                visible=not DEMO_MODE,
                            )

                            own_inference_api_key = gr.Textbox(
                                label="API Key",
                                placeholder="Optional - leave empty if not required",
                                type="password",
                                info="API key for your inference server (if required)",
                                visible=not DEMO_MODE,
                            )

                            own_inference_model = gr.Textbox(
                                label="Model Name",
                                placeholder="e.g., llama-3.1-8b, mistral-7b",
                                info="Model name on your server",
                                visible=not DEMO_MODE,
                            )

                            # OpenAI inputs
                            openai_key_input = gr.Textbox(
                                label="OpenAI API Key",
                                placeholder="sk-...",
                                type="password",
                                info="Required when using OpenAI",
                                visible=False,  # Hidden by default, shown only when OpenAI is selected
                            )

                            openai_model_input = gr.Textbox(
                                label="OpenAI Model Name",
                                placeholder="gpt-4o-mini",
                                value="gpt-4o-mini",
                                info="Model name (e.g., gpt-4o-mini, gpt-4, gpt-3.5-turbo)",
                                visible=False,  # Hidden by default, shown only when OpenAI is selected
                            )

                        gr.Markdown("---")

                        gr.Markdown("## 🔊 Text-to-Speech (TTS)")
                        if DEMO_MODE:
                            gr.Markdown(
                                "**🔧 Demo Mode Active** - Using Edge-TTS (Microsoft, free)"
                            )
                        else:
                            gr.Markdown(
                                "Choose your TTS provider for audio generation"
                            )

                        with gr.Group():
                            tts_provider_choice = gr.Radio(
                                choices=[
                                    "Edge-TTS (Free, Microsoft)",
                                    "ElevenLabs (Paid, Better Quality)",
                                ],
                                value="Edge-TTS (Free, Microsoft)",
                                label="TTS Provider",
                                info="Edge-TTS is free and works without API key. ElevenLabs offers better voice quality.",
                                visible=not DEMO_MODE,
                            )

                            elevenlabs_key_input = gr.Textbox(
                                label="ElevenLabs API Key",
                                placeholder="sk_... (required for ElevenLabs)",
                                type="password",
                                info="Get your key at: elevenlabs.io",
                                visible=False,  # Hidden by default since Edge-TTS is default
                            )

                        gr.Markdown("### 🎭 Voice Selection")
                        if DEMO_MODE:
                            gr.Markdown("*Choose voices for your podcast (Demo mode uses Edge-TTS)*")

                        # Edge-TTS voice selections
                        with gr.Group(visible=True if DEMO_MODE else not DEMO_MODE) as edge_voice_group:
                            edge_host_voice = gr.Dropdown(
                                choices=list(EDGE_TTS_VOICES.keys()),
                                value="Guy (US Male - Casual)",
                                label="Host Voice (Edge-TTS)",
                                info="Select voice for the podcast host",
                            )
                            edge_guest_voice = gr.Dropdown(
                                choices=list(EDGE_TTS_VOICES.keys()),
                                value="Jenny (US Female - Friendly)",
                                label="Guest Voice (Edge-TTS)",
                                info="Select voice for the expert guest",
                            )

                        # ElevenLabs voice selections (hidden by default, hidden in demo mode)
                        if not DEMO_MODE:
                            with gr.Group(visible=False) as elevenlabs_voice_group:
                                elevenlabs_host_voice = gr.Dropdown(
                                    choices=list(ELEVENLABS_VOICES.keys()),
                                    value="Antoni (Male - Well-rounded)",
                                    label="Host Voice (ElevenLabs)",
                                    info="Select voice for the podcast host",
                                )
                                elevenlabs_guest_voice = gr.Dropdown(
                                    choices=list(ELEVENLABS_VOICES.keys()),
                                    value="Bella (Female - Soft)",
                                    label="Guest Voice (ElevenLabs)",
                                    info="Select voice for the expert guest",
                                )
                        else:
                            # Create dummy components for demo mode so we can reference them
                            elevenlabs_voice_group = None
                            elevenlabs_host_voice = gr.State(value="Antoni (Male - Well-rounded)")
                            elevenlabs_guest_voice = gr.State(value="Bella (Female - Soft)")

                        # Toggle visibility based on LLM choice (only when not in demo mode)
                        if not DEMO_MODE:

                            def toggle_llm_inputs(choice):
                                if choice == "Own Inference":
                                    return {
                                        own_inference_base_url: gr.update(visible=True),
                                        own_inference_api_key: gr.update(visible=True),
                                        own_inference_model: gr.update(visible=True),
                                        openai_key_input: gr.update(visible=False),
                                        openai_model_input: gr.update(visible=False),
                                    }
                                elif choice == "OpenAI":
                                    return {
                                        own_inference_base_url: gr.update(visible=False),
                                        own_inference_api_key: gr.update(visible=False),
                                        own_inference_model: gr.update(visible=False),
                                        openai_key_input: gr.update(visible=True),
                                        openai_model_input: gr.update(visible=True),
                                    }

                            llm_choice.change(
                                fn=toggle_llm_inputs,
                                inputs=[llm_choice],
                                outputs=[
                                    own_inference_base_url,
                                    own_inference_api_key,
                                    own_inference_model,
                                    openai_key_input,
                                    openai_model_input,
                                ],
                            )

                            # Toggle visibility based on TTS provider choice
                            def toggle_tts_inputs(choice):
                                if choice == "Edge-TTS (Free, Microsoft)":
                                    return {
                                        elevenlabs_key_input: gr.update(visible=False),
                                        edge_voice_group: gr.update(visible=True),
                                        elevenlabs_voice_group: gr.update(visible=False),
                                    }
                                else:  # ElevenLabs
                                    return {
                                        elevenlabs_key_input: gr.update(visible=True),
                                        edge_voice_group: gr.update(visible=False),
                                        elevenlabs_voice_group: gr.update(visible=True),
                                    }

                            tts_provider_choice.change(
                                fn=toggle_tts_inputs,
                                inputs=[tts_provider_choice],
                                outputs=[elevenlabs_key_input, edge_voice_group, elevenlabs_voice_group],
                            )
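                            # This wiring sits inside the `if not DEMO_MODE:`
                            # guard: in demo mode elevenlabs_voice_group is None
                            # and cannot be used as an event output.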

                        gr.Markdown("---")

                        gr.Markdown("## 🎚️ Podcast Settings")

                        with gr.Group():
                            podcast_length = gr.Slider(
                                minimum=1000,
                                maximum=8000,
                                value=4096,
                                step=500,
                                label="Podcast Length (Max Tokens)",
                                info="Higher values = longer podcasts",
                            )

                        gr.Markdown("---")

                        save_settings_btn = gr.Button(
                            "💾 Save Settings", variant="primary", size="lg"
                        )
                        settings_status = gr.Markdown("")

                        def save_settings(
                            llm_choice,
                            own_base_url,
                            own_api_key,
                            own_model,
                            openai_key,
                            openai_model,
                            tts_provider,
                            elevenlabs_key,
                            edge_host,
                            edge_guest,
                            elevenlabs_host,
                            elevenlabs_guest,
                            length,
                        ):
                            status = "✅ **Settings Saved!**\n\n"

                            # Convert TTS provider choice to internal format
                            if tts_provider == "Edge-TTS (Free, Microsoft)":
                                tts_provider_internal = "edge-tts"
                            else:
                                tts_provider_internal = "elevenlabs"

                            # Validate settings first (only in non-demo mode)
                            is_valid, validation_message = (
                                validate_settings_for_generation(
                                    llm_choice,
                                    own_base_url,
                                    own_api_key,
                                    openai_key,
                                    tts_provider_internal,
                                    elevenlabs_key,
                                )
                            )

                            # LLM Settings
                            if DEMO_MODE:
                                status += "- LLM: Demo Inference ✓\n"
                            elif llm_choice == "Own Inference":
                                if own_base_url:
                                    status += "- LLM: Own Inference ✓\n"
                                    status += f"  - URL: {own_base_url[:50]}...\n"
                                    status += f"  - Model: {own_model or 'Default'}\n"
                                else:
                                    status += "- ⚠️ LLM: Own Inference selected but no base URL provided\n"
                            elif llm_choice == "OpenAI":
                                if openai_key:
                                    status += f"- LLM: OpenAI ({openai_model or 'gpt-4o-mini'}) ✓\n"
                                else:
                                    status += "- ⚠️ LLM: OpenAI selected but no API key provided\n"

                            # TTS Settings
                            if DEMO_MODE:
                                status += "- TTS: Edge-TTS (Microsoft, free) ✓\n"
                            else:
                                if tts_provider_internal == "edge-tts":
                                    status += "- TTS: Edge-TTS (Microsoft, free) ✓\n"
                                elif elevenlabs_key:
                                    status += "- TTS: ElevenLabs (Custom key) ✓\n"
                                else:
                                    status += "- ⚠️ TTS: ElevenLabs key required\n"

                            # Add validation result
                            if not DEMO_MODE:
                                if is_valid:
                                    status += "\n✅ **All settings are valid!**\n"
                                    status += "🎉 Generate button is now enabled.\n"
                                else:
                                    status += "\n⚠️ **Settings incomplete!**\n"
                                    status += "🚫 Generate button remains disabled.\n"
                                    status += f"\nRequired fixes:\n{validation_message}"

                            status += f"\n- Podcast Length: {int(length)} tokens\n"
                            status += (
                                "\n*Settings will be used for next podcast generation.*"
                            )

                            # Determine which voices to use based on TTS provider
                            if tts_provider_internal == "edge-tts":
                                host_voice = EDGE_TTS_VOICES.get(edge_host, "en-US-GuyNeural")
                                guest_voice = EDGE_TTS_VOICES.get(edge_guest, "en-US-JennyNeural")
                            else:  # elevenlabs
                                host_voice = ELEVENLABS_VOICES.get(elevenlabs_host, "ErXwobaYiN019PkySvjV")
                                guest_voice = ELEVENLABS_VOICES.get(elevenlabs_guest, "EXAVITQu4vr4xnSDxMaL")

                            return (
                                status,
                                llm_choice if not DEMO_MODE else "demo",
                                own_base_url if not DEMO_MODE else DEMO_INFERENCE_URL,
                                own_api_key if not DEMO_MODE else DEMO_INFERENCE_KEY,
                                own_model if not DEMO_MODE else DEMO_MODEL,
                                openai_key,
                                openai_model,
                                tts_provider_internal if not DEMO_MODE else "edge-tts",
                                elevenlabs_key if not DEMO_MODE else "",
                                host_voice if not DEMO_MODE else "en-US-GuyNeural",
                                guest_voice if not DEMO_MODE else "en-US-JennyNeural",
                                int(length),
                                is_valid,
                            )
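                        # The tuple returned above must stay aligned, position by
                        # position, with the outputs list wired to the save button
                        # below; Gradio maps return values to outputs by order.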

                        if DEMO_MODE:
                            # In demo mode, settings are pre-configured but voices can be customized
                            def save_demo_settings(edge_host, edge_guest, length):
                                host_voice = EDGE_TTS_VOICES.get(edge_host, "en-US-GuyNeural")
                                guest_voice = EDGE_TTS_VOICES.get(edge_guest, "en-US-JennyNeural")

                                return (
                                    f"✅ **Settings Saved!**\n\n- LLM: Demo Inference ✓\n- TTS: Edge-TTS (Microsoft, free) ✓\n- Host Voice: {edge_host}\n- Guest Voice: {edge_guest}\n\n*Demo mode is active with built-in services.*",
                                    "demo",
                                    DEMO_INFERENCE_URL,
                                    DEMO_INFERENCE_KEY,
                                    DEMO_MODEL,
                                    "",
                                    "",
                                    "edge-tts",
                                    "",
                                    host_voice,
                                    guest_voice,
                                    int(length),
                                    True,  # settings_valid = True in demo mode
                                )

                            save_settings_btn.click(
                                fn=save_demo_settings,
                                inputs=[edge_host_voice, edge_guest_voice, podcast_length],
                                outputs=[
                                    settings_status,
                                    user_llm_choice,
                                    user_own_base_url,
                                    user_own_api_key,
                                    user_own_model,
                                    user_openai_key,
                                    user_openai_model,
                                    user_tts_provider,
                                    user_elevenlabs_key,
                                    user_host_voice,
                                    user_guest_voice,
                                    user_podcast_length,
                                    settings_valid,
                                ],
                            )
                        else:
                            save_settings_btn.click(
                                fn=save_settings,
                                inputs=[
                                    llm_choice,
                                    own_inference_base_url,
                                    own_inference_api_key,
                                    own_inference_model,
                                    openai_key_input,
                                    openai_model_input,
                                    tts_provider_choice,
                                    elevenlabs_key_input,
                                    edge_host_voice,
                                    edge_guest_voice,
                                    elevenlabs_host_voice,
                                    elevenlabs_guest_voice,
                                    podcast_length,
                                ],
                                outputs=[
                                    settings_status,
                                    user_llm_choice,
                                    user_own_base_url,
                                    user_own_api_key,
                                    user_own_model,
                                    user_openai_key,
                                    user_openai_model,
                                    user_tts_provider,
                                    user_elevenlabs_key,
                                    user_host_voice,
                                    user_guest_voice,
                                    user_podcast_length,
                                    settings_valid,
                                ],
                            )

                    with gr.Column(scale=1):
                        pass

            # ========== ABOUT TAB ==========
            with gr.Tab("ℹ️ About"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pass
                    with gr.Column(scale=3):
                        gr.Markdown(f"""
<div style="text-align: center;">

# About PaperCast

**PaperCast** is an AI-powered application that transforms complex research papers into engaging, accessible audio podcasts.
Making scientific knowledge more accessible, one paper at a time.

---

## 🎯 How It Works

Our intelligent agent orchestrates a multi-step pipeline to create your podcast:

1. **📥 Input** - Provide a paper URL (arXiv, medRxiv) or upload any PDF
2. **📄 Extraction** - AI extracts and analyzes the paper content
3. **🎬 Script Generation** - Creates natural dialogue between Host and Expert personas
4. **🎤 Voice Synthesis** - Generates high-quality audio with distinct voices
5. **✅ Delivery** - Your podcast is ready to listen and download

---

## 🌟 Key Features

**Multiple Sources**: Support for arXiv, medRxiv, and direct PDF uploads
**Natural Dialogue**: Engaging conversation between Host and Expert characters
**High-Quality Audio**: Professional voice synthesis powered by ElevenLabs
**Smart Processing**: AI understands paper structure and creates contextual discussions
**History Tracking**: Keep track of all your generated podcasts

---

## 🔧 Technology Stack

**LLM**: {SCRIPT_GENERATION_MODEL}
**TTS**: Edge-TTS (Microsoft, Free) / ElevenLabs API (Optional)
**Infrastructure**: ☁️ Remote Inference
**Framework**: Gradio 6
**PDF Processing**: PyMuPDF

---

## 🎓 Built For

**MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)

This project demonstrates autonomous agent capabilities through intelligent orchestration
of multiple AI tools to transform static research papers into dynamic audio content.

---

## 📝 About the Agent

PaperCast uses an autonomous agent that:

**Plans** conversation flow based on paper structure
**Reasons** about which concepts need simplification
**Executes** multi-step processing pipeline
**Adapts** dialogue based on paper complexity

---

## 💡 Use Cases

🎧 Listen to papers during commute or exercise
📚 Quick overview of research before deep reading
🌍 Make research accessible to broader audiences
🔬 Stay updated with latest papers in your field

---

Made with ❤️ using AI, Gradio, and ElevenLabs

</div>
""")
                    with gr.Column(scale=1):
                        pass

        # Update generate button state when settings_valid changes
        def update_generate_button_validity(settings_valid):
            return gr.update(interactive=settings_valid)

        settings_valid.change(
            fn=update_generate_button_validity,
            inputs=[settings_valid],
            outputs=[generate_btn],
        )

    demo.launch(server_name="0.0.0.0", share=True)


if __name__ == "__main__":
    main()

generation/__init__.py ADDED
@@ -0,0 +1 @@
"""Script and dialogue generation for PaperCast"""

generation/script_generator.py ADDED
@@ -0,0 +1,236 @@
import base64
import json

import httpx
from openai import OpenAI

from utils.config import (
    DEMO_INFERENCE_KEY,
    DEMO_INFERENCE_URL,
    DEMO_MODE,
    DEMO_MODEL,
    MAX_TOKENS,
    SCRIPT_GENERATION_MODEL,
    TEMPERATURE,
)


class ScriptGenerator:
    def __init__(
        self,
        provider_mode="demo",
        own_base_url=None,
        own_api_key=None,
        own_model=None,
        openai_key=None,
        openai_model=None,
        max_tokens=None,
    ):
        """
        Initialize ScriptGenerator with flexible provider support.

        Args:
            provider_mode: "demo", "own_inference", or "openai"
            own_base_url: Base URL for own inference server
            own_api_key: API key for own inference server
            own_model: Model name for own inference server
            openai_key: OpenAI API key
            openai_model: OpenAI model name (e.g., "gpt-4o-mini", "gpt-4", "gpt-3.5-turbo")
            max_tokens: Maximum tokens for generation
        """
        self.provider_mode = provider_mode
        self.max_tokens = max_tokens or MAX_TOKENS

        if provider_mode == "demo":
            # Demo mode - use hardcoded credentials
            print(f"Using Demo Inference: {DEMO_INFERENCE_URL}")
            username, password = DEMO_INFERENCE_KEY.split(":", 1)
            http_client = httpx.Client(auth=(username, password))
            self.client = OpenAI(
                base_url=DEMO_INFERENCE_URL,
                api_key="dummy",
                http_client=http_client,
            )
            self.model_name = DEMO_MODEL
            print("✓ Demo inference client initialized")

        elif provider_mode == "own_inference":
            # Own inference server
            print(f"Connecting to own inference API: {own_base_url}")

            if own_api_key:
                # If API key is provided, check if it's in "username:password" format
                if ":" in own_api_key:
                    username, password = own_api_key.split(":", 1)
                    http_client = httpx.Client(auth=(username, password))
                    self.client = OpenAI(
                        base_url=own_base_url,
                        api_key="dummy",
                        http_client=http_client,
                    )
                else:
                    # Regular API key
                    self.client = OpenAI(
                        base_url=own_base_url,
                        api_key=own_api_key,
                    )
            else:
                # No API key - some servers don't require it
                self.client = OpenAI(
                    base_url=own_base_url,
                    api_key="dummy",
                )

            self.model_name = own_model or "default"
            print(f"✓ Own inference client initialized (model: {self.model_name})")

        elif provider_mode == "openai":
            # OpenAI
            print(f"Using OpenAI: {openai_model or 'gpt-4o-mini'}")
            self.client = OpenAI(api_key=openai_key)
            self.model_name = openai_model or "gpt-4o-mini"
            print("✓ OpenAI client initialized")

        else:
            raise ValueError(f"Invalid provider_mode: {provider_mode}")
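    # The "username:password" convention above piggybacks HTTP Basic auth onto
    # the OpenAI client via a custom httpx.Client, so a single key field can
    # carry either a bearer-style key or basic-auth credentials.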

    def generate_podcast_script(self, paper_text: str) -> list:
        """
        Generates a podcast script from the given paper text.

        Args:
            paper_text (str): The text content of the research paper.

        Returns:
            list: A list of dictionaries representing the dialogue.
        """

        system_prompt = """You are an expert podcast producer. Your goal is to convert technical research papers into engaging, accessible podcast dialogues between two hosts:
- Host (Alex): Enthusiastic, asks clarifying questions, guides the conversation.
- Guest (Jamie): Expert researcher, explains concepts simply but accurately.

CRITICAL RULES:
1. The Host MUST ALWAYS start with "Welcome to PaperCast!" - This is the show's branding and must never be skipped.
2. NEVER read URLs, links, or web addresses out loud in the dialogue. Skip them completely. They sound awkward in audio format.
3. NEVER mention arxiv IDs, DOIs, or reference numbers. Focus on the content, not the metadata.

Output the script in a valid JSON format. The JSON should be a list of objects, where each object has:
- "speaker": "Host" or "Guest"
- "text": The dialogue text.
- "emotion": An emotion tag supported by the TTS engine (e.g., "excited", "neutral", "thoughtful", "happy").

Example:
[
    {"speaker": "Host", "text": "Welcome to PaperCast! Today we're diving into something really cool.", "emotion": "excited"},
    {"speaker": "Guest", "text": "That's right, Alex. We're looking at a new way to handle large language models.", "emotion": "happy"}
]

Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly but effectively.
"""

        user_prompt = f"Here is the research paper text. Generate a podcast script summarizing the key findings, methodology, and implications.\n\n{paper_text[:10000]}..."

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        print(
            f"Generating script with {self.provider_mode} (model: {self.model_name})..."
        )

        # Call LLM
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            max_tokens=self.max_tokens,
            temperature=TEMPERATURE,
        )
        generated_text = response.choices[0].message.content

        # Extract JSON from the response
        try:
            # Find the first '[' and last ']'
            start_index = generated_text.find("[")
            end_index = generated_text.rfind("]") + 1
            # rfind returns -1 when ']' is missing, which makes end_index 0,
            # so compare against start_index instead of -1.
            if start_index != -1 and end_index > start_index:
                json_str = generated_text[start_index:end_index]
                script = json.loads(json_str)
                return script
            else:
                print("No JSON found in output.")
                return []
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            print(f"Raw output: {generated_text}")
            return []
| 167 |
+
|
| 168 |
+
|
| 169 |
+
# Global instance to avoid reloading model
|
| 170 |
+
_generator_instance = None
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def get_generator(
|
| 174 |
+
provider_mode="demo",
|
| 175 |
+
own_base_url=None,
|
| 176 |
+
own_api_key=None,
|
| 177 |
+
own_model=None,
|
| 178 |
+
openai_key=None,
|
| 179 |
+
openai_model=None,
|
| 180 |
+
max_tokens=None,
|
| 181 |
+
):
|
| 182 |
+
"""
|
| 183 |
+
Get a script generator instance with flexible provider support.
|
| 184 |
+
|
| 185 |
+
Args:
|
| 186 |
+
provider_mode: "demo", "own_inference", or "openai"
|
| 187 |
+
own_base_url: Base URL for own inference server
|
| 188 |
+
own_api_key: API key for own inference server
|
| 189 |
+
own_model: Model name for own inference server
|
| 190 |
+
openai_key: OpenAI API key
|
| 191 |
+
openai_model: OpenAI model name (default: "gpt-4o-mini")
|
| 192 |
+
max_tokens: Maximum tokens for generation
|
| 193 |
+
|
| 194 |
+
Returns:
|
| 195 |
+
ScriptGenerator instance
|
| 196 |
+
"""
|
| 197 |
+
global _generator_instance
|
| 198 |
+
|
| 199 |
+
# Always create new instance for OpenAI or own_inference with custom settings
|
| 200 |
+
# Reuse demo instance if same config
|
| 201 |
+
if provider_mode == "openai":
|
| 202 |
+
if not openai_key:
|
| 203 |
+
print(
|
| 204 |
+
"Warning: OpenAI selected but no API key provided. Falling back to demo mode."
|
| 205 |
+
)
|
| 206 |
+
provider_mode = "demo"
|
| 207 |
+
else:
|
| 208 |
+
return ScriptGenerator(
|
| 209 |
+
provider_mode="openai",
|
| 210 |
+
openai_key=openai_key,
|
| 211 |
+
openai_model=openai_model,
|
| 212 |
+
max_tokens=max_tokens or MAX_TOKENS,
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
if provider_mode == "own_inference":
|
| 216 |
+
if not own_base_url:
|
| 217 |
+
print(
|
| 218 |
+
"Warning: Own Inference selected but no base URL provided. Falling back to demo mode."
|
| 219 |
+
)
|
| 220 |
+
provider_mode = "demo"
|
| 221 |
+
else:
|
| 222 |
+
return ScriptGenerator(
|
| 223 |
+
provider_mode="own_inference",
|
| 224 |
+
own_base_url=own_base_url,
|
| 225 |
+
own_api_key=own_api_key,
|
| 226 |
+
own_model=own_model,
|
| 227 |
+
max_tokens=max_tokens or MAX_TOKENS,
|
| 228 |
+
)
|
| 229 |
+
|
| 230 |
+
# Demo mode - reuse global instance
|
| 231 |
+
if _generator_instance is None or provider_mode == "demo":
|
| 232 |
+
_generator_instance = ScriptGenerator(
|
| 233 |
+
provider_mode="demo",
|
| 234 |
+
max_tokens=max_tokens or MAX_TOKENS,
|
| 235 |
+
)
|
| 236 |
+
return _generator_instance
|
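
Editor's note: a minimal usage sketch of this module (not part of the commit). It assumes the demo credentials in `.env.local` resolve; `paper_text` is a placeholder for real extracted text:

```python
# Minimal usage sketch (illustration only).
# Assumes DEMO_INFERENCE_URL / DEMO_INFERENCE_KEY are set in .env.local.
from generation.script_generator import get_generator

generator = get_generator(provider_mode="demo")
paper_text = "..."  # placeholder for text extracted from a real paper
script = generator.generate_podcast_script(paper_text)
for turn in script:
    print(f"{turn['speaker']} ({turn['emotion']}): {turn['text']}")
```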
live.py
ADDED
@@ -0,0 +1,52 @@
import time
import subprocess
import datetime

# ---------------------------------------------------------------------------
# Paste your curl command between the quotation marks.
# Example: curl -X POST http://api.example.com/update
# ---------------------------------------------------------------------------
CURL_COMMAND = """
curl --location 'https://8000-dep-01kady4n8bfqjjatmpqtzhdcp9-d.cloudspaces.litng.ai/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Basic YmF0dTpCYXR1aGFuMTIz' \
--data '{
    "model": "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit",
    "messages": [
        {
            "role": "user",
            "content": "You are a helpful assistant. How many letters in strawberry?"
        }
    ]
}'
"""
# ---------------------------------------------------------------------------

def run_periodically():
    print(f"Script started: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Command: {CURL_COMMAND.strip()}")
    print("-" * 50)

    while True:
        try:
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"[{current_time}] Sending request...")

            # shell=True makes the command run exactly as it would in a terminal
            result = subprocess.run(CURL_COMMAND, shell=True, capture_output=True, text=True)

            if result.returncode == 0:
                print(f"Success! Output (first 100 characters): {result.stdout[:100]}...")
            else:
                print(f"Error code: {result.returncode}")
                print(f"Error output: {result.stderr}")

        except Exception as e:
            print(f"An unexpected error occurred: {e}")

        print("Waiting 60 seconds...")
        print("-" * 50)
        time.sleep(60)

if __name__ == "__main__":
    run_periodically()
mcp_servers/__init__.py
ADDED
@@ -0,0 +1 @@
"""MCP server integrations for PaperCast"""
mcp_servers/paper_tools_server.py
ADDED
@@ -0,0 +1,32 @@
from mcp.server.fastmcp import FastMCP
from processing.pdf_reader import extract_text_from_pdf
from processing.url_fetcher import fetch_paper_from_url
from generation.script_generator import get_generator
from synthesis.tts_engine import get_tts_engine

mcp = FastMCP("PaperCast Tools")

@mcp.tool()
def read_pdf(path: str) -> str:
    """Reads text from a PDF file."""
    return extract_text_from_pdf(path)

@mcp.tool()
def fetch_arxiv(url: str) -> str:
    """Downloads a paper from an arXiv URL and returns the file path."""
    return fetch_paper_from_url(url)

@mcp.tool()
def synthesize_podcast(script: list) -> str:
    """Synthesizes a podcast from a script (list of dicts). Returns audio path."""
    tts = get_tts_engine()
    return tts.synthesize_dialogue(script)

@mcp.tool()
def generate_script(text: str) -> list:
    """Generates a podcast script from text."""
    generator = get_generator()
    return generator.generate_podcast_script(text)

if __name__ == "__main__":
    mcp.run()
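
Editor's note: one way an agent could exercise these tools over stdio, sketched with the `mcp` Python SDK's client (an illustration under the assumption that the SDK's `stdio_client`/`ClientSession` interface is used; the tool name and URL come from this repo):

```python
# Sketch: calling the fetch_arxiv tool over stdio (illustration only).
import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def main():
    params = StdioServerParameters(
        command="python", args=["mcp_servers/paper_tools_server.py"]
    )
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "fetch_arxiv", {"url": "https://arxiv.org/abs/1706.03762"}
            )
            print(result)

asyncio.run(main())
```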
output/history.json
ADDED
@@ -0,0 +1,58 @@
[
  {
    "url": "https://arxiv.org/abs/2511.14650",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast.wav",
    "script_length": "N/A",
    "timestamp": "2025-11-19 16:32:00",
    "audio_filename": "podcast.wav"
  },
  {
    "url": "https://www.medrxiv.org/content/10.1101/2025.11.14.25340242v1",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251119_170124.wav",
    "script_length": 11,
    "timestamp": "2025-11-19 17:01:24",
    "audio_filename": "podcast_20251119_170124.wav"
  },
  {
    "url": "Uploaded PDF",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251119_170817.wav",
    "script_length": 13,
    "timestamp": "2025-11-19 17:08:17",
    "audio_filename": "podcast_20251119_170817.wav"
  },
  {
    "url": "https://arxiv.org/abs/2511.14650",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251119_210844.wav",
    "script_length": 15,
    "timestamp": "2025-11-19 21:08:44",
    "audio_filename": "podcast_20251119_210844.wav"
  },
  {
    "url": "https://arxiv.org/abs/2401.08406",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251119_212141.wav",
    "script_length": 14,
    "timestamp": "2025-11-19 21:21:41",
    "audio_filename": "podcast_20251119_212141.wav"
  },
  {
    "url": "https://arxiv.org/abs/1706.03762",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251119_230027.wav",
    "script_length": 12,
    "timestamp": "2025-11-19 23:00:27",
    "audio_filename": "podcast_20251119_230027.wav"
  },
  {
    "url": "https://arxiv.org/abs/2303.08774",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251119_230323.wav",
    "script_length": 17,
    "timestamp": "2025-11-19 23:03:23",
    "audio_filename": "podcast_20251119_230323.wav"
  },
  {
    "url": "https://www.medrxiv.org/content/10.1101/2025.05.25.25328317v2",
    "audio_path": "/home/batuhan/lab/papercast/output/podcast_20251119_230742.wav",
    "script_length": 11,
    "timestamp": "2025-11-19 23:07:42",
    "audio_filename": "podcast_20251119_230742.wav"
  }
]
plan.md
ADDED
@@ -0,0 +1,90 @@
# PaperCast Implementation Plan

This plan outlines the steps to build **PaperCast**, an AI agent that converts research papers into podcast-style conversations using MCP, Gradio, and LLMs.

## 1. Infrastructure & Dependencies

- [ ] **Update `requirements.txt`**
  - Add `transformers`, `accelerate`, `bitsandbytes` (for 4-bit LLM loading).
  - Add `scipy` (for audio processing).
  - Add `beautifulsoup4` (for web parsing).
  - Add `python-multipart` (for API handling).
  - Ensure `mcp` and `gradio` versions are pinned.
- [ ] **Project Structure Setup**
  - Create `app.py` (entry point).
  - Ensure `__init__.py` in all subdirs.
  - Create `config.py` in `utils/` for global settings (LLM model names, paths).

## 2. Core Processing Modules

### 2.1. PDF Processing (`processing/`)
- [ ] **Implement `pdf_reader.py`**
  - Function `extract_text_from_pdf(pdf_path) -> str`.
  - Use `PyMuPDF` (fitz) for fast extraction.
  - Implement basic cleaning (remove headers/footers/references if possible).
- [ ] **Implement `url_fetcher.py`**
  - Function `fetch_paper_from_url(url) -> str`.
  - Handle arXiv URLs (convert `/abs/` to `/pdf/` or scrape abstract).
  - Download PDF to temporary storage.

### 2.2. Generation Logic (`generation/`)
- [ ] **Implement `script_generator.py`**
  - **Model**: `unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit`.
  - Define System Prompts for "Host" and "Guest" personas.
  - Function `generate_podcast_script(paper_text) -> List[Dict]`.
  - Output format: `[{"speaker": "Host", "text": "...", "emotion": "excited"}, {"speaker": "Guest", "text": "...", "emotion": "neutral"}]`.
  - **Key Logic**: Prompt the model to include emotion tags (e.g. `[laugh]`, `[sigh]`) supported by Maya1.

### 2.3. Audio Synthesis (`synthesis/`)
- [ ] **Implement `tts_engine.py`**
  - **Model**: `maya-research/maya1`.
  - Function `synthesize_dialogue(script_json) -> audio_path`.
  - Parse the script for emotion tags and pass them to Maya1.
  - Combine audio segments into a single file using `pydub` or `scipy`.

## 3. MCP Server Integration (`mcp_servers/`)

To satisfy the "MCP in Action" requirement, we will expose our core tools as MCP resources/tools.

- [ ] **Create `paper_tools_server.py`**
  - Implement an MCP server that provides:
    - Tool: `read_pdf(path)`
    - Tool: `fetch_arxiv(url)`
    - Tool: `synthesize_podcast(script)`
  - This allows the "Agent" to call these tools via the MCP protocol.

## 4. Agent Orchestration (`agents/`)

- [ ] **Implement `podcast_agent.py`**
  - Create a `PodcastAgent` class.
  - **Planning Loop**:
    1. Receive User Input.
    2. **Plan**: Decide to fetch/read paper.
    3. **Analyze**: Extract key topics.
    4. **Draft**: Generate script using Phi-4-mini.
    5. **Synthesize**: Create audio using Maya1.
  - Use `sequential_thinking` pattern (simulated) to show "Agentic" behavior in the logs/UI.
  - *Crucial*: The Agent should use the MCP Client to call the tools defined in Step 3, demonstrating "Autonomous reasoning using MCP tools".

## 5. User Interface (`app.py`)

- [ ] **Build Gradio UI**
  - Input: Textbox (URL) or File Upload (PDF).
  - Output: Audio Player, Transcript Textbox, Status/Logs Markdown.
  - **Agent Visualization**: Show the "Thoughts" of the agent as it plans and executes (e.g., "Fetching paper...", "Analyzing structure...", "Generating script...").
- [ ] **Deployment Config**
  - Create `Dockerfile` (if needed for custom deps) or rely on HF Spaces default.

## 6. Verification & Polish

- [ ] **Test Run**
  - Run with a real arXiv paper.
  - Verify audio quality and script coherence.
- [ ] **Documentation**
  - Update `README.md` with usage instructions and "MCP in Action" details.
  - Record Demo Video.

## 7. Bonus Features (Time Permitting)

- [ ] **RAG Integration**: Use a vector store to answer questions about the paper after the podcast.
- [ ] **Background Music**: Mix in intro/outro music.
processing/__init__.py
ADDED
@@ -0,0 +1 @@
"""PDF extraction and text processing for PaperCast"""
processing/pdf_reader.py
ADDED
@@ -0,0 +1,21 @@
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF file using PyMuPDF.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text content.
    """
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
processing/url_fetcher.py
ADDED
@@ -0,0 +1,56 @@
import os
import requests
from urllib.parse import urlparse
from utils.config import TEMP_DIR

def fetch_paper_from_url(url: str) -> str:
    """
    Downloads a PDF from a URL (supports arXiv and medRxiv).

    Args:
        url (str): The URL of the paper.

    Returns:
        str: Path to the downloaded PDF file.
    """
    # Handle arXiv abstract URLs
    if "arxiv.org/abs/" in url:
        url = url.replace("/abs/", "/pdf/")
        if not url.endswith(".pdf"):
            url += ".pdf"

    # Handle medRxiv URLs
    # Example: https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1
    # or: https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1.full.pdf
    elif "medrxiv.org/content/" in url:
        if not url.endswith(".pdf"):
            url = url + ".full.pdf"

    try:
        # Add headers to avoid 403 Forbidden errors from bioRxiv/medRxiv
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }

        response = requests.get(url, stream=True, headers=headers, timeout=30)
        response.raise_for_status()

        # Extract filename from URL or use a default
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path)
        if not filename.endswith(".pdf"):
            filename = "downloaded_paper.pdf"

        file_path = os.path.join(TEMP_DIR, filename)

        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        return file_path
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return ""
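
Editor's note: these two modules form the front half of the pipeline. A short sketch of how they chain (the arXiv URL is one already present in `output/history.json`):

```python
# Sketch of the fetch -> extract chain (illustration only).
from processing.url_fetcher import fetch_paper_from_url
from processing.pdf_reader import extract_text_from_pdf

pdf_path = fetch_paper_from_url("https://arxiv.org/abs/1706.03762")
if pdf_path:  # an empty string signals a download failure
    text = extract_text_from_pdf(pdf_path)
    print(text[:500])
```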
requirements.txt
ADDED
@@ -0,0 +1,12 @@
beautifulsoup4
edge-tts
elevenlabs
gradio
mcp
openai
pydub
python-dotenv
python-multipart
pymupdf
requests
scipy
synthesis/__init__.py
ADDED
@@ -0,0 +1 @@
"""Text-to-speech audio generation for PaperCast"""
synthesis/tts_engine.py
ADDED
@@ -0,0 +1,345 @@
import asyncio
import os
from datetime import datetime
from io import BytesIO

import edge_tts
from elevenlabs import ElevenLabs, VoiceSettings
from pydub import AudioSegment

from utils.config import (
    ELEVENLABS_API_KEY,
    ELEVENLABS_GUEST_VOICE,
    ELEVENLABS_HOST_VOICE,
    OUTPUT_DIR,
)

# Edge-TTS Voice Options
EDGE_TTS_VOICES = {
    # English (US) - Male
    "Guy (US Male - Casual)": "en-US-GuyNeural",
    "Christopher (US Male - Authoritative)": "en-US-ChristopherNeural",
    "Eric (US Male - Professional)": "en-US-EricNeural",
    "Steffan (US Male - Energetic)": "en-US-SteffanNeural",
    "Roger (US Male - Elderly)": "en-US-RogerNeural",

    # English (US) - Female
    "Jenny (US Female - Friendly)": "en-US-JennyNeural",
    "Aria (US Female - Professional)": "en-US-AriaNeural",
    "Michelle (US Female - Enthusiastic)": "en-US-MichelleNeural",
    "Sara (US Female - News Anchor)": "en-US-SaraNeural",
    "Ana (US Female - Child)": "en-US-AnaNeural",

    # English (UK)
    "Ryan (UK Male)": "en-GB-RyanNeural",
    "Thomas (UK Male - Elderly)": "en-GB-ThomasNeural",
    "Sonia (UK Female)": "en-GB-SoniaNeural",
    "Libby (UK Female - Enthusiastic)": "en-GB-LibbyNeural",

    # English (Australia)
    "William (AU Male)": "en-AU-WilliamNeural",
    "Natasha (AU Female)": "en-AU-NatashaNeural",

    # English (India)
    "Prabhat (IN Male)": "en-IN-PrabhatNeural",
    "Neerja (IN Female)": "en-IN-NeerjaNeural",
}

# ElevenLabs Voice Options (popular voices)
ELEVENLABS_VOICES = {
    # Male Voices
    "Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
    "Josh (Male - Deep)": "TxGEqnHWrfWFTfGW9XjX",
    "Arnold (Male - Crisp)": "VR6AewLTigWG4xSOukaG",
    "Callum (Male - Hoarse)": "N2lVS1w4EtoT3dr4eOWO",
    "Charlie (Male - Casual)": "IKne3meq5aSn9XLyUdCD",
    "Clyde (Male - War veteran)": "2EiwWnXFnvU5JabPnv8n",
    "Daniel (Male - Deep British)": "onwK4e9ZLuTAKqWW03F9",
    "Ethan (Male - Young American)": "g5CIjZEefAph4nQFvHAz",
    "Fin (Male - Irish)": "D38z5RcWu1voky8WS1ja",
    "George (Male - British)": "JBFqnCBsd6RMkjVDRZzb",

    # Female Voices
    "Bella (Female - Soft)": "EXAVITQu4vr4xnSDxMaL",
    "Rachel (Female - Calm)": "21m00Tcm4TlvDq8ikWAM",
    "Domi (Female - Strong)": "AZnzlk1XvdvUeBnXmlld",
    "Elli (Female - Emotional)": "MF3mGyEYCl7XYWbV9V6O",
    "Emily (Female - Calm British)": "LcfcDJNUP1GQjkzn1xUU",
    "Freya (Female - Young American)": "jsCqWAovK2LkecY7zXl4",
    "Gigi (Female - Young Expressive)": "jBpfuIE2acCO8z3wKNLl",
    "Grace (Female - Southern American)": "oWAxZDx7w5VEj9dCyTzz",
    "Lily (Female - Warm British)": "pFZP5JQG7iQjIQuC4Bku",
    "Matilda (Female - Warm)": "XrExE9yKIg1WjnnlVkGX",
}


def generate_unique_filename():
    """Generate a unique filename using a timestamp"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"podcast_{timestamp}.wav"


class TTSEngine:
    def __init__(self, tts_provider="edge-tts", custom_api_key=None, host_voice=None, guest_voice=None):
        """
        Initialize TTS Engine with the specified provider.

        Args:
            tts_provider: "edge-tts" or "elevenlabs"
            custom_api_key: API key for ElevenLabs (only used if provider is "elevenlabs")
            host_voice: Voice ID/name for Host (optional, uses default if not provided)
            guest_voice: Voice ID/name for Guest (optional, uses default if not provided)
        """
        self.mode = tts_provider.lower()

        if self.mode == "elevenlabs":
            print("Initializing ElevenLabs TTS API...")
            # Use the custom key if provided, otherwise use the default
            api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
            self.client = ElevenLabs(api_key=api_key)

            # Use custom voices or defaults
            self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
            self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE

            if custom_api_key:
                print("✓ ElevenLabs TTS ready (custom API key)")
            else:
                print("✓ ElevenLabs TTS ready (demo API key)")

            # Print selected voices
            host_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id]
            guest_name = [k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id]
            print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
            print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")

        elif self.mode == "edge-tts":
            print("Initializing Edge-TTS (Microsoft)...")
            # Use custom voices or defaults
            self.host_voice = host_voice if host_voice else "en-US-GuyNeural"
            self.guest_voice = guest_voice if guest_voice else "en-US-JennyNeural"
            print("✓ Edge-TTS ready (free, no API key required)")

            # Print selected voices
            host_name = [k for k, v in EDGE_TTS_VOICES.items() if v == self.host_voice]
            guest_name = [k for k, v in EDGE_TTS_VOICES.items() if v == self.guest_voice]
            print(f" Host: {host_name[0] if host_name else 'Custom/Default'}")
            print(f" Guest: {guest_name[0] if guest_name else 'Custom/Default'}")

        else:
            raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'edge-tts' or 'elevenlabs'")

    def synthesize_dialogue(self, script: list) -> str:
        """
        Synthesize the script to audio using the selected TTS provider.

        Args:
            script: List of dialogue items

        Returns:
            str: Path to the generated audio file
        """
        if self.mode == "elevenlabs":
            return self._synthesize_elevenlabs(script)
        elif self.mode == "edge-tts":
            return self._synthesize_edge_tts(script)
        else:
            raise ValueError(f"Unknown TTS mode: {self.mode}")

    def _synthesize_elevenlabs(self, script: list) -> str:
        """Synthesize using the ElevenLabs API"""
        print("Synthesizing audio via ElevenLabs API...")
        audio_segments = []

        for i, item in enumerate(script):
            text = item["text"]
            speaker = item["speaker"]

            # Select voice based on speaker
            voice_id = self.guest_voice_id if speaker == "Guest" else self.host_voice_id

            try:
                print(f"Synthesizing line {i + 1}/{len(script)} ({speaker})...")

                # Generate audio using ElevenLabs
                audio_generator = self.client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=text,
                    model_id="eleven_multilingual_v2",
                    voice_settings=VoiceSettings(
                        stability=0.5,
                        similarity_boost=0.75,
                        style=0.5,
                        use_speaker_boost=True,
                    ),
                )

                # Collect audio bytes
                audio_bytes = b"".join(audio_generator)

                # Convert to AudioSegment
                audio_segment = AudioSegment.from_mp3(BytesIO(audio_bytes))
                audio_segments.append(audio_segment)

                # Add 500ms of silence between speakers
                silence = AudioSegment.silent(duration=500)
                audio_segments.append(silence)

                print(f"✓ Synthesized line {i + 1}/{len(script)}")

            except Exception as e:
                print(f"Error synthesizing line '{text[:50]}...': {e}")
                # Continue with the next line even if one fails

        if not audio_segments:
            print("No audio generated")
            return ""

        # Combine all segments
        print("Combining audio segments...")
        combined = sum(audio_segments)

        # Export as WAV with a unique filename
        filename = generate_unique_filename()
        output_path = os.path.join(OUTPUT_DIR, filename)
        combined.export(output_path, format="wav")
        print(f"✓ Podcast saved to: {output_path}")

        return output_path

    def _synthesize_edge_tts(self, script: list) -> str:
        """Synthesize using Edge-TTS (Microsoft)"""
        print("Synthesizing audio via Edge-TTS (Microsoft)...")
        audio_segments = []

        for i, item in enumerate(script):
            text = item["text"]
            speaker = item["speaker"]

            # Select voice based on speaker
            voice = self.guest_voice if speaker == "Guest" else self.host_voice

            try:
                print(f"Synthesizing line {i + 1}/{len(script)} ({speaker})...")

                # Generate audio using Edge-TTS (synchronous wrapper for async)
                audio_bytes = asyncio.run(self._edge_tts_synthesize(text, voice))

                # Convert to AudioSegment
                audio_segment = AudioSegment.from_mp3(BytesIO(audio_bytes))

                # Trim silence from the end of the audio (Edge-TTS adds trailing silence)
                # Silence detection threshold: -40 dBFS
                audio_segment = self._trim_silence(audio_segment)

                audio_segments.append(audio_segment)

                # Add minimal silence between speakers (50ms for natural flow)
                silence = AudioSegment.silent(duration=50)
                audio_segments.append(silence)

                print(f"✓ Synthesized line {i + 1}/{len(script)}")

            except Exception as e:
                print(f"Error synthesizing line '{text[:50]}...': {e}")
                # Continue with the next line even if one fails

        if not audio_segments:
            print("No audio generated")
            return ""

        # Combine all segments
        print("Combining audio segments...")
        combined = sum(audio_segments)

        # Export as WAV with a unique filename
        filename = generate_unique_filename()
        output_path = os.path.join(OUTPUT_DIR, filename)
        combined.export(output_path, format="wav")
        print(f"✓ Podcast saved to: {output_path}")

        return output_path

    async def _edge_tts_synthesize(self, text: str, voice: str) -> bytes:
        """
        Async helper to synthesize text using Edge-TTS.

        Args:
            text: Text to synthesize
            voice: Voice name to use

        Returns:
            bytes: Audio data in MP3 format
        """
        communicate = edge_tts.Communicate(text, voice)
        audio_data = b""

        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_data += chunk["data"]

        return audio_data

    def _trim_silence(self, audio_segment, silence_thresh=-40, chunk_size=10):
        """
        Trim silence from the end of an audio segment.

        Args:
            audio_segment: AudioSegment to trim
            silence_thresh: Silence threshold in dBFS (default: -40)
            chunk_size: Size of chunks to analyze in ms (default: 10)

        Returns:
            Trimmed AudioSegment
        """
        # Start from the end and find where the audio actually ends
        trim_ms = 0

        # Check from the end in chunks
        for i in range(len(audio_segment) - chunk_size, 0, -chunk_size):
            chunk = audio_segment[i:i + chunk_size]
            if chunk.dBFS > silence_thresh:
                # Found non-silent audio
                trim_ms = i + chunk_size
                break

        # If we found non-silent audio, trim there
        if trim_ms > 0:
            return audio_segment[:trim_ms]

        # Otherwise return the original
        return audio_segment


# Global instance
_tts_instance = None


def get_tts_engine(tts_provider="edge-tts", custom_api_key=None, host_voice=None, guest_voice=None):
    """
    Get a TTS engine instance with an optional provider, API key, and voices.

    Args:
        tts_provider: "edge-tts" or "elevenlabs" (default: "edge-tts")
        custom_api_key: Optional custom ElevenLabs API key (only used for ElevenLabs)
        host_voice: Voice ID/name for Host (optional)
        guest_voice: Voice ID/name for Guest (optional)

    Returns:
        TTSEngine instance
    """
    global _tts_instance

    # Always create a new instance if custom settings are provided
    if custom_api_key or tts_provider != "edge-tts" or host_voice or guest_voice:
        return TTSEngine(
            tts_provider=tts_provider,
            custom_api_key=custom_api_key,
            host_voice=host_voice,
            guest_voice=guest_voice
        )

    # Otherwise, reuse the global instance (for default Edge-TTS)
    if _tts_instance is None:
        _tts_instance = TTSEngine(tts_provider="edge-tts")
    return _tts_instance
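
Editor's note: end to end, the engine is driven like this (a sketch; Edge-TTS needs no API key, and note that the `emotion` field produced by the script generator is currently not consumed by either provider):

```python
# Sketch: rendering a two-line script with the default Edge-TTS voices.
from synthesis.tts_engine import get_tts_engine

script = [
    {"speaker": "Host", "text": "Welcome to PaperCast!", "emotion": "excited"},
    {"speaker": "Guest", "text": "Happy to be here.", "emotion": "happy"},
]
tts = get_tts_engine(tts_provider="edge-tts")
print("Saved to", tts.synthesize_dialogue(script))
```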
todo.md
ADDED
@@ -0,0 +1,105 @@
# PaperCast New Feature Implementations

## Vision
We are not building "another paper summarizer".
We are building **the world's first interactive, multi-modal, counterfactual-aware, visually-synced academic podcast studio** powered by MCP tools, Gradio 6, Marker-pdf, Semantic Scholar, arXiv and ElevenLabs.

We invented 4 original frameworks that will be heavily emphasized in the demo and submission:

- **PPF** — Podcast Persona Framework
- **PVF** — Paper Visual Framework
- **PAD** — Paper Auto-Discovery
- **CPM** — Counterfactual Paper Mode

We will constantly refer to these acronyms in the demo:
"We created the Podcast Persona Framework (PPF) to solve the one-size-fits-all podcast problem" → instant "wow this is professional" effect.

## Mandatory Upgrades (Must be done first)

1. **Full Gradio 6.x Upgrade**
   - Entire project migrates to Gradio ≥6.0
   - Replace gr.ChatInterface with a manual gr.Chatbot + gr.Row/gr.Column structure (for full UI control)
   - Launch with `launch(mcp=True)` → automatic MCP server on HF Spaces
   - Full streaming support (ElevenLabs + partial transcript updates)

2. **MCP PDF Server → Marker-pdf Ultra Edition**
   - Server name: `marker-pdf-mcp-server`
   - Output: **clean GitHub-flavored markdown** (LaTeX equations preserved with $$...$$, real markdown tables, figure/table metadata with page numbers)
   - New parameter: `extract_mode` → "full" | "smart_focus" (Claude automatically selects the 3-4 most important sections)
   - Tool name in agent: `extract_paper_with_marker`

## Core Features

### 1. Podcast Persona Framework (PPF) — Killer Feature #1
User selects a persona via a dropdown plus an optional custom text box.

Implemented modes (exact names):

1. **Friendly Explainer** → Current default (two friends casually discussing)
2. **Academic Debate** → One defends the paper, the other politely challenges ("This claim is strong, but the Table 2 baseline seems weak...")
3. **Savage Roast** → One speaker brutally roasts the paper ("This ablation is an absolute clown show", "Figure 4 is statistical noise"), the other stubbornly defends it
4. **Pedagogical** → Speaker A = Professor, Speaker B = Curious Student (the student constantly asks questions)
5. **Interdisciplinary Clash** → Speaker A = Domain Expert, Speaker B = Complete Outsider (e.g. a biologist reading an ML paper → "This neuron analogy makes zero biological sense")

### 2. Paper Auto-Discovery (PAD) — Killer Feature #2

Input methods:
- PDF upload
- Direct URL (arXiv, Semantic Scholar, HF, etc.)
- Free text query (new input method) → "Grok reasons about everything" or "diffusion survey 2025"

Workflow (a sketch of step 1 follows this list):
1. Agent calls the **Semantic Scholar Graph v1 API** (`/paper/search?query=...&fields=title,authors,year,abstract,openAccessPdf,url`)
2. Parallel call to the **arXiv API** (`http://export.arxiv.org/api/query?search_query=...`)
3. Collect the top 5 results → show the user title + abstract + year + source in gr.Radio or clickable cards
4. User selects → if openAccessPdf exists → download directly → Marker MCP extract
5. Otherwise fetch from arXiv

Zero-friction paper discovery.
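A hedged sketch of step 1, using the same field list as the endpoint query above (`search_semantic_scholar` is a hypothetical helper name, not yet in the repo):

```python
# Sketch of PAD step 1: Semantic Scholar Graph v1 paper search
# (illustration only; the helper name is hypothetical).
import requests

def search_semantic_scholar(query: str, limit: int = 5) -> list:
    resp = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": limit,
            "fields": "title,authors,year,abstract,openAccessPdf,url",
        },
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json().get("data", [])

for paper in search_semantic_scholar("diffusion survey 2025"):
    print(paper.get("year"), paper.get("title"))
```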
| 60 |
+
|
| 61 |
+
### 3. Paper Visual Framework (PVF) — Killer Feature #3 (Jury will lose their minds)
|
| 62 |
+
Right column of Gradio interface shows embedded PDF viewer (PDF.js).
|
| 63 |
+
|
| 64 |
+
- Marker provides exact page numbers for figures/tables
|
| 65 |
+
- When speakers say "Let's look at Figure 8" → PDF auto-scrolls to correct page + highlights/zooms the figure
|
| 66 |
+
- Transcript entries become clickable timestamps that jump to the exact location
|
| 67 |
+
- Implementation: ElevenLabs streaming → parse chunk for figure/table mentions → emit JS event → PDF.js control
|
| 68 |
+
|
| 69 |
+
This single feature wins "Best UX" + "Most Innovative" categories alone.
|
| 70 |
+
|
| 71 |
+
### 4. Counterfactual Paper Mode ("What If?")
|
| 72 |
+
Post-podcast button:
|
| 73 |
+
"What if this paper was written by Yann LeCun? / in 2012? / if GPT-4 never existed? / by DeepMind instead of OpenAI?"
|
| 74 |
+
|
| 75 |
+
→ Claude re-writes/re-interprets the same paper in alternate reality → new podcast generated.
|
| 76 |
+
Extremely fun, extremely memorable, extremely shareable.
|
| 77 |
+
|
| 78 |
+
### 5. Ultra Transcript System
|
| 79 |
+
- Timestamped (00:00:12)
|
| 80 |
+
- Speaker-labeled (Savage Critic:, Professor:, etc.)
|
| 81 |
+
- Clickable figure/table references (syncs with PVF)
|
| 82 |
+
- LaTeX equations rendered via MathJax
|
| 83 |
+
- Download buttons: .txt, .srt, .docx, .vtt
|
| 84 |
+
- Bonus: "Copy as tweet" → auto-selects the 3 spiciest quotes with citation
|
| 85 |
+
|
| 86 |
+
## Final UI Layout (Gradio 6)
|
| 87 |
+
```python
|
| 88 |
+
with gr.Row():
|
| 89 |
+
with gr.Column(scale=3):
|
| 90 |
+
chatbot = gr.Chatbot(height=700, render=True)
|
| 91 |
+
controls = gr.Row() # query input + PPF dropdown + custom persona + buttons
|
| 92 |
+
audio_player = gr.Audio(autoplay=True, streaming=True)
|
| 93 |
+
transcript = gr.Markdown()
|
| 94 |
+
with gr.Column(scale=2):
|
| 95 |
+
pdf_viewer = gr.HTML() # PVF - embedded PDF.js
|
| 96 |
+
timeline_vis = gr.HTML() # PET timeline
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
Required MCP Tools
|
| 100 |
+
|
| 101 |
+
extract_paper_with_marker → returns markdown string
|
| 102 |
+
search_semantic_scholar → returns json
|
| 103 |
+
search_arxiv → returns json
|
| 104 |
+
fetch_pdf_from_url → returns bytes
|
| 105 |
+
batch_extract_papers (for PET)
|
utils/__init__.py
ADDED
@@ -0,0 +1 @@
"""Helper functions for PaperCast"""
utils/config.py
ADDED
@@ -0,0 +1,56 @@
import os

from dotenv import load_dotenv

# Load environment variables from .env.local
load_dotenv(os.path.join(os.path.dirname(os.path.dirname(__file__)), ".env.local"))

# Demo Mode Configuration
# Currently hardcoded on; set to False to require user-provided credentials
DEMO_MODE = True

# Model Configurations
SCRIPT_GENERATION_MODEL = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"

# LLM API Inference Settings (Cloud GPU) - loaded from .env.local
INFERENCE_API_URL = os.getenv("DEMO_INFERENCE_URL")
INFERENCE_API_KEY = os.getenv("DEMO_INFERENCE_KEY")

# TTS API Settings (ElevenLabs)

# Loaded from .env.local
ELEVENLABS_API_KEY = os.getenv("DEMO_TTS_KEY")

# ElevenLabs Voice IDs (you can change these to different voices)
# Find more voices at: https://api.elevenlabs.io/v1/voices
ELEVENLABS_HOST_VOICE = "ErXwobaYiN019PkySvjV"  # Antoni - male voice for Host
ELEVENLABS_GUEST_VOICE = "EXAVITQu4vr4xnSDxMaL"  # Bella - female voice for Guest

# Demo Mode Settings (loaded from .env.local)
DEMO_INFERENCE_URL = INFERENCE_API_URL
DEMO_INFERENCE_KEY = INFERENCE_API_KEY
DEMO_MODEL = SCRIPT_GENERATION_MODEL
DEMO_TTS_KEY = ELEVENLABS_API_KEY

# Optional: Additional API keys for non-demo mode
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
CUSTOM_ELEVENLABS_KEY = os.getenv("CUSTOM_ELEVENLABS_KEY", "")
CUSTOM_INFERENCE_URL = os.getenv("CUSTOM_INFERENCE_URL", "")
CUSTOM_INFERENCE_KEY = os.getenv("CUSTOM_INFERENCE_KEY", "")

# Paths
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TEMP_DIR = os.path.join(BASE_DIR, "temp")
OUTPUT_DIR = os.path.join(BASE_DIR, "output")

# Ensure directories exist
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Generation Settings
MAX_TOKENS = 4096  # Supports long-form content generation
TEMPERATURE = 0.7

# Context Limits for Multi-Paper Processing
MAX_CONTEXT_CHARS = 80000  # Maximum total characters for multiple papers (roughly 20K tokens at ~4 chars/token)
# This keeps us well within the 128K token limit while leaving room for prompts and responses
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import os
from datetime import datetime
from utils.config import OUTPUT_DIR

HISTORY_FILE = os.path.join(OUTPUT_DIR, "history.json")

def load_history():
    """Load podcast generation history from the JSON file"""
    if not os.path.exists(HISTORY_FILE):
        return []

    try:
        with open(HISTORY_FILE, 'r') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading history: {e}")
        return []

def save_to_history(url, audio_path, script_length):
    """Save a podcast generation to history"""
    history = load_history()

    entry = {
        "url": url,
        "audio_path": audio_path,
        "script_length": script_length,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "audio_filename": os.path.basename(audio_path)
    }

    history.append(entry)

    try:
        with open(HISTORY_FILE, 'w') as f:
            json.dump(history, f, indent=2)
        print(f"✓ Saved to history: {url}")
    except Exception as e:
        print(f"Error saving to history: {e}")

def get_history_items():
    """Get history items formatted for Gradio display"""
    history = load_history()

    if not history:
        return []

    # Return in reverse order (newest first)
    items = []
    for entry in reversed(history):
        items.append({
            "timestamp": entry["timestamp"],
            "url": entry["url"],
            "audio_path": entry["audio_path"],
            "script_length": entry.get("script_length", "N/A")
        })

    return items
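
Editor's note: the history helpers in use (a sketch; the path and values are illustrative and mirror the entries in `output/history.json`):

```python
# Sketch: record a finished run and read the history back (illustration only).
from utils.history import save_to_history, get_history_items

save_to_history(
    url="https://arxiv.org/abs/1706.03762",
    audio_path="output/podcast_20251119_230027.wav",
    script_length=12,
)
for item in get_history_items():
    print(item["timestamp"], item["url"])
```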