""" Supertonic TTS Integration for PaperCast CPU-based Text-to-Speech using Supertone/supertonic model. Provides an interface compatible with the main TTSEngine. """ import os import io import wave import numpy as np from typing import Iterator, Optional from huggingface_hub import snapshot_download # Voice mapping for Supertonic SUPERTONIC_VOICES = { "M1 (Male 1)": "M1", "M2 (Male 2)": "M2", "F1 (Female 1)": "F1", "F2 (Female 2)": "F2", } # Default voices for Host and Guest DEFAULT_HOST_VOICE = "M1" # Male voice for Host DEFAULT_GUEST_VOICE = "F1" # Female voice for Guest class SupertonicWrapper: """Wrapper for Supertonic TTS to integrate with PaperCast""" def __init__(self, assets_dir: Optional[str] = None, use_gpu: bool = False): """ Initialize Supertonic TTS Args: assets_dir: Path to assets directory (default: ./supertonic_assets) use_gpu: Whether to use GPU (default: False, CPU-only) """ self.use_gpu = use_gpu # Set assets directory in papercast project if assets_dir is None: project_root = os.path.dirname(os.path.dirname(__file__)) self.assets_dir = os.path.join(project_root, "supertonic_assets") else: self.assets_dir = assets_dir self.tts = None self._initialized = False print(f"Supertonic assets directory: {self.assets_dir}") def _ensure_models_downloaded(self): """Download models from HuggingFace if not present""" if not os.path.exists(self.assets_dir): print(f"Downloading Supertonic models to {self.assets_dir}...") print("This is a one-time download (~400MB)...") snapshot_download(repo_id="Supertone/supertonic", local_dir=self.assets_dir) print("Download complete.") def initialize(self): """Initialize the TTS model""" if self._initialized: return print("Initializing Supertonic TTS (CPU mode)...") self._ensure_models_downloaded() # Import helper functions (lazy import to avoid loading if not needed) try: from synthesis.supertonic_helper import load_text_to_speech onnx_dir = os.path.join(self.assets_dir, "onnx") self.tts = load_text_to_speech(onnx_dir, use_gpu=self.use_gpu) self._initialized = True print(f"✓ Supertonic TTS ready (CPU mode)") except ImportError as e: raise ImportError( f"Failed to import Supertonic helper functions: {e}\n" "Make sure required dependencies are installed (onnxruntime, soundfile)." ) def get_available_voices(self) -> list[str]: """Get list of available voice styles""" return list(SUPERTONIC_VOICES.keys()) def get_voice_id(self, voice_name: str) -> str: """Convert voice display name to voice ID""" return SUPERTONIC_VOICES.get(voice_name, DEFAULT_HOST_VOICE) def get_voice_path(self, voice_id: str) -> str: """Get the full path to a voice style file""" return os.path.join(self.assets_dir, "voice_styles", f"{voice_id}.json") @property def sample_rate(self) -> int: """Get the sample rate of the TTS model""" if not self._initialized: self.initialize() return self.tts.sample_rate def synthesize_chunk( self, text: str, voice_id: str, speed: float = 1.0, steps: int = 5, silence_duration: float = 0.3, max_len: int = 300 ) -> Iterator[np.ndarray]: """ Synthesize speech from text (streaming) Args: text: Input text to synthesize voice_id: Voice ID (M1, M2, F1, F2) speed: Speech speed multiplier (0.5-2.0) steps: Number of diffusion steps (1-50, lower=faster, higher=better quality) silence_duration: Duration of silence between chunks max_len: Maximum length of each chunk Yields: Audio chunks as numpy arrays (float32, [-1, 1]) """ if not self._initialized: self.initialize() # Import helper function from synthesis.supertonic_helper import load_voice_style voice_path = self.get_voice_path(voice_id) if not os.path.exists(voice_path): raise ValueError(f"Voice style '{voice_id}' not found at {voice_path}") style = load_voice_style([voice_path]) yield from self.tts.stream(text, style, steps, speed, silence_duration, max_len) @staticmethod def audio_to_int16(audio_np: np.ndarray) -> np.ndarray: """Convert float32 audio to int16""" audio_clipped = np.clip(audio_np, -1.0, 1.0) return (audio_clipped * 32767.0).astype(np.int16) @staticmethod def audio_to_wav_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes: """Convert int16 audio to WAV bytes""" buffer = io.BytesIO() with wave.open(buffer, "wb") as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(sample_rate) wf.writeframes(audio_int16.tobytes()) return buffer.getvalue() def synthesize_to_audio_segment( self, text: str, voice_id: str, speed: float = 1.0, steps: int = 5, ): """ Synthesize speech and return as AudioSegment Args: text: Input text to synthesize voice_id: Voice ID (M1, M2, F1, F2) speed: Speech speed multiplier (0.5-2.0) steps: Number of diffusion steps (1-50) Returns: AudioSegment object """ from pydub import AudioSegment from io import BytesIO # Collect all chunks chunks = [] for audio_chunk in self.synthesize_chunk( text=text, voice_id=voice_id, speed=speed, steps=steps, silence_duration=0.3, max_len=300 ): chunks.append(audio_chunk) # Concatenate all chunks full_audio = np.concatenate(chunks) if chunks else np.array([], dtype=np.float32) # Convert to int16 and then to WAV bytes audio_int16 = self.audio_to_int16(full_audio) wav_bytes = self.audio_to_wav_bytes(audio_int16, self.sample_rate) # Convert to AudioSegment return AudioSegment.from_wav(BytesIO(wav_bytes)) # Global instance _supertonic_instance: Optional[SupertonicWrapper] = None def get_supertonic_engine() -> SupertonicWrapper: """ Get or create Supertonic TTS engine instance Returns: SupertonicWrapper instance """ global _supertonic_instance if _supertonic_instance is None: _supertonic_instance = SupertonicWrapper() return _supertonic_instance