"""
Supertonic TTS Integration for PaperCast

CPU-based Text-to-Speech using Supertone/supertonic model.
Provides an interface compatible with the main TTSEngine.
"""
import os
import io
import wave
import numpy as np
from typing import Iterator, Optional

from huggingface_hub import snapshot_download


# Display label -> Supertonic voice style ID.
# Insertion order is preserved, so it is also the order in which the
# labels are listed by SupertonicWrapper.get_available_voices().
SUPERTONIC_VOICES: dict[str, str] = dict(
    (
        ("M1 (Male 1)", "M1"),
        ("M2 (Male 2)", "M2"),
        ("F1 (Female 1)", "F1"),
        ("F2 (Female 2)", "F2"),
    )
)

# Voice IDs used when no explicit choice is made:
# a male voice for the Host, a female voice for the Guest.
DEFAULT_HOST_VOICE: str = "M1"
DEFAULT_GUEST_VOICE: str = "F1"


class SupertonicWrapper:
    """Wrapper for Supertonic TTS to integrate with PaperCast.

    Construction only records configuration. The model itself is loaded
    lazily by ``initialize()``, which is triggered automatically by the
    ``sample_rate`` property and by ``synthesize_chunk()``.
    """

    def __init__(self, assets_dir: Optional[str] = None, use_gpu: bool = False):
        """
        Initialize Supertonic TTS

        Args:
            assets_dir: Path to assets directory. When None, a
                "supertonic_assets" directory located one level above the
                directory containing this file is used.
            use_gpu: Whether to use GPU (default: False, CPU-only)
        """
        self.use_gpu = use_gpu

        # Set assets directory in papercast project
        if assets_dir is None:
            project_root = os.path.dirname(os.path.dirname(__file__))
            self.assets_dir = os.path.join(project_root, "supertonic_assets")
        else:
            self.assets_dir = assets_dir

        # Populated by initialize(); None until the model has been loaded.
        self.tts = None
        self._initialized = False

        print(f"Supertonic assets directory: {self.assets_dir}")

    def _ensure_models_downloaded(self):
        """Download models from HuggingFace if the local assets are missing.

        The check looks for the two sub-directories this class reads from
        ("onnx" and "voice_styles") rather than only the assets root, so an
        interrupted download that left a partially populated root directory
        is retried instead of being mistaken for a complete install.
        """
        required_dirs = (
            os.path.join(self.assets_dir, "onnx"),
            os.path.join(self.assets_dir, "voice_styles"),
        )
        if all(os.path.isdir(path) for path in required_dirs):
            return

        print(f"Downloading Supertonic models to {self.assets_dir}...")
        print("This is a one-time download (~400MB)...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=self.assets_dir)
        print("Download complete.")

    def initialize(self):
        """Load the TTS model, downloading the assets first if needed.

        Calling this more than once is a no-op after the first success.

        Raises:
            ImportError: If the Supertonic helper module (or one of the
                modules it imports) cannot be imported. The original
                exception is attached as ``__cause__``.
        """
        if self._initialized:
            return

        # Report the mode that was actually requested, not always "CPU".
        mode = "GPU" if self.use_gpu else "CPU"
        print(f"Initializing Supertonic TTS ({mode} mode)...")
        self._ensure_models_downloaded()

        # Import helper functions (lazy import to avoid loading if not needed)
        try:
            from synthesis.supertonic_helper import load_text_to_speech

            onnx_dir = os.path.join(self.assets_dir, "onnx")
            self.tts = load_text_to_speech(onnx_dir, use_gpu=self.use_gpu)
        except ImportError as e:
            # Chain the cause so the underlying missing module stays visible
            # in the traceback.
            raise ImportError(
                f"Failed to import Supertonic helper functions: {e}\n"
                "Make sure required dependencies are installed (onnxruntime, soundfile)."
            ) from e

        self._initialized = True
        print(f"✓ Supertonic TTS ready ({mode} mode)")

    def get_available_voices(self) -> list[str]:
        """Get list of available voice display names"""
        return list(SUPERTONIC_VOICES)

    def get_voice_id(self, voice_name: str) -> str:
        """Convert a voice display name to a voice ID.

        Args:
            voice_name: A display name from ``SUPERTONIC_VOICES`` or an
                already-resolved voice ID.

        Returns:
            The matching voice ID. A value that is already a known voice ID
            is returned unchanged; any other unknown value falls back to
            ``DEFAULT_HOST_VOICE``.
        """
        # Pass raw IDs through so that e.g. "F1" is not silently replaced
        # by the default host voice.
        if voice_name in SUPERTONIC_VOICES.values():
            return voice_name
        return SUPERTONIC_VOICES.get(voice_name, DEFAULT_HOST_VOICE)

    def get_voice_path(self, voice_id: str) -> str:
        """Get the full path to a voice style file

        The path is ``<assets_dir>/voice_styles/<voice_id>.json``; existence
        of the file is not checked here.
        """
        return os.path.join(self.assets_dir, "voice_styles", f"{voice_id}.json")

    @property
    def sample_rate(self) -> int:
        """Get the sample rate of the TTS model (loads the model if needed)"""
        if not self._initialized:
            self.initialize()
        return self.tts.sample_rate

    def synthesize_chunk(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
        silence_duration: float = 0.3,
        max_len: int = 300
    ) -> Iterator[np.ndarray]:
        """
        Synthesize speech from text (streaming)

        This is a generator: model loading, the voice-file check and the
        synthesis itself only happen once iteration starts, so any error
        is raised on the first ``next()`` rather than at call time.

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50, lower=faster, higher=better quality)
            silence_duration: Duration of silence between chunks
            max_len: Maximum length of each chunk

        Yields:
            Audio chunks as numpy arrays (float32, [-1, 1])

        Raises:
            ValueError: If no voice style file exists for ``voice_id``.
        """
        if not self._initialized:
            self.initialize()

        # Import helper function
        from synthesis.supertonic_helper import load_voice_style

        voice_path = self.get_voice_path(voice_id)
        if not os.path.exists(voice_path):
            raise ValueError(f"Voice style '{voice_id}' not found at {voice_path}")

        style = load_voice_style([voice_path])

        # Arguments are forwarded positionally in the order the stream
        # helper is called with throughout this wrapper.
        yield from self.tts.stream(text, style, steps, speed, silence_duration, max_len)

    @staticmethod
    def audio_to_int16(audio_np: np.ndarray) -> np.ndarray:
        """Convert float audio in [-1, 1] to int16 PCM samples.

        Values outside [-1, 1] are clipped. NaN samples are mapped to 0
        (silence) before scaling, because casting NaN to an integer type is
        undefined and would otherwise produce platform-dependent output.

        Args:
            audio_np: Floating-point audio samples.

        Returns:
            Array of the same shape with dtype int16.
        """
        # nan_to_num keeps the input dtype; infinities become large finite
        # values that the clip below then limits to +/-1.0.
        finite_audio = np.nan_to_num(audio_np, nan=0.0)
        audio_clipped = np.clip(finite_audio, -1.0, 1.0)
        return (audio_clipped * 32767.0).astype(np.int16)

    @staticmethod
    def audio_to_wav_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes:
        """Convert int16 audio to WAV bytes

        Args:
            audio_int16: PCM samples; the raw buffer is written as-is, so
                the array is expected to already have dtype int16.
            sample_rate: Frame rate written into the WAV header.

        Returns:
            A complete mono, 16-bit WAV file as bytes.
        """
        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wf:
            wf.setnchannels(1)  # mono
            wf.setsampwidth(2)  # 2 bytes per sample == 16-bit
            wf.setframerate(sample_rate)
            wf.writeframes(audio_int16.tobytes())
        return buffer.getvalue()

    def synthesize_to_audio_segment(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
        silence_duration: float = 0.3,
        max_len: int = 300,
    ):
        """
        Synthesize speech and return as AudioSegment

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50)
            silence_duration: Duration of silence between chunks
            max_len: Maximum length of each chunk

        Returns:
            AudioSegment object

        Raises:
            ValueError: If no voice style file exists for ``voice_id``.
        """
        from pydub import AudioSegment

        # Drain the streaming generator so the whole utterance can be
        # packed into a single WAV container.
        chunks = list(
            self.synthesize_chunk(
                text=text,
                voice_id=voice_id,
                speed=speed,
                steps=steps,
                silence_duration=silence_duration,
                max_len=max_len,
            )
        )

        # Concatenate all chunks (empty audio when nothing was produced)
        full_audio = np.concatenate(chunks) if chunks else np.array([], dtype=np.float32)

        # Convert to int16 and then to WAV bytes
        audio_int16 = self.audio_to_int16(full_audio)
        wav_bytes = self.audio_to_wav_bytes(audio_int16, self.sample_rate)

        # Convert to AudioSegment
        return AudioSegment.from_wav(io.BytesIO(wav_bytes))


# Process-wide engine, created on first request by get_supertonic_engine().
_supertonic_instance: Optional[SupertonicWrapper] = None


def get_supertonic_engine() -> SupertonicWrapper:
    """
    Return the shared Supertonic TTS engine, creating it on first use.

    The wrapper is constructed with its default arguments and cached in a
    module-level variable, so every later call returns the same object.

    Returns:
        SupertonicWrapper instance
    """
    global _supertonic_instance

    # Fast path: the engine has already been created.
    if _supertonic_instance is not None:
        return _supertonic_instance

    engine = SupertonicWrapper()
    _supertonic_instance = engine
    return engine