#!/usr/bin/env python
"""Gradio demo that transcribes English speech with a Kyutai speech-to-text model.

The model and processor are loaded once at import time. The UI exposes a
single ``transcribe`` action, wired to a button, to the example gallery and
to the ``transcribe`` API endpoint.
"""

import os
import pathlib

import gradio as gr
import librosa
import spaces
import torch
from transformers import KyutaiSpeechToTextForConditionalGeneration, KyutaiSpeechToTextProcessor

# Prefer the GPU when one is visible; otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "kyutai/stt-2.6b-en-trfs"
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=device, torch_dtype="auto")
processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)

# Audio is resampled to this rate on load; the sample budget below is derived from it.
SAMPLE_RATE = 24000
# Maximum clip length in seconds, overridable through the MAX_DURATION environment variable.
MAX_DURATION = int(os.getenv("MAX_DURATION", "60"))
MAX_SAMPLE_SIZE = SAMPLE_RATE * MAX_DURATION


# NOTE: the docstring below is kept verbatim; the app is launched with
# `mcp_server=True`, so it may be surfaced to clients as the tool description.
@spaces.GPU
def transcribe(audio_path: str) -> str:
    """Transcribe an English audio file to text.

    Args:
        audio_path (str): The path to the audio file. The audio must contain English speech.

    Returns:
        str: The transcription of the English audio file.
    """
    # Nothing uploaded or recorded yet.
    if not audio_path:
        return ""

    # Decode and resample to the rate used to compute MAX_SAMPLE_SIZE.
    data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)

    # A file that decodes to zero samples has nothing to transcribe; treat it
    # like the "no input" case instead of handing an empty array to the model.
    if len(data) == 0:
        return ""

    # Keep only the first MAX_DURATION seconds and tell the user about it.
    if len(data) > MAX_SAMPLE_SIZE:
        data = data[:MAX_SAMPLE_SIZE]
        gr.Info(f"Audio file is too long. Truncating to {MAX_DURATION} seconds.")

    inputs = processor(data)
    # The return value is intentionally unused: the code relies on `.to()`
    # updating the batch in place before it is unpacked into `generate`.
    inputs.to(device)

    output_tokens = model.generate(**inputs)
    output = processor.batch_decode(output_tokens, skip_special_tokens=True)
    # A single clip was submitted, so the first decoded entry is the result.
    return output[0]


with gr.Blocks(fill_height=False) as demo:
    # Header
    gr.HTML("""

🎙️ Kyutai Speech-to-Text

Advanced English Audio Transcription powered by AI

""")

    # Info banner
    gr.HTML(f"""
ℹ️ Upload or record audio in English (max {MAX_DURATION} seconds). Supports WAV, MP3, and other common formats.
""")  # noqa: RUF001

    # Main content
    with gr.Group(elem_classes="main-card"):
        # Audio input
        audio = gr.Audio(
            label="🎵 Audio Input",
            type="filepath",
            sources=["upload", "microphone"],
            elem_classes="audio-container",
        )

        # Transcribe button
        transcribe_btn = gr.Button(
            "✨ Transcribe Audio",
            variant="primary",
            size="lg",
            elem_classes="primary-button",
        )

        # Output
        output = gr.Textbox(
            label="📝 Transcription",
            placeholder="Your transcription will appear here...",
            lines=6,
            max_lines=12,
            elem_classes="transcription-output",
        )

    # Examples section
    with gr.Group(elem_classes="examples-container"):
        gr.Markdown("### 💡 Try These Examples")
        gr.Examples(
            # Sorted for a stable order; empty when the assets directory is absent.
            examples=sorted(pathlib.Path("assets").glob("*.wav")) if pathlib.Path("assets").exists() else [],
            inputs=audio,
            outputs=output,
            fn=transcribe,
            examples_per_page=5,
        )

    # Footer
    gr.HTML(""" """)

    # Event handlers
    transcribe_btn.click(
        fn=transcribe,
        inputs=audio,
        outputs=output,
        api_name="transcribe",
    )


if __name__ == "__main__":
    # Custom theme for modern, clean design
    theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="md",
        radius_size="lg",
    ).set(
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600",
        block_label_text_weight="500",
    )

    demo.launch(theme=theme, css_paths="style.css", mcp_server=True)