Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import librosa | |
| import soundfile as sf | |
| import torch | |
| from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC | |
# Wav2Vec2 speech-to-text: model registry, lazy model loading and the
# transcription entry point used by the Gradio interface.

# Sample rate (Hz) the audio is converted to before it is fed to the model.
# NOTE(review): the checkpoints below are documented as 16 kHz models —
# confirm if a checkpoint is ever added to _MODEL_NAMES.
_TARGET_SAMPLE_RATE = 16000

# Seconds of audio transcribed per forward pass.
_CHUNK_SECONDS = 20

# Language label (as offered by the UI) -> Hugging Face checkpoint name.
_MODEL_NAMES = {
    "English": "facebook/wav2vec2-large-960h-lv60-self",
    "Russian": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    "French": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
}

# (tokenizer, model) pairs that have already been loaded, keyed by checkpoint
# name, so repeated requests for a language do not reload the weights.
# Trade-off: every language used at least once stays resident in memory.
_LOADED_MODELS = {}


def _load_model(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*, loading it once."""
    if model_name not in _LOADED_MODELS:
        tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        _LOADED_MODELS[model_name] = (tokenizer, model)
    return _LOADED_MODELS[model_name]


def asr_transcript(audio_file, language):
    """Transcribe an audio file with the Wav2Vec2 model for *language*.

    The file is streamed in chunks of ``_CHUNK_SECONDS`` seconds so that long
    recordings are never held in memory (or pushed through the model) in one
    piece. Each chunk is resampled to ``_TARGET_SAMPLE_RATE`` when the file
    uses a different rate.

    Args:
        audio_file: Either a path string, or an object whose ``name``
            attribute is the path of the audio file on disk.
        language: One of the keys of ``_MODEL_NAMES``
            ("English", "Russian" or "French").

    Returns:
        The lower-cased transcript. The text of every chunk is followed by a
        single space, so a non-empty result ends with a space.

    Raises:
        ValueError: If *language* is not supported or *audio_file* is None.
    """
    # Validate up front: the UI radio has no default selection, so `language`
    # can legitimately arrive as None.
    model_name = _MODEL_NAMES.get(language)
    if model_name is None:
        supported = ", ".join(_MODEL_NAMES)
        raise ValueError(
            f"Unsupported language {language!r}; choose one of: {supported}."
        )
    if audio_file is None:
        raise ValueError("No audio file was provided.")

    audio_path = audio_file if isinstance(audio_file, str) else audio_file.name
    tokenizer, model = _load_model(model_name)

    # librosa.stream cannot resample, so frames are sized from the file's own
    # rate (one frame == one second) and each block is resampled afterwards.
    source_rate = int(librosa.get_samplerate(audio_path))
    stream = librosa.stream(
        audio_path,
        block_length=_CHUNK_SECONDS,
        frame_length=source_rate,
        hop_length=source_rate,
        mono=True,  # blocks are always 1-D; no manual down-mix needed
    )

    pieces = []
    for speech in stream:
        if source_rate != _TARGET_SAMPLE_RATE:
            speech = librosa.resample(
                speech, orig_sr=source_rate, target_sr=_TARGET_SAMPLE_RATE
            )
        input_values = tokenizer(speech, return_tensors="pt").input_values
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = tokenizer.batch_decode(predicted_ids)[0]
        pieces.append(transcription.lower() + " ")
    return "".join(pieces)
# --- Gradio front end -------------------------------------------------------
# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces appear to be
# deprecated in newer Gradio releases — confirm against the pinned version.

# Audio upload; with type="file" the callback receives a file object and
# reads its path from the `.name` attribute.
audio_input = gr.inputs.Audio(label="Upload Audio File", type="file")

# Language selector; the chosen label is passed to the callback as-is.
language_input = gr.inputs.Radio(
    label="Pick a language",
    choices=["English", "Russian", "French"],
)

# Text box that shows the transcript returned by the callback.
transcript_output = gr.outputs.Textbox(label="Auto-Transcript")

gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=[audio_input, language_input],
    outputs=transcript_output,
    title="Automatic speech recognition with Wav2Vec2",
    description="Upload an audio clip in Russian, English, or French and let AI do the hard work of transcribing",
)

# Start the web server as soon as the script runs.
gradio_ui.launch()