Phoneme_Detection_Leaderboard

Running

App Files Files Community

lataon committed on Oct 3

Commit

45089ef

1 Parent(s): 6c9fd42

update: new ds, modes

Browse files

Files changed (14) hide show

constants.py +40 -0
eval-results/results_1759378937_HuBERT-Base.json +0 -17
eval-results/results_1759378937_HuBERT-fine-tuned.json +0 -17
eval-results/results_1759378937_Timit.json +0 -17
eval-results/results_1759378937_Whisper.json +0 -17
eval-results/results_1759479712_HuBERT-Base.json +29 -0
eval-results/results_1759479712_HuBERT-fine-tuned.json +29 -0
eval-results/results_1759479712_LJSpeech-Gruut.json +29 -0
eval-results/results_1759479712_Timit.json +29 -0
eval-results/results_1759479712_WavLM.json +29 -0
eval-results/results_1759479712_Whisper.json +29 -0
phoneme_eval.py +147 -39
test_basic.py +0 -115
utils/load_model.py +58 -3

constants.py CHANGED Viewed

@@ -85,3 +85,43 @@ LEADERBOARD_CSS = """
     background-color: var(--table-row-focus);
 }
 """

     background-color: var(--table-row-focus);
 }
 """
+DATASETS = [
+    {
+        "name": "mirfan899/phoneme_asr",
+        "split": "train",
+        "field": "phonetic",
+        "max_samples": 500,  # Limit to 500 samples
+        "use_streaming": False
+    },
+    {
+        "name": "mirfan899/kids_phoneme_md",
+        "split": "train",
+        "field": "phonetic",
+        "max_samples": 500,
+        "use_streaming": False
+    },
+    {
+        "name": "kylelovesllms/timit_asr_ipa",
+        "split": "train",
+        "field": "text",
+        "max_samples": 500,  # Same 500-sample cap as the datasets above
+        "use_streaming": False
+    },
+    {
+        "name": "openslr/librispeech_asr",
+        "split": "test.clean",  # Use full split with streaming
+        "field": "text",
+        "max_samples": 500,  # Larger dataset, but still capped at 500 samples
+        "use_streaming": True  # Use streaming for better runtime
+    },
+    {
+        "name": "leduckhai/MultiMed",
+        "split": "test",
+        "field": "text",
+        "max_samples": 1500,
+        "config": "English",  # Fixed: add config name for English
+        "use_streaming": True  # Use streaming for better runtime
+    }
+]

eval-results/results_1759378937_HuBERT-Base.json DELETED Viewed

@@ -1,17 +0,0 @@
-{
-  "config": {
-    "model_name": "HuBERT-Base",
-    "model_dtype": "float32",
-    "model_sha": ""
-  },
-  "results": {
-    "phoneme_asr": {
-      "per": 79.85359813133437,
-      "avg_duration": 0.7736877918243408
-    },
-    "kids_phoneme_md": {
-      "per": 71.85295670319688,
-      "avg_duration": 1.47061448097229
-    }
-  }
-}

eval-results/results_1759378937_HuBERT-fine-tuned.json DELETED Viewed

@@ -1,17 +0,0 @@
-{
-  "config": {
-    "model_name": "HuBERT-fine-tuned",
-    "model_dtype": "float32",
-    "model_sha": ""
-  },
-  "results": {
-    "phoneme_asr": {
-      "per": 2.774112645808511,
-      "avg_duration": 0.7994948387145996
-    },
-    "kids_phoneme_md": {
-      "per": 12.210125572986708,
-      "avg_duration": 1.439890170097351
-    }
-  }
-}

eval-results/results_1759378937_Timit.json DELETED Viewed

@@ -1,17 +0,0 @@
-{
-  "config": {
-    "model_name": "Timit",
-    "model_dtype": "float32",
-    "model_sha": ""
-  },
-  "results": {
-    "phoneme_asr": {
-      "per": 36.477283094931195,
-      "avg_duration": 0.8033712863922119
-    },
-    "kids_phoneme_md": {
-      "per": 40.59831492610759,
-      "avg_duration": 1.455029034614563
-    }
-  }
-}

eval-results/results_1759378937_Whisper.json DELETED Viewed

@@ -1,17 +0,0 @@
-{
-  "config": {
-    "model_name": "Whisper",
-    "model_dtype": "float32",
-    "model_sha": ""
-  },
-  "results": {
-    "phoneme_asr": {
-      "per": 80.66478307042628,
-      "avg_duration": 1.2233323097229003
-    },
-    "kids_phoneme_md": {
-      "per": 72.25186973830769,
-      "avg_duration": 1.3742226600646972
-    }
-  }
-}

eval-results/results_1759479712_HuBERT-Base.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "config": {
+    "model_name": "HuBERT-Base",
+    "model_dtype": "float32",
+    "model_sha": ""
+  },
+  "results": {
+    "phoneme_asr": {
+      "per": 80.73712068409569,
+      "avg_duration": 1.006052589416504
+    },
+    "kids_phoneme_md": {
+      "per": 74.8274712307235,
+      "avg_duration": 1.4053531885147095
+    },
+    "timit_asr_ipa": {
+      "per": 79.21011611385504,
+      "avg_duration": 0.8184992551803589
+    },
+    "librispeech_asr": {
+      "per": 81.8414587948362,
+      "avg_duration": 2.6552599668502808
+    },
+    "MultiMed": {
+      "per": 86.31836686921642,
+      "avg_duration": 2.520846700668335
+    }
+  }
+}

eval-results/results_1759479712_HuBERT-fine-tuned.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "config": {
+    "model_name": "HuBERT-fine-tuned",
+    "model_dtype": "float32",
+    "model_sha": ""
+  },
+  "results": {
+    "phoneme_asr": {
+      "per": 3.1765040500162365,
+      "avg_duration": 1.0928319931030273
+    },
+    "kids_phoneme_md": {
+      "per": 13.847118841760139,
+      "avg_duration": 1.43447744846344
+    },
+    "timit_asr_ipa": {
+      "per": 3.5624700539646397,
+      "avg_duration": 0.8138290405273437
+    },
+    "librispeech_asr": {
+      "per": 2.1361935038679745,
+      "avg_duration": 2.591994023323059
+    },
+    "MultiMed": {
+      "per": 12.195454796657222,
+      "avg_duration": 2.4015810966491697
+    }
+  }
+}

eval-results/results_1759479712_LJSpeech-Gruut.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "config": {
+    "model_name": "LJSpeech-Gruut",
+    "model_dtype": "float32",
+    "model_sha": ""
+  },
+  "results": {
+    "phoneme_asr": {
+      "per": 28.34934978626287,
+      "avg_duration": 0.3894784927368164
+    },
+    "kids_phoneme_md": {
+      "per": 62.007568280756246,
+      "avg_duration": 0.5734055519104004
+    },
+    "timit_asr_ipa": {
+      "per": 24.322912970242964,
+      "avg_duration": 0.3130455732345581
+    },
+    "librispeech_asr": {
+      "per": 21.098893815003613,
+      "avg_duration": 1.034156036376953
+    },
+    "MultiMed": {
+      "per": 37.90138577574676,
+      "avg_duration": 1.0464757680892944
+    }
+  }
+}

eval-results/results_1759479712_Timit.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "config": {
+    "model_name": "Timit",
+    "model_dtype": "float32",
+    "model_sha": ""
+  },
+  "results": {
+    "phoneme_asr": {
+      "per": 32.78310772297904,
+      "avg_duration": 1.0769179582595825
+    },
+    "kids_phoneme_md": {
+      "per": 42.393439204382865,
+      "avg_duration": 1.4808897733688355
+    },
+    "timit_asr_ipa": {
+      "per": 28.852864777541704,
+      "avg_duration": 0.8038362503051758
+    },
+    "librispeech_asr": {
+      "per": 28.88432664616071,
+      "avg_duration": 2.5855883836746214
+    },
+    "MultiMed": {
+      "per": 42.29417929178023,
+      "avg_duration": 2.4689067125320436
+    }
+  }
+}

eval-results/results_1759479712_WavLM.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "config": {
+    "model_name": "WavLM",
+    "model_dtype": "float32",
+    "model_sha": ""
+  },
+  "results": {
+    "phoneme_asr": {
+      "per": 25.04219454527341,
+      "avg_duration": 1.054517960548401
+    },
+    "kids_phoneme_md": {
+      "per": 63.40875812391994,
+      "avg_duration": 1.476344680786133
+    },
+    "timit_asr_ipa": {
+      "per": 22.821457511149568,
+      "avg_duration": 0.7534051895141601
+    },
+    "librispeech_asr": {
+      "per": 36.13438162282092,
+      "avg_duration": 2.5621693611145018
+    },
+    "MultiMed": {
+      "per": 57.01443813462704,
+      "avg_duration": 2.337135744094849
+    }
+  }
+}

eval-results/results_1759479712_Whisper.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "config": {
+    "model_name": "Whisper",
+    "model_dtype": "float32",
+    "model_sha": ""
+  },
+  "results": {
+    "phoneme_asr": {
+      "per": 83.44842270480702,
+      "avg_duration": 1.5802977561950684
+    },
+    "kids_phoneme_md": {
+      "per": 73.97112058868787,
+      "avg_duration": 1.4796640157699585
+    },
+    "timit_asr_ipa": {
+      "per": 78.25013458573484,
+      "avg_duration": 1.2946593046188355
+    },
+    "librispeech_asr": {
+      "per": 82.02327697665437,
+      "avg_duration": 1.9603740453720093
+    },
+    "MultiMed": {
+      "per": 77.10185035170976,
+      "avg_duration": 1.68308687210083
+    }
+  }
+}

phoneme_eval.py CHANGED Viewed

@@ -1,7 +1,20 @@
 import pandas as pd
-from utils.load_model import run_hubert_base, run_whisper, run_model, run_timit
 from utils.audio_process import calculate_error_rate, load_audio
-from utils.cmu_process import clean_cmu, cmu_to_ipa
 def set_output(model, pre_pho, ref_pho, duration, per, score):
     return {
@@ -13,14 +26,6 @@ def set_output(model, pre_pho, ref_pho, duration, per, score):
         "score": score
     }
-# Map model names to their runner functions
-MODEL_RUNNERS = {
-    "HuBERT-Base": run_hubert_base,
-    "Whisper": run_whisper,
-    "HuBERT fine-tuned": run_model,
-    "Timit": run_timit
-}
 def get_output(model, wav, reference_phoneme):
     """
     Run the given model, compute error rate, and return formatted output.
@@ -50,6 +55,8 @@ def benchmark_all(example):
         get_output("Whisper", wav, reference_phoneme),
         get_output("HuBERT fine-tuned", wav, reference_phoneme),
         get_output("Timit", wav, reference_phoneme),
     ]
     return pd.DataFrame(results)
@@ -75,25 +82,79 @@ def benchmark_dataset(dataset):
     return full_df, avg_stats
-from datasets import load_dataset, Audio
-DATASET_LIST = [
-    "mirfan899/phoneme_asr",
-    "mirfan899/kids_phoneme_md",
-]
 def main():
-    field = "phonetic"
-    # Collect per-model metrics across datasets
     per_model_results = {}
-    for dataset_name in DATASET_LIST:
-        try:
-            dataset = load_dataset(dataset_name, split="train")
-        except Exception as e:
-            print(f"[warn] skip dataset {dataset_name}: {e}")
             continue
         try:
@@ -101,27 +162,74 @@ def main():
         except Exception:
             pass
-        unique_texts = dataset.unique(field)
-        print("Unique phonetic strings (", dataset_name, "):", len(unique_texts))
-        dataset_unique = dataset.filter(lambda x: x[field] in unique_texts)
-        def is_valid(example):
-            phoneme_tokens = example[field].split()
-            return len(phoneme_tokens) >= 10
-        dataset_filtered = dataset_unique.filter(is_valid)
-        dataset_final = dataset_filtered.shuffle(seed=42).select(range(min(100, len(dataset_filtered))))
         print(dataset_final)
         print("Final size:", len(dataset_final))
-        full_results, avg_stats = benchmark_dataset(dataset_final.select(range(min(10, len(dataset_final)))))
-        print("Average Statistic per model (", dataset_name, "):")
         print(avg_stats)
         # Use dataset name as key (extract the actual name part)
-        dataset_key = dataset_name.split("/")[-1]  # Get the last part after the slash
         for _, row in avg_stats.iterrows():
             model_name = str(row["model"]).replace(" ", "-")
             per = float(row["Average PER"]) if row["Average PER"] is not None else None

 import pandas as pd
+from utils.load_model import run_hubert_base, run_whisper, run_model, run_timit, run_wavlm_large_phoneme, run_gruut
 from utils.audio_process import calculate_error_rate, load_audio
+from utils.cmu_process import clean_cmu, cmu_to_ipa, text_to_phoneme
+from constants import DATASETS
+from datasets import load_dataset, Audio
+import argparse
+# Map model names to their runner functions
+MODEL_RUNNERS = {
+    "HuBERT-Base": run_hubert_base,
+    "Whisper": run_whisper,
+    "HuBERT fine-tuned": run_model,
+    "Timit": run_timit,
+    "speech31/wavlm-large-english-phoneme": run_wavlm_large_phoneme,
+    "bookbot/wav2vec2-ljspeech-gruut": run_gruut,
+}
 def set_output(model, pre_pho, ref_pho, duration, per, score):
     return {
         "score": score
     }
 def get_output(model, wav, reference_phoneme):
     """
     Run the given model, compute error rate, and return formatted output.
         get_output("Whisper", wav, reference_phoneme),
         get_output("HuBERT fine-tuned", wav, reference_phoneme),
         get_output("Timit", wav, reference_phoneme),
+        get_output("WavLM", wav, reference_phoneme),
+        get_output("LJSpeech Gruut", wav, reference_phoneme),
     ]
     return pd.DataFrame(results)
     return full_df, avg_stats
+def load_dataset_with_limits(dataset_config, max_samples=None, use_streaming=False):
+    """
+    Load a dataset with optional size limits and streaming.
+    Args:
+        dataset_config: Dictionary containing dataset configuration
+        max_samples: Maximum number of samples to load (None for no limit)
+        use_streaming: Whether to use streaming for large datasets
+    Returns:
+        Dataset object
+    """
+    try:
+        # Prepare load_dataset arguments
+        load_args = {
+            "path": dataset_config["name"],
+            "split": dataset_config["split"]
+        }
+        # Add config if specified
+        if "config" in dataset_config:
+            load_args["name"] = dataset_config["config"]
+        # Add streaming if requested
+        if use_streaming:
+            load_args["streaming"] = True
+            print(f"Loading {dataset_config['name']} with streaming...")
+        else:
+            print(f"Loading {dataset_config['name']}...")
+        dataset = load_dataset(**load_args)
+        # Apply size limits
+        if max_samples is not None:
+            print(f"Limiting dataset to {max_samples} samples...")
+            if use_streaming:
+                dataset = dataset.take(max_samples)
+            else:
+                dataset = dataset.select(range(min(max_samples, len(dataset))))
+        return dataset
+    except Exception as e:
+        print(f"[warn] skip dataset {dataset_config['name']}: {e}")
+        return None
 def main():
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Phoneme Detection Evaluation')
+    parser.add_argument('--max-samples', type=int, default=None,
+                         help='Override max_samples for all datasets')
+    parser.add_argument('--dataset', type=str, default=None,
+                         help='Process only specific dataset (by name)')
+    args = parser.parse_args()
     per_model_results = {}
+    for dataset_config in DATASETS:
+        # Skip dataset if specific dataset is requested and this isn't it
+        if args.dataset and args.dataset not in dataset_config["name"]:
+            continue
+        # Override max_samples if provided via command line
+        max_samples = args.max_samples if args.max_samples is not None else dataset_config.get("max_samples")
+        use_streaming = dataset_config.get("use_streaming", False)
+        # Load dataset with limits
+        dataset = load_dataset_with_limits(
+            dataset_config,
+            max_samples=max_samples,
+            use_streaming=use_streaming
+        )
+        if dataset is None:
             continue
         try:
         except Exception:
             pass
+        field = dataset_config["field"]
+        # Handle streaming datasets differently
+        if use_streaming:
+            print("Processing streaming dataset...")
+            valid_samples = []
+            # Stop collecting after this many valid samples (max_samples, but never fewer than 100)
+            streaming_limit = max(max_samples or 100, 100)
+            for example in dataset:
+                # Convert text to phonemes if needed
+                if field == "text":
+                    phonetic_text = text_to_phoneme(example[field])
+                    example = {**example, "phonetic": phonetic_text}
+                    current_field = "phonetic"
+                else:
+                    current_field = field
+                # Check if valid
+                if current_field in example:
+                    phoneme_tokens = example[current_field].split()
+                    if len(phoneme_tokens) >= 10:
+                        valid_samples.append(example)
+                        # Stop when we reach the streaming limit
+                        if len(valid_samples) >= streaming_limit:
+                            break
+            print(f"Found {len(valid_samples)} valid samples")
+            if len(valid_samples) == 0:
+                print("No valid samples found, skipping dataset")
+                continue
+            # Convert to regular dataset for processing
+            from datasets import Dataset
+            dataset_final = Dataset.from_list(valid_samples)
+            field = "phonetic" if field == "text" else field
+        else:
+            # Regular dataset processing
+            if field == "text":
+                dataset = dataset.map(lambda x: {"phonetic": text_to_phoneme(x[field])})
+                field = "phonetic"
+            unique_texts = dataset.unique(field)
+            print("Unique phonetic strings (", dataset_config["name"], "):", len(unique_texts))
+            dataset_unique = dataset.filter(lambda x: x[field] in unique_texts)
+            def is_valid(example):
+                phoneme_tokens = example[field].split()
+                return len(phoneme_tokens) >= 10
+            dataset_filtered = dataset_unique.filter(is_valid)
+            # Use smaller final size for evaluation
+            final_size = min(100, len(dataset_filtered))
+            dataset_final = dataset_filtered.shuffle(seed=42).select(range(final_size))
         print(dataset_final)
         print("Final size:", len(dataset_final))
+        # Limit to 10 samples for benchmarking
+        benchmark_size = min(10, len(dataset_final))
+        full_results, avg_stats = benchmark_dataset(dataset_final.select(range(benchmark_size)))
+        print("Average Statistic per model (", dataset_config["name"], "):")
         print(avg_stats)
         # Use dataset name as key (extract the actual name part)
+        dataset_key = dataset_config["name"].split("/")[-1]  # Get the last part after the slash
         for _, row in avg_stats.iterrows():
             model_name = str(row["model"]).replace(" ", "-")
             per = float(row["Average PER"]) if row["Average PER"] is not None else None

test_basic.py DELETED Viewed

@@ -1,115 +0,0 @@
-#!/usr/bin/env python3
-"""
-Basic test to verify the cleaned up phoneme detection leaderboard functionality.
-"""
-import os
-import sys
-import json
-import tempfile
-import pandas as pd
-# Add current directory to path
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-def test_imports():
-    """Test that all modules can be imported"""
-    try:
-        from constants import BANNER, INTRODUCTION_TEXT
-        from utils_display import PhonemeEvalColumn, make_clickable_model
-        from init import is_model_on_hub
-        print("All imports successful")
-        return True
-    except ImportError as e:
-        print(f"Import error: {e}")
-        return False
-def test_data_loading():
-    """Test that the app can load data from eval-results directory"""
-    try:
-        from app import load_results, EVAL_RESULTS_DIR
-        # Create a temporary test result
-        os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)
-        test_result = {
-            "config": {
-                "model_name": "test/model",
-                "model_dtype": "float32",
-                "model_sha": "test123"
-            },
-            "results": {
-                "phoneme_asr": {"per": 15.5, "avg_duration": 0.1},
-                "kids_phoneme_md": {"per": 18.2, "avg_duration": 0.12}
-            }
-        }
-        test_file = os.path.join(EVAL_RESULTS_DIR, "test_results.json")
-        with open(test_file, "w") as f:
-            json.dump(test_result, f)
-        # Test loading
-        df = load_results(EVAL_RESULTS_DIR)
-        print(f"Data loading successful, found {len(df)} rows")
-        # Clean up
-        os.remove(test_file)
-        return True
-    except Exception as e:
-        print(f"Data loading error: {e}")
-        return False
-def test_utils():
-    """Test utility functions"""
-    try:
-        from utils_display import make_clickable_model, styled_error, styled_message
-        # Test model link generation
-        link = make_clickable_model("facebook/hubert-base")
-        assert "facebook/hubert-base" in link
-        assert "href=" in link
-        # Test styled messages
-        error_msg = styled_error("Test error")
-        assert "red" in error_msg
-        success_msg = styled_message("Test success")
-        assert "green" in success_msg
-        print("Utility functions working")
-        return True
-    except Exception as e:
-        print(f"Utility test error: {e}")
-        return False
-def main():
-    """Run all tests"""
-    print("Testing Phoneme Detection Leaderboard...")
-    tests = [
-        test_imports,
-        test_data_loading,
-        test_utils
-    ]
-    passed = 0
-    total = len(tests)
-    for test in tests:
-        if test():
-            passed += 1
-        print()
-    print(f"Test Results: {passed}/{total} tests passed")
-    if passed == total:
-        print("All tests passed! The cleaned up version is working correctly.")
-        return True
-    else:
-        print("Some tests failed. Please check the errors above.")
-        return False
-if __name__ == "__main__":
-    success = main()
-    sys.exit(0 if success else 1)

utils/load_model.py CHANGED Viewed

@@ -4,11 +4,9 @@ import torch
 import torchaudio
 from transformers import (
     Wav2Vec2Processor, HubertForCTC,
-    WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2ForCTC
 )
 from .cmu_process import text_to_phoneme, cmu_to_ipa, clean_cmu
-from .cmu_process import clean_cmu
-from .cmu_process import cmu_to_ipa
 from dotenv import load_dotenv
@@ -48,6 +46,16 @@ model = HubertForCTC.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKE
 timit_proc = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 timit_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme").to(device).eval()
 # === Inference functions ===
 def run_hubert_base(wav):
@@ -116,3 +124,50 @@ def run_timit(wav):
     phonemes = "".join(phonemes)
     return phonemes.strip(), time.time() - start

 import torchaudio
 from transformers import (
     Wav2Vec2Processor, HubertForCTC,
+    WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2ForCTC, AutoProcessor, AutoModelForCTC
 )
 from .cmu_process import text_to_phoneme, cmu_to_ipa, clean_cmu
 from dotenv import load_dotenv
 timit_proc = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 timit_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme").to(device).eval()
+# 5 bookbot/wav2vec2-ljspeech-gruut
+gruut_processor = AutoProcessor.from_pretrained("bookbot/wav2vec2-ljspeech-gruut")
+gruut_model = AutoModelForCTC.from_pretrained("bookbot/wav2vec2-ljspeech-gruut").to(device).eval()
+# 6 speech31/wavlm-large-english-phoneme
+wavlm_proc = AutoProcessor.from_pretrained("speech31/wavlm-large-english-phoneme")
+wavlm_model = AutoModelForCTC.from_pretrained("speech31/wavlm-large-english-phoneme").to(device).eval()
 # === Inference functions ===
 def run_hubert_base(wav):
     phonemes = "".join(phonemes)
     return phonemes.strip(), time.time() - start
+def run_gruut(wav):
+    start = time.time()
+    # Preprocess waveform → model input
+    inputs = gruut_processor(
+        wav,
+        sampling_rate=16000,
+        return_tensors="pt",
+        padding=True
+    ).to(device)
+    # Forward pass
+    with torch.no_grad():
+        logits = gruut_model(**inputs).logits
+    # Greedy decode → IPA phonemes
+    pred_ids = torch.argmax(logits, dim=-1)
+    phonemes = gruut_processor.batch_decode(pred_ids)[0]
+    phonemes = "".join(phonemes)
+    return phonemes.strip(), time.time() - start
+def run_wavlm_large_phoneme(wav):
+    start = time.time()
+    # Preprocess waveform → model input
+    inputs = wavlm_proc(
+        wav,
+        sampling_rate=16000,
+        return_tensors="pt",
+        padding=True
+    ).to(device)
+    input_values = inputs.input_values
+    attention_mask = inputs.get("attention_mask", None)
+    # Forward pass
+    with torch.no_grad():
+        logits = wavlm_model(input_values, attention_mask=attention_mask).logits
+    # Greedy decode → phoneme tokens
+    pred_ids = torch.argmax(logits, dim=-1)
+    phonemes = wavlm_proc.batch_decode(pred_ids)[0]
+    phonemes = "".join(phonemes)
+    return phonemes.strip(), time.time() - start