lataon committed on
Commit
45089ef
·
1 Parent(s): 6c9fd42

update: new ds, modes

Browse files
constants.py CHANGED
@@ -85,3 +85,43 @@ LEADERBOARD_CSS = """
85
  background-color: var(--table-row-focus);
86
  }
87
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  background-color: var(--table-row-focus);
86
  }
87
  """
88
+
89
+
90
# Benchmark dataset configurations.
# Each entry describes one Hugging Face dataset:
#   name          – HF hub path
#   split         – split to evaluate on
#   field         – column holding the reference transcription
#                   ("phonetic" = already phonemes, "text" = needs G2P conversion)
#   max_samples   – cap on the number of samples loaded
#   use_streaming – stream instead of downloading the full split
#   config        – (optional) dataset configuration name
DATASETS = [
    {
        "name": "mirfan899/phoneme_asr",
        "split": "train",
        "field": "phonetic",
        "max_samples": 500,  # cap at 500 samples
        "use_streaming": False
    },
    {
        "name": "mirfan899/kids_phoneme_md",
        "split": "train",
        "field": "phonetic",
        "max_samples": 500,
        "use_streaming": False
    },
    {
        "name": "kylelovesllms/timit_asr_ipa",
        "split": "train",
        "field": "text",
        "max_samples": 500,
        "use_streaming": False
    },
    {
        "name": "openslr/librispeech_asr",
        "split": "test.clean",  # clean test split
        "field": "text",
        "max_samples": 500,
        "use_streaming": True  # large dataset: stream to avoid a full download
    },
    {
        "name": "leduckhai/MultiMed",
        "split": "test",
        "field": "text",
        "max_samples": 1500,
        "config": "English",  # dataset requires an explicit config name
        "use_streaming": True  # large dataset: stream to avoid a full download
    }
]
eval-results/results_1759378937_HuBERT-Base.json DELETED
@@ -1,17 +0,0 @@
1
- {
2
- "config": {
3
- "model_name": "HuBERT-Base",
4
- "model_dtype": "float32",
5
- "model_sha": ""
6
- },
7
- "results": {
8
- "phoneme_asr": {
9
- "per": 79.85359813133437,
10
- "avg_duration": 0.7736877918243408
11
- },
12
- "kids_phoneme_md": {
13
- "per": 71.85295670319688,
14
- "avg_duration": 1.47061448097229
15
- }
16
- }
17
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval-results/results_1759378937_HuBERT-fine-tuned.json DELETED
@@ -1,17 +0,0 @@
1
- {
2
- "config": {
3
- "model_name": "HuBERT-fine-tuned",
4
- "model_dtype": "float32",
5
- "model_sha": ""
6
- },
7
- "results": {
8
- "phoneme_asr": {
9
- "per": 2.774112645808511,
10
- "avg_duration": 0.7994948387145996
11
- },
12
- "kids_phoneme_md": {
13
- "per": 12.210125572986708,
14
- "avg_duration": 1.439890170097351
15
- }
16
- }
17
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval-results/results_1759378937_Timit.json DELETED
@@ -1,17 +0,0 @@
1
- {
2
- "config": {
3
- "model_name": "Timit",
4
- "model_dtype": "float32",
5
- "model_sha": ""
6
- },
7
- "results": {
8
- "phoneme_asr": {
9
- "per": 36.477283094931195,
10
- "avg_duration": 0.8033712863922119
11
- },
12
- "kids_phoneme_md": {
13
- "per": 40.59831492610759,
14
- "avg_duration": 1.455029034614563
15
- }
16
- }
17
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval-results/results_1759378937_Whisper.json DELETED
@@ -1,17 +0,0 @@
1
- {
2
- "config": {
3
- "model_name": "Whisper",
4
- "model_dtype": "float32",
5
- "model_sha": ""
6
- },
7
- "results": {
8
- "phoneme_asr": {
9
- "per": 80.66478307042628,
10
- "avg_duration": 1.2233323097229003
11
- },
12
- "kids_phoneme_md": {
13
- "per": 72.25186973830769,
14
- "avg_duration": 1.3742226600646972
15
- }
16
- }
17
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval-results/results_1759479712_HuBERT-Base.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_name": "HuBERT-Base",
4
+ "model_dtype": "float32",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "phoneme_asr": {
9
+ "per": 80.73712068409569,
10
+ "avg_duration": 1.006052589416504
11
+ },
12
+ "kids_phoneme_md": {
13
+ "per": 74.8274712307235,
14
+ "avg_duration": 1.4053531885147095
15
+ },
16
+ "timit_asr_ipa": {
17
+ "per": 79.21011611385504,
18
+ "avg_duration": 0.8184992551803589
19
+ },
20
+ "librispeech_asr": {
21
+ "per": 81.8414587948362,
22
+ "avg_duration": 2.6552599668502808
23
+ },
24
+ "MultiMed": {
25
+ "per": 86.31836686921642,
26
+ "avg_duration": 2.520846700668335
27
+ }
28
+ }
29
+ }
eval-results/results_1759479712_HuBERT-fine-tuned.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_name": "HuBERT-fine-tuned",
4
+ "model_dtype": "float32",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "phoneme_asr": {
9
+ "per": 3.1765040500162365,
10
+ "avg_duration": 1.0928319931030273
11
+ },
12
+ "kids_phoneme_md": {
13
+ "per": 13.847118841760139,
14
+ "avg_duration": 1.43447744846344
15
+ },
16
+ "timit_asr_ipa": {
17
+ "per": 3.5624700539646397,
18
+ "avg_duration": 0.8138290405273437
19
+ },
20
+ "librispeech_asr": {
21
+ "per": 2.1361935038679745,
22
+ "avg_duration": 2.591994023323059
23
+ },
24
+ "MultiMed": {
25
+ "per": 12.195454796657222,
26
+ "avg_duration": 2.4015810966491697
27
+ }
28
+ }
29
+ }
eval-results/results_1759479712_LJSpeech-Gruut.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_name": "LJSpeech-Gruut",
4
+ "model_dtype": "float32",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "phoneme_asr": {
9
+ "per": 28.34934978626287,
10
+ "avg_duration": 0.3894784927368164
11
+ },
12
+ "kids_phoneme_md": {
13
+ "per": 62.007568280756246,
14
+ "avg_duration": 0.5734055519104004
15
+ },
16
+ "timit_asr_ipa": {
17
+ "per": 24.322912970242964,
18
+ "avg_duration": 0.3130455732345581
19
+ },
20
+ "librispeech_asr": {
21
+ "per": 21.098893815003613,
22
+ "avg_duration": 1.034156036376953
23
+ },
24
+ "MultiMed": {
25
+ "per": 37.90138577574676,
26
+ "avg_duration": 1.0464757680892944
27
+ }
28
+ }
29
+ }
eval-results/results_1759479712_Timit.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_name": "Timit",
4
+ "model_dtype": "float32",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "phoneme_asr": {
9
+ "per": 32.78310772297904,
10
+ "avg_duration": 1.0769179582595825
11
+ },
12
+ "kids_phoneme_md": {
13
+ "per": 42.393439204382865,
14
+ "avg_duration": 1.4808897733688355
15
+ },
16
+ "timit_asr_ipa": {
17
+ "per": 28.852864777541704,
18
+ "avg_duration": 0.8038362503051758
19
+ },
20
+ "librispeech_asr": {
21
+ "per": 28.88432664616071,
22
+ "avg_duration": 2.5855883836746214
23
+ },
24
+ "MultiMed": {
25
+ "per": 42.29417929178023,
26
+ "avg_duration": 2.4689067125320436
27
+ }
28
+ }
29
+ }
eval-results/results_1759479712_WavLM.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_name": "WavLM",
4
+ "model_dtype": "float32",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "phoneme_asr": {
9
+ "per": 25.04219454527341,
10
+ "avg_duration": 1.054517960548401
11
+ },
12
+ "kids_phoneme_md": {
13
+ "per": 63.40875812391994,
14
+ "avg_duration": 1.476344680786133
15
+ },
16
+ "timit_asr_ipa": {
17
+ "per": 22.821457511149568,
18
+ "avg_duration": 0.7534051895141601
19
+ },
20
+ "librispeech_asr": {
21
+ "per": 36.13438162282092,
22
+ "avg_duration": 2.5621693611145018
23
+ },
24
+ "MultiMed": {
25
+ "per": 57.01443813462704,
26
+ "avg_duration": 2.337135744094849
27
+ }
28
+ }
29
+ }
eval-results/results_1759479712_Whisper.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_name": "Whisper",
4
+ "model_dtype": "float32",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "phoneme_asr": {
9
+ "per": 83.44842270480702,
10
+ "avg_duration": 1.5802977561950684
11
+ },
12
+ "kids_phoneme_md": {
13
+ "per": 73.97112058868787,
14
+ "avg_duration": 1.4796640157699585
15
+ },
16
+ "timit_asr_ipa": {
17
+ "per": 78.25013458573484,
18
+ "avg_duration": 1.2946593046188355
19
+ },
20
+ "librispeech_asr": {
21
+ "per": 82.02327697665437,
22
+ "avg_duration": 1.9603740453720093
23
+ },
24
+ "MultiMed": {
25
+ "per": 77.10185035170976,
26
+ "avg_duration": 1.68308687210083
27
+ }
28
+ }
29
+ }
phoneme_eval.py CHANGED
@@ -1,7 +1,20 @@
1
  import pandas as pd
2
- from utils.load_model import run_hubert_base, run_whisper, run_model, run_timit
3
  from utils.audio_process import calculate_error_rate, load_audio
4
- from utils.cmu_process import clean_cmu, cmu_to_ipa
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def set_output(model, pre_pho, ref_pho, duration, per, score):
7
  return {
@@ -13,14 +26,6 @@ def set_output(model, pre_pho, ref_pho, duration, per, score):
13
  "score": score
14
  }
15
 
16
- # Map model names to their runner functions
17
- MODEL_RUNNERS = {
18
- "HuBERT-Base": run_hubert_base,
19
- "Whisper": run_whisper,
20
- "HuBERT fine-tuned": run_model,
21
- "Timit": run_timit
22
- }
23
-
24
  def get_output(model, wav, reference_phoneme):
25
  """
26
  Run the given model, compute error rate, and return formatted output.
@@ -50,6 +55,8 @@ def benchmark_all(example):
50
  get_output("Whisper", wav, reference_phoneme),
51
  get_output("HuBERT fine-tuned", wav, reference_phoneme),
52
  get_output("Timit", wav, reference_phoneme),
 
 
53
  ]
54
 
55
  return pd.DataFrame(results)
@@ -75,25 +82,79 @@ def benchmark_dataset(dataset):
75
 
76
  return full_df, avg_stats
77
 
78
-
79
- from datasets import load_dataset, Audio
80
-
81
- DATASET_LIST = [
82
- "mirfan899/phoneme_asr",
83
- "mirfan899/kids_phoneme_md",
84
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  def main():
87
- field = "phonetic"
88
-
89
- # Collect per-model metrics across datasets
 
 
 
 
 
90
  per_model_results = {}
91
 
92
- for dataset_name in DATASET_LIST:
93
- try:
94
- dataset = load_dataset(dataset_name, split="train")
95
- except Exception as e:
96
- print(f"[warn] skip dataset {dataset_name}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
97
  continue
98
 
99
  try:
@@ -101,27 +162,74 @@ def main():
101
  except Exception:
102
  pass
103
 
104
- unique_texts = dataset.unique(field)
105
- print("Unique phonetic strings (", dataset_name, "):", len(unique_texts))
106
-
107
- dataset_unique = dataset.filter(lambda x: x[field] in unique_texts)
108
-
109
- def is_valid(example):
110
- phoneme_tokens = example[field].split()
111
- return len(phoneme_tokens) >= 10
112
-
113
- dataset_filtered = dataset_unique.filter(is_valid)
114
- dataset_final = dataset_filtered.shuffle(seed=42).select(range(min(100, len(dataset_filtered))))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  print(dataset_final)
117
  print("Final size:", len(dataset_final))
118
 
119
- full_results, avg_stats = benchmark_dataset(dataset_final.select(range(min(10, len(dataset_final)))))
120
- print("Average Statistic per model (", dataset_name, "):")
 
 
121
  print(avg_stats)
122
 
123
  # Use dataset name as key (extract the actual name part)
124
- dataset_key = dataset_name.split("/")[-1] # Get the last part after the slash
125
  for _, row in avg_stats.iterrows():
126
  model_name = str(row["model"]).replace(" ", "-")
127
  per = float(row["Average PER"]) if row["Average PER"] is not None else None
 
1
  import pandas as pd
2
+ from utils.load_model import run_hubert_base, run_whisper, run_model, run_timit, run_wavlm_large_phoneme, run_gruut
3
  from utils.audio_process import calculate_error_rate, load_audio
4
+ from utils.cmu_process import clean_cmu, cmu_to_ipa, text_to_phoneme
5
+ from constants import DATASETS
6
+ from datasets import load_dataset, Audio
7
+ import argparse
8
+
9
+ # Map model names to their runner functions
10
+ MODEL_RUNNERS = {
11
+ "HuBERT-Base": run_hubert_base,
12
+ "Whisper": run_whisper,
13
+ "HuBERT fine-tuned": run_model,
14
+ "Timit": run_timit,
15
+ "speech31/wavlm-large-english-phoneme": run_wavlm_large_phoneme,
16
+ "bookbot/wav2vec2-ljspeech-gruut": run_gruut,
17
+ }
18
 
19
  def set_output(model, pre_pho, ref_pho, duration, per, score):
20
  return {
 
26
  "score": score
27
  }
28
 
 
 
 
 
 
 
 
 
29
  def get_output(model, wav, reference_phoneme):
30
  """
31
  Run the given model, compute error rate, and return formatted output.
 
55
  get_output("Whisper", wav, reference_phoneme),
56
  get_output("HuBERT fine-tuned", wav, reference_phoneme),
57
  get_output("Timit", wav, reference_phoneme),
58
+ get_output("WavLM", wav, reference_phoneme),
59
+ get_output("LJSpeech Gruut", wav, reference_phoneme),
60
  ]
61
 
62
  return pd.DataFrame(results)
 
82
 
83
  return full_df, avg_stats
84
 
85
def load_dataset_with_limits(dataset_config, max_samples=None, use_streaming=False):
    """
    Load one Hugging Face dataset described by *dataset_config*.

    Args:
        dataset_config: dict with "name", "split" and optionally "config".
        max_samples: cap on the number of samples (None = no cap).
        use_streaming: stream the dataset instead of downloading it.

    Returns:
        The (possibly truncated) dataset, or None if loading failed.
    """
    try:
        # Assemble the keyword arguments for datasets.load_dataset().
        kwargs = {
            "path": dataset_config["name"],
            "split": dataset_config["split"],
        }
        if "config" in dataset_config:
            kwargs["name"] = dataset_config["config"]

        if use_streaming:
            kwargs["streaming"] = True
            print(f"Loading {dataset_config['name']} with streaming...")
        else:
            print(f"Loading {dataset_config['name']}...")

        ds = load_dataset(**kwargs)

        # Enforce the sample cap: streaming datasets support take(),
        # materialized ones are truncated via select().
        if max_samples is not None:
            print(f"Limiting dataset to {max_samples} samples...")
            if use_streaming:
                ds = ds.take(max_samples)
            else:
                ds = ds.select(range(min(max_samples, len(ds))))

        return ds
    except Exception as e:
        print(f"[warn] skip dataset {dataset_config['name']}: {e}")
        return None
129
 
130
  def main():
131
+ # Parse command line arguments
132
+ parser = argparse.ArgumentParser(description='Phoneme Detection Evaluation')
133
+ parser.add_argument('--max-samples', type=int, default=None,
134
+ help='Override max_samples for all datasets')
135
+ parser.add_argument('--dataset', type=str, default=None,
136
+ help='Process only specific dataset (by name)')
137
+ args = parser.parse_args()
138
+
139
  per_model_results = {}
140
 
141
+ for dataset_config in DATASETS:
142
+ # Skip dataset if specific dataset is requested and this isn't it
143
+ if args.dataset and args.dataset not in dataset_config["name"]:
144
+ continue
145
+
146
+ # Override max_samples if provided via command line
147
+ max_samples = args.max_samples if args.max_samples is not None else dataset_config.get("max_samples")
148
+ use_streaming = dataset_config.get("use_streaming", False)
149
+
150
+ # Load dataset with limits
151
+ dataset = load_dataset_with_limits(
152
+ dataset_config,
153
+ max_samples=max_samples,
154
+ use_streaming=use_streaming
155
+ )
156
+
157
+ if dataset is None:
158
  continue
159
 
160
  try:
 
162
  except Exception:
163
  pass
164
 
165
+ field = dataset_config["field"]
166
+
167
+ # Handle streaming datasets differently
168
+ if use_streaming:
169
+ print("Processing streaming dataset...")
170
+ valid_samples = []
171
+
172
+ # Set a reasonable limit for streaming (max 100 samples)
173
+ streaming_limit = max(max_samples or 100, 100)
174
+
175
+ for example in dataset:
176
+ # Convert text to phonemes if needed
177
+ if field == "text":
178
+ phonetic_text = text_to_phoneme(example[field])
179
+ example = {**example, "phonetic": phonetic_text}
180
+ current_field = "phonetic"
181
+ else:
182
+ current_field = field
183
+
184
+ # Check if valid
185
+ if current_field in example:
186
+ phoneme_tokens = example[current_field].split()
187
+ if len(phoneme_tokens) >= 10:
188
+ valid_samples.append(example)
189
+ # Stop when we reach the streaming limit
190
+ if len(valid_samples) >= streaming_limit:
191
+ break
192
+
193
+ print(f"Found {len(valid_samples)} valid samples")
194
+ if len(valid_samples) == 0:
195
+ print("No valid samples found, skipping dataset")
196
+ continue
197
+
198
+ # Convert to regular dataset for processing
199
+ from datasets import Dataset
200
+ dataset_final = Dataset.from_list(valid_samples)
201
+ field = "phonetic" if field == "text" else field
202
+ else:
203
+ # Regular dataset processing
204
+ if field == "text":
205
+ dataset = dataset.map(lambda x: {"phonetic": text_to_phoneme(x[field])})
206
+ field = "phonetic"
207
+
208
+ unique_texts = dataset.unique(field)
209
+ print("Unique phonetic strings (", dataset_config["name"], "):", len(unique_texts))
210
+
211
+ dataset_unique = dataset.filter(lambda x: x[field] in unique_texts)
212
+
213
+ def is_valid(example):
214
+ phoneme_tokens = example[field].split()
215
+ return len(phoneme_tokens) >= 10
216
+
217
+ dataset_filtered = dataset_unique.filter(is_valid)
218
+ # Use smaller final size for evaluation
219
+ final_size = min(100, len(dataset_filtered))
220
+ dataset_final = dataset_filtered.shuffle(seed=42).select(range(final_size))
221
 
222
  print(dataset_final)
223
  print("Final size:", len(dataset_final))
224
 
225
+ # Limit to 10 samples for benchmarking
226
+ benchmark_size = min(10, len(dataset_final))
227
+ full_results, avg_stats = benchmark_dataset(dataset_final.select(range(benchmark_size)))
228
+ print("Average Statistic per model (", dataset_config["name"], "):")
229
  print(avg_stats)
230
 
231
  # Use dataset name as key (extract the actual name part)
232
+ dataset_key = dataset_config["name"].split("/")[-1] # Get the last part after the slash
233
  for _, row in avg_stats.iterrows():
234
  model_name = str(row["model"]).replace(" ", "-")
235
  per = float(row["Average PER"]) if row["Average PER"] is not None else None
test_basic.py DELETED
@@ -1,115 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Basic test to verify the cleaned up phoneme detection leaderboard functionality.
4
- """
5
-
6
- import os
7
- import sys
8
- import json
9
- import tempfile
10
- import pandas as pd
11
-
12
- # Add current directory to path
13
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
14
-
15
- def test_imports():
16
- """Test that all modules can be imported"""
17
- try:
18
- from constants import BANNER, INTRODUCTION_TEXT
19
- from utils_display import PhonemeEvalColumn, make_clickable_model
20
- from init import is_model_on_hub
21
- print("All imports successful")
22
- return True
23
- except ImportError as e:
24
- print(f"Import error: {e}")
25
- return False
26
-
27
- def test_data_loading():
28
- """Test that the app can load data from eval-results directory"""
29
- try:
30
- from app import load_results, EVAL_RESULTS_DIR
31
-
32
- # Create a temporary test result
33
- os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)
34
- test_result = {
35
- "config": {
36
- "model_name": "test/model",
37
- "model_dtype": "float32",
38
- "model_sha": "test123"
39
- },
40
- "results": {
41
- "phoneme_asr": {"per": 15.5, "avg_duration": 0.1},
42
- "kids_phoneme_md": {"per": 18.2, "avg_duration": 0.12}
43
- }
44
- }
45
-
46
- test_file = os.path.join(EVAL_RESULTS_DIR, "test_results.json")
47
- with open(test_file, "w") as f:
48
- json.dump(test_result, f)
49
-
50
- # Test loading
51
- df = load_results(EVAL_RESULTS_DIR)
52
- print(f"Data loading successful, found {len(df)} rows")
53
-
54
- # Clean up
55
- os.remove(test_file)
56
- return True
57
-
58
- except Exception as e:
59
- print(f"Data loading error: {e}")
60
- return False
61
-
62
- def test_utils():
63
- """Test utility functions"""
64
- try:
65
- from utils_display import make_clickable_model, styled_error, styled_message
66
-
67
- # Test model link generation
68
- link = make_clickable_model("facebook/hubert-base")
69
- assert "facebook/hubert-base" in link
70
- assert "href=" in link
71
-
72
- # Test styled messages
73
- error_msg = styled_error("Test error")
74
- assert "red" in error_msg
75
-
76
- success_msg = styled_message("Test success")
77
- assert "green" in success_msg
78
-
79
- print("Utility functions working")
80
- return True
81
-
82
- except Exception as e:
83
- print(f"Utility test error: {e}")
84
- return False
85
-
86
- def main():
87
- """Run all tests"""
88
- print("Testing Phoneme Detection Leaderboard...")
89
-
90
- tests = [
91
- test_imports,
92
- test_data_loading,
93
- test_utils
94
- ]
95
-
96
- passed = 0
97
- total = len(tests)
98
-
99
- for test in tests:
100
- if test():
101
- passed += 1
102
- print()
103
-
104
- print(f"Test Results: {passed}/{total} tests passed")
105
-
106
- if passed == total:
107
- print("All tests passed! The cleaned up version is working correctly.")
108
- return True
109
- else:
110
- print("Some tests failed. Please check the errors above.")
111
- return False
112
-
113
- if __name__ == "__main__":
114
- success = main()
115
- sys.exit(0 if success else 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/load_model.py CHANGED
@@ -4,11 +4,9 @@ import torch
4
  import torchaudio
5
  from transformers import (
6
  Wav2Vec2Processor, HubertForCTC,
7
- WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2ForCTC
8
  )
9
  from .cmu_process import text_to_phoneme, cmu_to_ipa, clean_cmu
10
- from .cmu_process import clean_cmu
11
- from .cmu_process import cmu_to_ipa
12
 
13
  from dotenv import load_dotenv
14
 
@@ -48,6 +46,16 @@ model = HubertForCTC.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKE
48
  timit_proc = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
49
  timit_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme").to(device).eval()
50
 
 
 
 
 
 
 
 
 
 
 
51
  # === Inference functions ===
52
 
53
  def run_hubert_base(wav):
@@ -116,3 +124,50 @@ def run_timit(wav):
116
  phonemes = "".join(phonemes)
117
 
118
  return phonemes.strip(), time.time() - start
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import torchaudio
5
  from transformers import (
6
  Wav2Vec2Processor, HubertForCTC,
7
+ WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2ForCTC, AutoProcessor, AutoModelForCTC
8
  )
9
  from .cmu_process import text_to_phoneme, cmu_to_ipa, clean_cmu
 
 
10
 
11
  from dotenv import load_dotenv
12
 
 
46
  timit_proc = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
47
  timit_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme").to(device).eval()
48
 
49
+
50
# 5 bookbot/wav2vec2-ljspeech-gruut — wav2vec2 CTC model emitting IPA phonemes
gruut_processor = AutoProcessor.from_pretrained("bookbot/wav2vec2-ljspeech-gruut")
gruut_model = AutoModelForCTC.from_pretrained("bookbot/wav2vec2-ljspeech-gruut").to(device).eval()

# 6 speech31/wavlm-large-english-phoneme — WavLM-large phoneme CTC model
# (previous comment said "microsoft/...", which did not match the repo actually loaded)
wavlm_proc = AutoProcessor.from_pretrained("speech31/wavlm-large-english-phoneme")
wavlm_model = AutoModelForCTC.from_pretrained("speech31/wavlm-large-english-phoneme").to(device).eval()
57
+
58
+
59
  # === Inference functions ===
60
 
61
  def run_hubert_base(wav):
 
124
  phonemes = "".join(phonemes)
125
 
126
  return phonemes.strip(), time.time() - start
127
+
128
+
129
def run_gruut(wav):
    """
    Transcribe *wav* to IPA phonemes with bookbot/wav2vec2-ljspeech-gruut.

    Args:
        wav: raw audio samples; assumed mono 16 kHz — TODO confirm against callers.

    Returns:
        (phoneme_string, elapsed_seconds)
    """
    start = time.time()

    # Preprocess waveform → model input tensors on the active device
    inputs = gruut_processor(
        wav,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    ).to(device)

    # Forward pass (inference only, no gradients)
    with torch.no_grad():
        logits = gruut_model(**inputs).logits

    # Greedy decode → IPA phonemes
    pred_ids = torch.argmax(logits, dim=-1)
    phonemes = gruut_processor.batch_decode(pred_ids)[0]
    # batch_decode()[0] is already a str, so the former `"".join(phonemes)`
    # was a character-wise no-op and has been removed.

    return phonemes.strip(), time.time() - start
150
+
151
def run_wavlm_large_phoneme(wav):
    """
    Transcribe *wav* to phonemes with speech31/wavlm-large-english-phoneme.

    Args:
        wav: raw audio samples; assumed mono 16 kHz — TODO confirm against callers.

    Returns:
        (phoneme_string, elapsed_seconds)
    """
    start = time.time()

    # Preprocess waveform → model input tensors on the active device
    inputs = wavlm_proc(
        wav,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    ).to(device)

    input_values = inputs.input_values
    # Some processors omit the attention mask; pass None in that case.
    attention_mask = inputs.get("attention_mask", None)

    # Forward pass (inference only, no gradients)
    with torch.no_grad():
        logits = wavlm_model(input_values, attention_mask=attention_mask).logits

    # Greedy decode → phoneme tokens
    pred_ids = torch.argmax(logits, dim=-1)
    phonemes = wavlm_proc.batch_decode(pred_ids)[0]
    # batch_decode()[0] is already a str, so the former `"".join(phonemes)`
    # was a character-wise no-op and has been removed.
    return phonemes.strip(), time.time() - start