Fix leaderboard startup and simplify to core functionality
Major fixes:
- Fix fields() function to work with both make_dataclass and @dataclass
- Fix column name mapping (model -> Model, average -> Average, etc.)
- Fix JSON file filtering logic that was skipping result files
- Fix search_columns references to use correct case-sensitive names
- Remove unnecessary metadata columns (precision, license, params, etc.)
- Simplify to core leaderboard: Model name + task scores + average
The app now starts successfully and displays a clean leaderboard
focused on model performance comparison across NER tasks.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- app.py +4 -24
- src/display/utils.py +10 -8
- src/leaderboard/read_evals.py +11 -19
- src/populate.py +2 -2
app.py
CHANGED
@@ -67,36 +67,16 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=["
+        search_columns=["Model"],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )

-    # Build filter columns
+    # Build filter columns - simplified since we removed most metadata columns
     filter_columns = []

-    # Add precision filter only if precision column has data
-    if "precision" in dataframe.columns and not dataframe["precision"].isna().all():
-        filter_columns.append(ColumnFilter("precision", type="checkboxgroup", label="Precision"))
-
-    # Add params filter only if params column has data
-    if "params" in dataframe.columns and not dataframe["params"].isna().all():
-        filter_columns.append(ColumnFilter(
-            "params",
-            type="slider",
-            min=0.01,
-            max=150,
-            label="Select the number of parameters (B)",
-        ))
-
-    # Add still_on_hub filter only if column has data
-    if "still_on_hub" in dataframe.columns and not dataframe["still_on_hub"].isna().all():
-        filter_columns.append(ColumnFilter(
-            "still_on_hub", type="boolean", label="Deleted/incomplete", default=True
-        ))
-
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -105,7 +85,7 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[
+        search_columns=["Model"],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=filter_columns,
         bool_checkboxgroup_label="Hide models",
@@ -173,7 +153,7 @@ with demo:
                 model_name_textbox = gr.Textbox(label="Nom du modèle")
                 revision_name_textbox = gr.Textbox(label="Révision commit", placeholder="main")
                 precision = gr.Dropdown(
-                    choices=[
+                    choices=["float16", "bfloat16"],
                     label="Précision",
                     multiselect=False,
                     value="float16",
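For reference, a minimal sketch of how the simplified `init_leaderboard` reads after this change, assuming the `gradio_leaderboard` API (`Leaderboard`, `SelectColumns`) used by the stock Hugging Face leaderboard template; `default_selection` and `displayed_by_default` are assumptions carried over from that template rather than lines shown in this diff:

```python
# Sketch only - not the exact file contents; assumes the gradio_leaderboard API
# (Leaderboard, SelectColumns) and the helpers from the stock template.
from gradio_leaderboard import Leaderboard, SelectColumns

from src.display.utils import AutoEvalColumn, fields


def init_leaderboard(dataframe):
    # The metadata-based filters (precision, params, still_on_hub) are gone,
    # so the filter list stays empty.
    filter_columns = []
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],  # assumed
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],  # must match the user-facing column name exactly
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=filter_columns,
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
```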
src/display/utils.py
CHANGED
@@ -6,7 +6,15 @@ import pandas as pd
 from src.about import Tasks

 def fields(raw_class):
-
+    if hasattr(raw_class, '__dataclass_fields__'):
+        # For make_dataclass created classes
+        if raw_class.__dataclass_fields__:
+            return [field.type for field in raw_class.__dataclass_fields__.values()]
+        else:
+            # For regular @dataclass with empty __dataclass_fields__, check __dict__
+            return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__" and hasattr(v, 'name')]
+    # Fallback for non-dataclass
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__" and hasattr(v, 'name')]


 # These classes are for user facing column names,
@@ -30,13 +38,7 @@ auto_eval_column_dict.append(("average", ColumnContent("Average", "number", True
 #Scores
 for task in Tasks:
     auto_eval_column_dict.append((task.name, ColumnContent(task.value.col_name, "number", True)))
-# Model information
-auto_eval_column_dict.append(("precision", ColumnContent("Precision", "str", False)))
-auto_eval_column_dict.append(("license", ColumnContent("Hub License", "str", False)))
-auto_eval_column_dict.append(("params", ColumnContent("#Params (B)", "number", False)))
-auto_eval_column_dict.append(("likes", ColumnContent("Hub ❤️", "number", False)))
-auto_eval_column_dict.append(("still_on_hub", ColumnContent("Available on the hub", "bool", False)))
-auto_eval_column_dict.append(("revision", ColumnContent("Model sha", "str", False, False)))
+# Model information - simplified to only essential columns

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
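Why the `make_dataclass` branch works: `auto_eval_column_dict` holds `(attribute_name, ColumnContent(...))` pairs, so `make_dataclass` records each `ColumnContent` instance as the field's type, and `field.type` then returns exactly the objects whose `.name` the rest of the app reads. A minimal self-contained sketch of that behaviour; the `ColumnContent` below is a hypothetical stand-in, not the project's exact definition:

```python
# Sketch only: shows why field.type yields ColumnContent instances when
# make_dataclass is fed (name, ColumnContent(...)) pairs.
from dataclasses import dataclass, make_dataclass


@dataclass
class ColumnContent:  # hypothetical minimal stand-in
    name: str
    type: str
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # Same shape as the patched helper: prefer __dataclass_fields__, where the
    # second tuple element (our ColumnContent instance) was stored as the type.
    if hasattr(raw_class, "__dataclass_fields__") and raw_class.__dataclass_fields__:
        return [f.type for f in raw_class.__dataclass_fields__.values()]
    return [v for k, v in raw_class.__dict__.items()
            if not (k.startswith("__") and k.endswith("__")) and hasattr(v, "name")]


auto_eval_column_dict = [
    ("model", ColumnContent("Model", "markdown", True, never_hidden=True)),
    ("average", ColumnContent("Average", "number", True)),
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print([c.name for c in fields(AutoEvalColumn)])  # ['Model', 'Average']
```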
src/leaderboard/read_evals.py
CHANGED
@@ -129,18 +129,9 @@ class EvalResult:
         average = sum(valid_results) / len(valid_results) if valid_results else 0.0
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            "
-            "
-            "
-            "weight_type": self.weight_type.value.name,
-            "architecture": self.architecture,
-            "model": make_clickable_model(self.full_model),
-            "revision": self.revision,
-            "average": average,
-            "license": self.license,
-            "likes": self.likes,
-            "params": self.num_params,
-            "still_on_hub": self.still_on_hub,
+            "T": self.model_type.value.symbol,
+            "Model": make_clickable_model(self.full_model),
+            "Average": average,
         }

         for task in Tasks:
@@ -176,17 +167,18 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     model_result_filepaths = []

     for root, _, files in os.walk(results_path):
-        # We
-
+        # We need at least one json file in model results
+        json_files = [f for f in files if f.endswith(".json")]
+        if len(json_files) == 0:
             continue

-        # Sort the files by date
+        # Sort the JSON files by date
         try:
-
-        except
-
+            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except Exception:
+            json_files = [json_files[-1]] if json_files else []

-        for file in
+        for file in json_files:
             model_result_filepaths.append(os.path.join(root, file))

     eval_results = {}
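The new sort key assumes result files named `results_<timestamp>.<microseconds>.json` (the stock template's convention); a small sketch with hypothetical filenames shows what it extracts and how the sort orders them:

```python
# Sketch only: behaviour of the sort key on hypothetical result filenames
# (assumes the stock "results_<ISO timestamp>.<microseconds>.json" pattern).
json_files = [
    "results_2024-05-01T10-30-00.123456.json",
    "results_2023-11-02T08-00-00.000001.json",
]


def sort_key(x: str) -> str:
    # Strip the extension and prefix, then drop the 7-char ".micros" tail,
    # leaving a lexicographically sortable timestamp.
    return x.removesuffix(".json").removeprefix("results_")[:-7]


print(sort_key(json_files[0]))  # 2024-05-01T10-30-00

json_files.sort(key=sort_key)   # oldest first, newest last
print(json_files[-1])           # results_2024-05-01T10-30-00.123456.json

# If the sort raises, the patched loop keeps only the last entry:
# json_files = [json_files[-1]] if json_files else []
```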
src/populate.py
CHANGED
@@ -25,12 +25,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # Sort by the first task (EMEA NER) since we don't have an average for NER tasks
     # If no results exist yet, just sort by model name
     first_task = list(Tasks)[0]  # emea_ner
-    task_col_name =
+    task_col_name = first_task.value.col_name  # Use the col_name directly
     if task_col_name in df.columns:
         df = df.sort_values(by=[task_col_name], ascending=False)
     else:
         # Fallback to sorting by model name if no task results yet
-        df = df.sort_values(by=[
+        df = df.sort_values(by=["Model"], ascending=True)

     # Only select columns that exist in the DataFrame
     available_cols = [col for col in cols if col in df.columns]