Spaces:
Running
Running
| import gradio as gr | |
| import pandas as pd | |
| from apscheduler.schedulers.background import BackgroundScheduler | |
| from huggingface_hub import snapshot_download | |
| import time | |
| import functools | |
| import gc | |
| import os | |
| from src.about import ( | |
| CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, | |
| LLM_BENCHMARKS_TEXT_1, LLM_BENCHMARKS_TEXT_2, CROSS_EVALUATION_METRICS, | |
| NOTE_GENERATION_METRICS, HEALTHBENCH_METRICS, TITLE, LOGO, FIVE_PILLAR_DIAGRAM | |
| ) | |
| from src.display.css_html_js import custom_css | |
| from src.display.utils import ( | |
| DATASET_BENCHMARK_COLS, OPEN_ENDED_BENCHMARK_COLS, MED_SAFETY_BENCHMARK_COLS, | |
| MEDICAL_SUMMARIZATION_BENCHMARK_COLS, ACI_BENCHMARK_COLS, SOAP_BENCHMARK_COLS, | |
| HEALTHBENCH_BENCHMARK_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, DATASET_COLS, | |
| OPEN_ENDED_COLS, MED_SAFETY_COLS, MEDICAL_SUMMARIZATION_COLS, ACI_COLS, SOAP_COLS, | |
| HEALTHBENCH_COLS, HEALTHBENCH_HARD_COLS, EVAL_COLS, EVAL_TYPES, NUMERIC_INTERVALS, | |
| TYPES, AutoEvalColumn, ModelType, Precision, WeightType, fields, render_generation_templates, | |
| OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, OpenEndedFrench_COLS, | |
| OpenEndedFrench_BENCHMARK_COLS, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, | |
| OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, OpenEndedGreek_COLS, | |
| OpenEndedGreek_BENCHMARK_COLS, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, | |
| ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, | |
| EHRSQL_ZERO_SHOT_COLS, EHRSQL_ZERO_SHOT_BENCHMARK_COLS, | |
| EHRSQL_FEW_SHOT_COLS, EHRSQL_FEW_SHOT_BENCHMARK_COLS, | |
| MEDCALC_DIRECT_ANSWER_COLS, MEDCALC_DIRECT_ANSWER_BENCHMARK_COLS, | |
| MEDCALC_ONE_SHOT_COT_COLS, MEDCALC_ONE_SHOT_COT_BENCHMARK_COLS, | |
| MEDCALC_ZERO_SHOT_COT_COLS, MEDCALC_ZERO_SHOT_COT_BENCHMARK_COLS, | |
| MEDEC_ZERO_SHOT_COLS, MEDEC_ZERO_SHOT_BENCHMARK_COLS, | |
| MEDEC_ONE_SHOT_COLS, MEDEC_ONE_SHOT_BENCHMARK_COLS, | |
| ) | |
| from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN | |
| from src.populate import get_evaluation_queue_df, get_leaderboard_df | |
| from src.submission.submit import add_new_eval | |
| # ===================================================================================== | |
| # 1. SETUP AND DATA LOADING | |
| # ===================================================================================== | |
def restart_space() -> None:
    """Request a restart of the Space identified by ``REPO_ID``.

    Called when the startup dataset download fails and by the periodic
    scheduler job registered at the bottom of this file.
    """
    API.restart_space(repo_id=REPO_ID)
# Mirror the evaluation-request and evaluation-result dataset repos into their
# local directories before any leaderboard table is built.
print("Downloading evaluation data...")
try:
    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
    print("Downloads complete.")
except Exception as e:
    # Any failure is only printed, then a restart of the Space is requested.
    # NOTE(review): execution falls through to the dataframe loading below
    # after this call — presumably the restart terminates the process before
    # that matters; confirm.
    print(f"An error occurred during download: {e}")
    restart_space()
# Build one leaderboard dataframe per benchmark subset, once, at import time.
# Each get_leaderboard_df call returns a pair; only the second element is kept.
# NOTE(review): the fifth argument ("accuracy" / "score") presumably names the
# metric and the sixth the subset to load — confirm in src.populate.
print("Loading all dataframes into a central dictionary...")
start_time = time.time()
_, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
_, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
_, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
_, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
_, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
_, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
_, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
_, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
_, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
_, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
_, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
_, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
_, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
_, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
_, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
_, ehrsql_zero_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EHRSQL_ZERO_SHOT_COLS, EHRSQL_ZERO_SHOT_BENCHMARK_COLS, "score", "ehrsql_zero_shot")
_, ehrsql_few_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EHRSQL_FEW_SHOT_COLS, EHRSQL_FEW_SHOT_BENCHMARK_COLS, "score", "ehrsql_few_shot")
_, medcalc_direct_answer_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDCALC_DIRECT_ANSWER_COLS, MEDCALC_DIRECT_ANSWER_BENCHMARK_COLS, "score", "medcalc_direct_answer")
_, medcalc_one_shot_cot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDCALC_ONE_SHOT_COT_COLS, MEDCALC_ONE_SHOT_COT_BENCHMARK_COLS, "score", "medcalc_one_shot_cot")
_, medcalc_zero_shot_cot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDCALC_ZERO_SHOT_COT_COLS, MEDCALC_ZERO_SHOT_COT_BENCHMARK_COLS, "score", "medcalc_zero_shot_cot")
_, medec_zero_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDEC_ZERO_SHOT_COLS, MEDEC_ZERO_SHOT_BENCHMARK_COLS, "score", "medec_zero_shot")
_, medec_one_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDEC_ONE_SHOT_COLS, MEDEC_ONE_SHOT_BENCHMARK_COLS, "score", "medec_one_shot")
# Central lookup used by get_filtered_table: subset name -> its full,
# unfiltered dataframe. The keys are the `subset_name` values passed to
# create_leaderboard_ui in the layout section.
ALL_DATASETS = {
    "datasets": harness_datasets_original_df,
    "open_ended": open_ended_original_df,
    "med_safety": med_safety_original_df,
    "medical_summarization": medical_summarization_original_df,
    "aci": aci_original_df,
    "soap": soap_original_df,
    "healthbench": healthbench_original_df,
    "healthbench_hard": healthbench_hard_original_df,
    "open_ended_arabic": open_ended_arabic_df,
    "open_ended_french": open_ended_french_df,
    "open_ended_portuguese": open_ended_portuguese_df,
    "open_ended_romanian": open_ended_romanian_df,
    "open_ended_greek": open_ended_greek_df,
    "open_ended_spanish": open_ended_spanish_df,
    "closed_ended_multilingual": closed_ended_multilingual_df,
    "ehrsql_zero_shot": ehrsql_zero_shot_df,
    "ehrsql_few_shot": ehrsql_few_shot_df,
    "medcalc_direct_answer": medcalc_direct_answer_df,
    "medcalc_one_shot_cot": medcalc_one_shot_cot_df,
    "medcalc_zero_shot_cot": medcalc_zero_shot_cot_df,
    "medec_zero_shot": medec_zero_shot_df,
    "medec_one_shot": medec_one_shot_df,
}
end_time = time.time()
print(f"Dataframes loaded in {end_time - start_time:.2f} seconds.")
# Evaluation Queue DataFrames
# The three frames (finished / running / pending) are displayed on the
# submission tab; they are read once here and not refreshed afterwards.
(finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
| # ===================================================================================== | |
| # 2. EFFICIENT FILTERING LOGIC | |
| # ===================================================================================== | |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose model name contains ``query``.

    Matching is case-insensitive. ``query`` is interpreted as a regular
    expression when it compiles as one (the pre-existing behaviour); text that
    is not a valid pattern, such as an unbalanced ``(``, is matched literally
    instead of raising. Rows whose model name is missing never match.

    Args:
        df: Leaderboard frame containing the ``AutoEvalColumn.model.name`` column.
        query: Text typed into the search box.

    Returns:
        The matching subset of ``df``, in the original row order.
    """
    import re  # local import: this file does not import ``re`` at module level

    # Validate up front so that free-form user input can never surface a
    # regex compilation error from inside pandas.
    try:
        re.compile(query)
        as_regex = True
    except re.error:
        as_regex = False

    names = df[AutoEvalColumn.model.name]
    # na=False keeps the mask strictly boolean even when a name is missing;
    # an NA-containing mask cannot be used to index the frame.
    matches = names.str.contains(query, case=False, regex=as_regex, na=False)
    return df[matches]
def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    """Apply a ``;``-separated list of search terms to ``filtered_df``.

    Each non-blank term is looked up with ``search_table``; the per-term hits
    are concatenated in term order and de-duplicated on the model-name column.

    Note that the input frame is returned unchanged both when ``query`` is
    empty and when none of the terms matches any row.

    Args:
        query: Raw search-box text, e.g. ``"llama; med"``.
        filtered_df: Frame to search.

    Returns:
        The union of the rows matched by the individual terms, or
        ``filtered_df`` itself when there is nothing to narrow to.
    """
    if query == "":
        return filtered_df

    hits = []
    for term in (piece.strip() for piece in query.split(";")):
        if term == "":
            continue
        matched = search_table(filtered_df, term)
        if len(matched) > 0:
            hits.append(matched)

    if len(hits) == 0:
        # No term matched anything: fall back to the unfiltered input.
        return filtered_df

    combined = pd.concat(hits)
    return combined.drop_duplicates(subset=[AutoEvalColumn.model.name])
def filter_models(
    df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
) -> pd.DataFrame:
    """Return the rows of ``df`` that pass every active filter.

    A filter is active when its query argument is not ``None``. Every mask is
    computed on the full ``df`` and the masks are combined positionally, so
    the result does not depend on the frame's index labels being unique.

    Args:
        df: Full leaderboard frame for one subset.
        type_query: Model-type labels; the second space-separated token of
            each label is compared with the model-type column.
        domain_specific_query: Any of the two domain labels offered by the UI;
            they map to ``True`` / ``False`` in the domain-specific column.
        size_query: Keys of ``NUMERIC_INTERVALS``; a row is kept when its
            parameter count lies in at least one selected interval. Values
            that cannot be parsed as numbers never match.
        precision_query: Accepted precision values (``"None"`` is always
            accepted too). Ignored when the frame has no precision column.
        show_deleted: Accepted for interface compatibility; not used.

    Returns:
        ``df`` itself when no filter is active, otherwise the filtered rows
        in their original order.
    """
    masks = []

    if type_query is not None:
        type_names = [t.split(" ")[1] for t in type_query]
        masks.append(df[AutoEvalColumn.model_type.name].isin(type_names))

    if domain_specific_query is not None:
        wanted_flags = []
        if "π₯ Clinical models" in domain_specific_query:
            wanted_flags.append(True)
        if "Generic models" in domain_specific_query:
            wanted_flags.append(False)
        masks.append(df[AutoEvalColumn.is_domain_specific.name].isin(wanted_flags))

    if precision_query is not None and AutoEvalColumn.precision.name in df.columns:
        masks.append(df[AutoEvalColumn.precision.name].isin(precision_query + ["None"]))

    if size_query is not None:
        intervals = pd.IntervalIndex(sorted(NUMERIC_INTERVALS[s] for s in size_query))
        params = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
        masks.append(params.apply(lambda value: any(intervals.contains(value))))

    if not masks:
        return df

    # Combine as plain boolean arrays: purely positional, so no index
    # re-alignment takes place when the frame is finally sliced.
    keep = masks[0].to_numpy(dtype=bool)
    for mask in masks[1:]:
        keep = keep & mask.to_numpy(dtype=bool)
    return df.loc[keep]
def get_filtered_table(
    shown_columns: list,
    query: str,
    domain_specific_query: list,
    size_query: list,
    *,  # subset_name must be passed by keyword
    subset_name: str
):
    """Produce the table to display for one subset given the current UI state.

    The four positional parameters are in the same order as the input
    components wired up in ``create_leaderboard_ui``.

    Args:
        shown_columns: Column names ticked in the column selector; names not
            present in the subset's frame are dropped silently.
        query: Search-box text, passed to ``filter_queries``.
        domain_specific_query: Selected domain labels.
        size_query: Selected model-size buckets.
        subset_name: Key into ``ALL_DATASETS``.

    Returns:
        A frame holding the model-name column followed by the requested
        columns, restricted to the rows that pass the filters and the search.
    """
    source_df = ALL_DATASETS[subset_name]

    # Model-type and precision filtering are disabled here (passed as None).
    narrowed = filter_models(source_df, None, domain_specific_query, size_query, None, False)
    narrowed = filter_queries(query, narrowed)

    visible = [AutoEvalColumn.model.name] + [name for name in shown_columns if name in narrowed.columns]
    result = narrowed[visible]

    # Release the intermediate frame and force a collection before returning.
    del narrowed
    gc.collect()
    return result
| # ===================================================================================== | |
| # 3. REUSABLE UI CREATION FUNCTION | |
| # ===================================================================================== | |
def create_leaderboard_ui(subset_name: str, column_choices: list, default_columns: list):
    """Build the controls and results table for one leaderboard subset.

    Must be called inside an active Gradio layout context. Creates, in order:
    a search box, a column selector, a domain filter, a size filter and a
    read-only table pre-populated with the unfiltered view of the subset.
    Every control re-runs ``get_filtered_table`` for this subset and writes
    the result into the table; the search box does so on submit, the
    checkbox groups on change.

    Args:
        subset_name: Key into ``ALL_DATASETS``; also used to make element ids unique.
        column_choices: Column names offered in the column selector.
        default_columns: Column names ticked initially.

    Returns:
        The ``gr.Dataframe`` component that displays the leaderboard.
    """
    domain_options = ["π₯ Clinical models", "Generic models"]
    size_options = list(NUMERIC_INTERVALS.keys())
    refresh = functools.partial(get_filtered_table, subset_name=subset_name)

    with gr.Row():
        # Left column: search box above the column selector.
        with gr.Column():
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder=f"π Search for models...",
                    show_label=False,
                    elem_id=f"search-bar-{subset_name}",
                )
            with gr.Row():
                shown_columns = gr.CheckboxGroup(
                    choices=column_choices,
                    value=default_columns,
                    label="Select columns to show",
                    elem_id=f"column-select-{subset_name}",
                    interactive=True,
                )
        # Right column: row filters, everything selected initially.
        with gr.Column(min_width=320):
            filter_domain_specific = gr.CheckboxGroup(
                label="Domain Specificity",
                choices=list(domain_options),
                value=list(domain_options),
                interactive=True,
                elem_id=f"filter-domain-{subset_name}",
            )
            filter_columns_size = gr.CheckboxGroup(
                label="Model sizes (in billions of parameters)",
                choices=list(size_options),
                value=list(size_options),
                interactive=True,
                elem_id=f"filter-size-{subset_name}",
            )

    # Initial table content: same state the controls start in.
    initial_df = refresh(
        shown_columns=default_columns,
        query="",
        domain_specific_query=list(domain_options),
        size_query=list(size_options),
    )
    pinned_headers = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
    leaderboard_table = gr.Dataframe(
        value=initial_df,
        headers=pinned_headers + default_columns,
        datatype=TYPES,
        elem_id=f"leaderboard-table-{subset_name}",
        interactive=False,
    )

    # The order of this list must match the positional parameters of
    # get_filtered_table.
    controls = [shown_columns, search_bar, filter_domain_specific, filter_columns_size]
    shown_columns.change(refresh, controls, leaderboard_table)
    search_bar.submit(refresh, controls, leaderboard_table)
    filter_domain_specific.change(refresh, controls, leaderboard_table)
    filter_columns_size.change(refresh, controls, leaderboard_table)
    return leaderboard_table
| # ===================================================================================== | |
| # 4. GRADIO DEMO UI (Main application layout) | |
| # ===================================================================================== | |
def _task_columns(task_flag: str) -> tuple:
    """Return ``(column_choices, default_columns)`` for one benchmark family.

    ``task_flag`` names the boolean attribute on each ``AutoEvalColumn`` field
    that marks it as belonging to the family (for example ``"aci_col"``). A
    field is selectable when it is neither ``hidden`` nor ``never_hidden`` and
    is either ``invariant`` or carries the flag. The defaults are the
    selectable fields that also have ``displayed_by_default`` set.
    """
    column_choices, default_columns = [], []
    for c in fields(AutoEvalColumn):
        if c.hidden or c.never_hidden:
            continue
        if not (c.invariant or getattr(c, task_flag)):
            continue
        column_choices.append(c.name)
        if c.displayed_by_default:
            default_columns.append(c.name)
    return column_choices, default_columns


def _leaderboard_for(subset_name: str, task_flag: str):
    """Create the leaderboard UI for ``subset_name`` using the columns of ``task_flag``."""
    column_choices, default_columns = _task_columns(task_flag)
    return create_leaderboard_ui(
        subset_name=subset_name,
        column_choices=column_choices,
        default_columns=default_columns,
    )


# NOTE(review): the block nesting below was reconstructed from a copy of this
# file whose indentation had been lost — confirm the layout against the
# running app. The leading characters of several labels also look like
# mis-decoded emoji; they are kept as found.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(LOGO)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("π Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
            with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
                # Tab label -> key in ALL_DATASETS; one sub-tab per language.
                LANGUAGES = {
                    "πΊπΈ English": "open_ended", "π¦πͺ Arabic": "open_ended_arabic",
                    "π«π· French": "open_ended_french", "πͺπΈ Spanish": "open_ended_spanish",
                    "π΅πΉ Portuguese": "open_ended_portuguese", "π·π΄ Romanian": "open_ended_romanian",
                    "π¬π· Greek": "open_ended_greek",
                }
                for idx, (label, subset) in enumerate(LANGUAGES.items()):
                    with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
                        judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English." if label == "πΊπΈ English" else "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
                        gr.Markdown(judge_text, elem_classes="markdown-text")
                        # Every language shares the open-ended column set.
                        _leaderboard_for(subset, "open_ended_col")
            with gr.Accordion("π¬ Generation templates", open=False):
                with gr.Accordion("Response generation", open=False):
                    render_generation_templates(task="open_ended", generation_type="response_generation")
                with gr.Accordion("Scoring Rubric", open=False):
                    render_generation_templates(task="open_ended", generation_type="scoring_rubric")
        with gr.TabItem("π Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
            _leaderboard_for("medical_summarization", "medical_summarization_col")
            with gr.Accordion("π¬ Generation templates", open=False):
                with gr.Accordion("Response generation", open=False):
                    render_generation_templates(task="medical_summarization", generation_type="response_generation")
                with gr.Accordion("Question generation", open=False):
                    render_generation_templates(task="ce", generation_type="question_generation")
                with gr.Accordion("Cross Examination", open=False):
                    render_generation_templates(task="ce", generation_type="cross_examination")
        with gr.TabItem("π Note generation", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("ACI Bench", id=0):
                    _leaderboard_for("aci", "aci_col")
                with gr.TabItem("SOAP Notes", id=1):
                    _leaderboard_for("soap", "soap_col")
            # Add accordions for this section if needed, similar to other tabs
        with gr.TabItem("π HealthBench", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("HealthBench", id=0):
                    _leaderboard_for("healthbench", "healthbench_col")
                with gr.TabItem("HealthBench-Hard", id=1):
                    _leaderboard_for("healthbench_hard", "healthbench_hard_col")
        with gr.TabItem("π Med Safety", elem_id="llm-benchmark-tab-table", id=5):
            _leaderboard_for("med_safety", "med_safety_col")
            with gr.Accordion("π¬ Generation templates", open=False):
                with gr.Accordion("Response generation", open=False):
                    render_generation_templates(task="med_safety", generation_type="response_generation")
                with gr.Accordion("Scoring Rubric", open=False):
                    render_generation_templates(task="med_safety", generation_type="scoring_rubric")
        with gr.TabItem("π Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("English", id=0):
                    _leaderboard_for("datasets", "dataset_task_col")
                with gr.TabItem("π Multilingual", id=1):
                    gr.Markdown("π **Dataset Information:** This tab uses the Global MMLU dataset filtering only the subcategory: medical (10.7%)")
                    _leaderboard_for("closed_ended_multilingual", "closed_ended_multilingual_col")
        with gr.TabItem("π EHRSQL", elem_id="llm-benchmark-tab-table", id=7):
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("Zero Shot", id=0):
                    _leaderboard_for("ehrsql_zero_shot", "ehrsql_zero_shot_col")
                with gr.TabItem("Few Shot", id=1):
                    _leaderboard_for("ehrsql_few_shot", "ehrsql_few_shot_col")
        with gr.TabItem("π MedCalc", elem_id="llm-benchmark-tab-table", id=8):
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("Direct Answer", id=0):
                    _leaderboard_for("medcalc_direct_answer", "medcalc_direct_answer_col")
                with gr.TabItem("One Shot CoT", id=1):
                    _leaderboard_for("medcalc_one_shot_cot", "medcalc_one_shot_cot_col")
                with gr.TabItem("Zero Shot CoT", id=2):
                    _leaderboard_for("medcalc_zero_shot_cot", "medcalc_zero_shot_cot_col")
        with gr.TabItem("π MedEC", elem_id="llm-benchmark-tab-table", id=9):
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("Zero Shot", id=0):
                    _leaderboard_for("medec_zero_shot", "medec_zero_shot_col")
                with gr.TabItem("One Shot", id=1):
                    _leaderboard_for("medec_one_shot", "medec_one_shot_col")
        with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=10):
            gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
            gr.HTML(FIVE_PILLAR_DIAGRAM)
            gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
        with gr.TabItem("π Submit here! ", elem_id="llm-benchmark-tab-table", id=11):
            with gr.Column():
                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                # Queue snapshots taken once at startup; the counts in the
                # accordion titles are fixed at that moment.
                with gr.Accordion(f"β Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
                    gr.Dataframe(value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
                with gr.Accordion(f"π Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
                    gr.Dataframe(value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
                with gr.Accordion(f"β³ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
                    gr.Dataframe(value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
            with gr.Row():
                gr.Markdown("# βοΈβ¨ Submit your model here!", elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="auto",
                        interactive=True,
                    )
                    # Weight type and base model are shown but not editable.
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value=WeightType.Original.value.name,
                        interactive=False,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)", interactive=False)
            with gr.Row():
                domain_specific_toggle = gr.Checkbox(
                    label="Domain specific",
                    value=False,
                    info="Is your model medically oriented?",
                )
                chat_template_toggle = gr.Checkbox(
                    label="Use chat template",
                    value=False,
                    info="Is your model a chat model?",
                )
            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            # The input order below is the positional argument order that
            # add_new_eval receives.
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    model_type,
                    domain_specific_toggle,
                    chat_template_toggle,
                    precision,
                    weight_type
                ],
                submission_result,
            )
    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
# Schedule restart_space to run every 86400 seconds (24 hours).
# NOTE(review): presumably this is how the leaderboard picks up new results,
# since the datasets are only downloaded and loaded at startup — confirm.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=86400)
scheduler.start()
# Enable the request queue (default per-event concurrency limit of 40) and
# start serving; files under ./assets/ are allowed to be served by the app.
demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'], share=True , ssr_mode=False)