tathagataraha committed on
Commit
fb84311
·
1 Parent(s): 6e7d4aa

[ADD] Healthbench

Browse files
Files changed (5) hide show
  1. app.py +339 -221
  2. src/about.py +39 -7
  3. src/display/utils.py +21 -17
  4. src/leaderboard/read_evals.py +41 -41
  5. src/populate.py +5 -2
app.py CHANGED
@@ -14,6 +14,7 @@ from src.about import (
14
  LLM_BENCHMARKS_TEXT_2,
15
  CROSS_EVALUATION_METRICS,
16
  NOTE_GENERATION_METRICS,
 
17
  # EVALUATION_EXAMPLE_IMG,
18
  # LLM_BENCHMARKS_TEXT_2,
19
  # ENTITY_DISTRIBUTION_IMG,
@@ -31,14 +32,16 @@ from src.display.utils import (
31
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
32
  ACI_BENCHMARK_COLS,
33
  SOAP_BENCHMARK_COLS,
34
- CLOSED_ENDED_ARABIC_BENCHMARK_COLS,
 
35
  DATASET_COLS,
36
  OPEN_ENDED_COLS,
37
  MED_SAFETY_COLS,
38
  MEDICAL_SUMMARIZATION_COLS,
39
  ACI_COLS,
40
  SOAP_COLS,
41
- CLOSED_ENDED_ARABIC_COLS,
 
42
  EVAL_COLS,
43
  EVAL_TYPES,
44
  NUMERIC_INTERVALS,
@@ -96,9 +99,12 @@ aci_leaderboard_df = aci_original_df.copy()
96
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
97
  soap_leaderboard_df = soap_original_df.copy()
98
 
99
- if PRIVATE_REPO:
100
- _, closed_ended_arabic_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, CLOSED_ENDED_ARABIC_COLS, CLOSED_ENDED_ARABIC_BENCHMARK_COLS, "score", "closed_ended_arabic")
101
- closed_ended_arabic_leaderboard_df = closed_ended_arabic_original_df.copy()
 
 
 
102
 
103
  # breakpoint()
104
  # # Token based results
@@ -136,9 +142,12 @@ def update_df(shown_columns, subset="datasets"):
136
  elif subset == "soap":
137
  leaderboard_table_df = soap_leaderboard_df.copy()
138
  hidden_leader_board_df = soap_original_df
139
- elif PRIVATE_REPO and subset == "closed-ended-arabic":
140
- leaderboard_table_df = closed_ended_arabic_leaderboard_df.copy()
141
- hidden_leader_board_df = closed_ended_arabic_original_df
 
 
 
142
  # else:
143
  # match evaluation_metric:
144
  # case "Span Based":
@@ -380,121 +389,7 @@ with demo:
380
  system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="response_generation")
381
  with gr.Accordion("Scoring Rubric", open=False):
382
  system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="scoring_rubric")
383
- with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
384
- with gr.Row():
385
- with gr.Column():
386
- with gr.Row():
387
- search_bar = gr.Textbox(
388
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
389
- show_label=False,
390
- elem_id="search-bar",
391
- )
392
- with gr.Row():
393
- shown_columns = gr.CheckboxGroup(
394
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
395
- value=[
396
- c.name
397
- for c in fields(AutoEvalColumn)
398
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
399
- ],
400
- label="Select columns to show",
401
- elem_id="column-select",
402
- interactive=True,
403
- )
404
- # with gr.Row():
405
- # deleted_models_visibility = gr.Checkbox(
406
- # value=False, label="Show gated/private/deleted models", interactive=True
407
- # )
408
- with gr.Column(min_width=320):
409
- # with gr.Box(elem_id="box-filter"):
410
- filter_columns_type = gr.CheckboxGroup(
411
- label="Model Types",
412
- choices=[t.to_str() for t in ModelType],
413
- value=[t.to_str() for t in ModelType],
414
- interactive=True,
415
- elem_id="filter-columns-type",
416
- )
417
- # filter_columns_architecture = gr.CheckboxGroup(
418
- # label="Architecture Types",
419
- # choices=[i.value.name for i in ModelArch],
420
- # value=[i.value.name for i in ModelArch],
421
- # interactive=True,
422
- # elem_id="filter-columns-architecture",
423
- # )
424
- filter_domain_specific = gr.CheckboxGroup(
425
- label="Domain Specificity",
426
- choices=["🏥 Clinical models", "Generic models"],
427
- value=["🏥 Clinical models", "Generic models"],
428
- interactive=True,
429
- elem_id="filter-columns-type",
430
- )
431
- filter_columns_size = gr.CheckboxGroup(
432
- label="Model sizes (in billions of parameters)",
433
- choices=list(NUMERIC_INTERVALS.keys()),
434
- value=list(NUMERIC_INTERVALS.keys()),
435
- interactive=True,
436
- elem_id="filter-columns-size",
437
- )
438
-
439
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
440
-
441
- leaderboard_table = gr.components.Dataframe(
442
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
443
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
444
- datatype=TYPES,
445
- elem_id="leaderboard-table",
446
- interactive=False,
447
- visible=True,
448
- )
449
-
450
- # Dummy leaderboard for handling the case when the user uses backspace key
451
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
452
- value=datasets_original_df[MED_SAFETY_COLS],
453
- headers=MED_SAFETY_COLS,
454
- datatype=TYPES,
455
- visible=False,
456
- )
457
-
458
-
459
- search_bar.submit(
460
- update_table,
461
- [
462
- hidden_leaderboard_table_for_search,
463
- shown_columns,
464
- search_bar,
465
- filter_columns_type,
466
- filter_domain_specific,
467
- filter_columns_size
468
- # filter_columns_architecture
469
- ],
470
- leaderboard_table,
471
- )
472
- for selector in [
473
- shown_columns,
474
- filter_columns_type,
475
- filter_domain_specific,
476
- filter_columns_size,
477
- # deleted_models_visibility,
478
- ]:
479
- selector.change(
480
- update_table,
481
- [
482
- hidden_leaderboard_table_for_search,
483
- shown_columns,
484
- search_bar,
485
- filter_columns_type,
486
- filter_domain_specific,
487
- filter_columns_size
488
- ],
489
- leaderboard_table,
490
- queue=True,
491
- )
492
- with gr.Accordion("💬 Generation templates", open=False):
493
- with gr.Accordion("Response generation", open=False):
494
- system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
495
- with gr.Accordion("Scoring Rubric", open=False):
496
- system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
497
- with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
498
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
499
  with gr.Row():
500
  with gr.Column():
@@ -611,7 +506,7 @@ with demo:
611
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
612
  with gr.Accordion("Cross Examination", open=False):
613
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
614
- with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
615
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
616
  with gr.Tabs(elem_classes="tab-buttons2") as tabs:
617
  with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
@@ -840,87 +735,107 @@ with demo:
840
  with gr.Accordion("Question generation", open=False):
841
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
842
  with gr.Accordion("Cross Examination", open=False):
843
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
844
- if PRIVATE_REPO:
845
- with gr.TabItem("Dev Evals", elem_id="llm-benchmark-tab-table", id=100):
846
- with gr.Tabs(elem_classes="tab-buttons2") as tabs:
847
- with gr.TabItem("🏅 Arabic Closed Ended Evaluation", elem_id="llm-benchmark-tab-table100", id=0):
848
- with gr.Row():
849
- with gr.Column():
850
- with gr.Row():
851
- search_bar = gr.Textbox(
852
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
853
- show_label=False,
854
- elem_id="search-bar",
855
- )
856
- with gr.Row():
857
- shown_columns = gr.CheckboxGroup(
858
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)],
859
- value=[
860
- c.name
861
- for c in fields(AutoEvalColumn)
862
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_arabic_col)
863
- ],
864
- label="Select columns to show",
865
- elem_id="column-select",
866
- interactive=True,
867
- )
868
- # with gr.Row():
869
- # deleted_models_visibility = gr.Checkbox(
870
- # value=False, label="Show gated/private/deleted models", interactive=True
871
- # )
872
- with gr.Column(min_width=320):
873
- # with gr.Box(elem_id="box-filter"):
874
- filter_columns_type = gr.CheckboxGroup(
875
- label="Model Types",
876
- choices=[t.to_str() for t in ModelType],
877
- value=[t.to_str() for t in ModelType],
878
- interactive=True,
879
- elem_id="filter-columns-type",
880
- )
881
- # filter_columns_architecture = gr.CheckboxGroup(
882
- # label="Architecture Types",
883
- # choices=[i.value.name for i in ModelArch],
884
- # value=[i.value.name for i in ModelArch],
885
- # interactive=True,
886
- # elem_id="filter-columns-architecture",
887
- # )
888
- filter_domain_specific = gr.CheckboxGroup(
889
- label="Domain Specificity",
890
- choices=["🏥 Clinical models", "Generic models"],
891
- value=["🏥 Clinical models", "Generic models"],
892
- interactive=True,
893
- elem_id="filter-columns-type",
894
  )
895
- filter_columns_size = gr.CheckboxGroup(
896
- label="Model sizes (in billions of parameters)",
897
- choices=list(NUMERIC_INTERVALS.keys()),
898
- value=list(NUMERIC_INTERVALS.keys()),
 
 
 
 
 
 
899
  interactive=True,
900
- elem_id="filter-columns-size",
901
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
 
903
- closed_ended_arabic_leaderboard_df, closed_ended_arabic_original_df = update_df(shown_columns.value, subset="closed-ended-arabic")
904
 
905
- leaderboard_table = gr.components.Dataframe(
906
- value=closed_ended_arabic_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
907
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
908
- datatype=TYPES,
909
- elem_id="leaderboard-table",
910
- interactive=False,
911
- visible=True,
912
- )
913
 
914
- # Dummy leaderboard for handling the case when the user uses backspace key
915
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
916
- value=closed_ended_arabic_original_df[CLOSED_ENDED_ARABIC_COLS],
917
- headers=CLOSED_ENDED_ARABIC_COLS,
918
- datatype=TYPES,
919
- visible=False,
920
- )
921
 
922
-
923
- search_bar.submit(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
924
  update_table,
925
  [
926
  hidden_leaderboard_table_for_search,
@@ -929,33 +844,236 @@ with demo:
929
  filter_columns_type,
930
  filter_domain_specific,
931
  filter_columns_size
932
- # filter_columns_architecture
933
  ],
934
  leaderboard_table,
 
935
  )
936
- for selector in [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
937
  shown_columns,
 
938
  filter_columns_type,
939
  filter_domain_specific,
940
- # filter_columns_architecture,
941
- filter_columns_size,
942
- # deleted_models_visibility,
943
- ]:
944
- selector.change(
945
- update_table,
946
- [
947
- hidden_leaderboard_table_for_search,
948
- shown_columns,
949
- search_bar,
950
- filter_columns_type,
951
- filter_domain_specific,
952
- filter_columns_size
953
- # filter_columns_architecture,
954
- ],
955
- leaderboard_table,
956
- queue=True,
957
- )
958
- with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
  with gr.Row():
960
  with gr.Column():
961
  with gr.Row():
@@ -1067,7 +1185,7 @@ with demo:
1067
  queue=True,
1068
  )
1069
 
1070
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
1071
  gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
1072
  gr.HTML(FIVE_PILLAR_DIAGRAM)
1073
  gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
@@ -1076,7 +1194,7 @@ with demo:
1076
  # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
1077
  # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
1078
 
1079
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=6):
1080
  with gr.Column():
1081
  with gr.Row():
1082
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
14
  LLM_BENCHMARKS_TEXT_2,
15
  CROSS_EVALUATION_METRICS,
16
  NOTE_GENERATION_METRICS,
17
+ HEALTHBENCH_METRICS,
18
  # EVALUATION_EXAMPLE_IMG,
19
  # LLM_BENCHMARKS_TEXT_2,
20
  # ENTITY_DISTRIBUTION_IMG,
 
32
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
33
  ACI_BENCHMARK_COLS,
34
  SOAP_BENCHMARK_COLS,
35
+ HEALTHBENCH_BENCHMARK_COLS,
36
+ HEALTHBENCH_HARD_BENCHMARK_COLS,
37
  DATASET_COLS,
38
  OPEN_ENDED_COLS,
39
  MED_SAFETY_COLS,
40
  MEDICAL_SUMMARIZATION_COLS,
41
  ACI_COLS,
42
  SOAP_COLS,
43
+ HEALTHBENCH_COLS,
44
+ HEALTHBENCH_HARD_COLS,
45
  EVAL_COLS,
46
  EVAL_TYPES,
47
  NUMERIC_INTERVALS,
 
99
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
100
  soap_leaderboard_df = soap_original_df.copy()
101
 
102
+ _, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
103
+ healthbench_leaderboard_df = healthbench_original_df.copy()
104
+
105
+ _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
106
+ healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
107
+
108
 
109
  # breakpoint()
110
  # # Token based results
 
142
  elif subset == "soap":
143
  leaderboard_table_df = soap_leaderboard_df.copy()
144
  hidden_leader_board_df = soap_original_df
145
+ elif subset == "healthbench":
146
+ leaderboard_table_df = healthbench_leaderboard_df.copy()
147
+ hidden_leader_board_df = healthbench_original_df
148
+ elif subset == "healthbench_hard":
149
+ leaderboard_table_df = healthbench_hard_leaderboard_df.copy()
150
+ hidden_leader_board_df = healthbench_hard_original_df
151
  # else:
152
  # match evaluation_metric:
153
  # case "Span Based":
 
389
  system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="response_generation")
390
  with gr.Accordion("Scoring Rubric", open=False):
391
  system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="scoring_rubric")
392
+ with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
394
  with gr.Row():
395
  with gr.Column():
 
506
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
507
  with gr.Accordion("Cross Examination", open=False):
508
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
509
+ with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=3):
510
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
511
  with gr.Tabs(elem_classes="tab-buttons2") as tabs:
512
  with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
 
735
  with gr.Accordion("Question generation", open=False):
736
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
737
  with gr.Accordion("Cross Examination", open=False):
738
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
739
+ with gr.TabItem("🏅 HealthBench", elem_id="llm-benchmark-tab-table", id=4):
740
+ gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
741
+ with gr.Tabs(elem_classes="tab-buttons2") as tabs:
742
+ with gr.TabItem("HealthBench", elem_id="llm-benchmark-tab-table3", id=0):
743
+ with gr.Row():
744
+ with gr.Column():
745
+ with gr.Row():
746
+ search_bar = gr.Textbox(
747
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
748
+ show_label=False,
749
+ elem_id="search-bar",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  )
751
+ with gr.Row():
752
+ shown_columns = gr.CheckboxGroup(
753
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
754
+ value=[
755
+ c.name
756
+ for c in fields(AutoEvalColumn)
757
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)
758
+ ],
759
+ label="Select columns to show",
760
+ elem_id="column-select",
761
  interactive=True,
 
762
  )
763
+ # with gr.Row():
764
+ # deleted_models_visibility = gr.Checkbox(
765
+ # value=False, label="Show gated/private/deleted models", interactive=True
766
+ # )
767
+ with gr.Column(min_width=320):
768
+ # with gr.Box(elem_id="box-filter"):
769
+ filter_columns_type = gr.CheckboxGroup(
770
+ label="Model Types",
771
+ choices=[t.to_str() for t in ModelType],
772
+ value=[t.to_str() for t in ModelType],
773
+ interactive=True,
774
+ elem_id="filter-columns-type",
775
+ )
776
+ # filter_columns_architecture = gr.CheckboxGroup(
777
+ # label="Architecture Types",
778
+ # choices=[i.value.name for i in ModelArch],
779
+ # value=[i.value.name for i in ModelArch],
780
+ # interactive=True,
781
+ # elem_id="filter-columns-architecture",
782
+ # )
783
+ filter_domain_specific = gr.CheckboxGroup(
784
+ label="Domain Specificity",
785
+ choices=["🏥 Clinical models", "Generic models"],
786
+ value=["🏥 Clinical models", "Generic models"],
787
+ interactive=True,
788
+ elem_id="filter-columns-type",
789
+ )
790
+ filter_columns_size = gr.CheckboxGroup(
791
+ label="Model sizes (in billions of parameters)",
792
+ choices=list(NUMERIC_INTERVALS.keys()),
793
+ value=list(NUMERIC_INTERVALS.keys()),
794
+ interactive=True,
795
+ elem_id="filter-columns-size",
796
+ )
797
 
798
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="healthbench")
799
 
800
+ leaderboard_table = gr.components.Dataframe(
801
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
802
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
803
+ datatype=TYPES,
804
+ elem_id="leaderboard-table",
805
+ interactive=False,
806
+ visible=True,
807
+ )
808
 
809
+ # Dummy leaderboard for handling the case when the user uses backspace key
810
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
811
+ value=datasets_original_df[HEALTHBENCH_COLS],
812
+ headers=HEALTHBENCH_COLS,
813
+ datatype=TYPES,
814
+ visible=False,
815
+ )
816
 
817
+
818
+ search_bar.submit(
819
+ update_table,
820
+ [
821
+ hidden_leaderboard_table_for_search,
822
+ shown_columns,
823
+ search_bar,
824
+ filter_columns_type,
825
+ filter_domain_specific,
826
+ filter_columns_size
827
+ # filter_columns_architecture
828
+ ],
829
+ leaderboard_table,
830
+ )
831
+ for selector in [
832
+ shown_columns,
833
+ filter_columns_type,
834
+ filter_domain_specific,
835
+ filter_columns_size,
836
+ # deleted_models_visibility,
837
+ ]:
838
+ selector.change(
839
  update_table,
840
  [
841
  hidden_leaderboard_table_for_search,
 
844
  filter_columns_type,
845
  filter_domain_specific,
846
  filter_columns_size
 
847
  ],
848
  leaderboard_table,
849
+ queue=True,
850
  )
851
+ with gr.TabItem("HealthBench-Hard", elem_id="llm-benchmark-tab-table3", id=1):
852
+ with gr.Row():
853
+ with gr.Column():
854
+ with gr.Row():
855
+ search_bar = gr.Textbox(
856
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
857
+ show_label=False,
858
+ elem_id="search-bar",
859
+ )
860
+ with gr.Row():
861
+ shown_columns = gr.CheckboxGroup(
862
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
863
+ value=[
864
+ c.name
865
+ for c in fields(AutoEvalColumn)
866
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)
867
+ ],
868
+ label="Select columns to show",
869
+ elem_id="column-select",
870
+ interactive=True,
871
+ )
872
+ # with gr.Row():
873
+ # deleted_models_visibility = gr.Checkbox(
874
+ # value=False, label="Show gated/private/deleted models", interactive=True
875
+ # )
876
+ with gr.Column(min_width=320):
877
+ # with gr.Box(elem_id="box-filter"):
878
+ filter_columns_type = gr.CheckboxGroup(
879
+ label="Model Types",
880
+ choices=[t.to_str() for t in ModelType],
881
+ value=[t.to_str() for t in ModelType],
882
+ interactive=True,
883
+ elem_id="filter-columns-type",
884
+ )
885
+ # filter_columns_architecture = gr.CheckboxGroup(
886
+ # label="Architecture Types",
887
+ # choices=[i.value.name for i in ModelArch],
888
+ # value=[i.value.name for i in ModelArch],
889
+ # interactive=True,
890
+ # elem_id="filter-columns-architecture",
891
+ # )
892
+ filter_domain_specific = gr.CheckboxGroup(
893
+ label="Domain Specificity",
894
+ choices=["🏥 Clinical models", "Generic models"],
895
+ value=["🏥 Clinical models", "Generic models"],
896
+ interactive=True,
897
+ elem_id="filter-columns-type",
898
+ )
899
+ filter_columns_size = gr.CheckboxGroup(
900
+ label="Model sizes (in billions of parameters)",
901
+ choices=list(NUMERIC_INTERVALS.keys()),
902
+ value=list(NUMERIC_INTERVALS.keys()),
903
+ interactive=True,
904
+ elem_id="filter-columns-size",
905
+ )
906
+
907
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="healthbench_hard")
908
+
909
+ leaderboard_table = gr.components.Dataframe(
910
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
911
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
912
+ datatype=TYPES,
913
+ elem_id="leaderboard-table",
914
+ interactive=False,
915
+ visible=True,
916
+ )
917
+
918
+ # Dummy leaderboard for handling the case when the user uses backspace key
919
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
920
+ value=datasets_original_df[HEALTHBENCH_HARD_COLS],
921
+ headers=HEALTHBENCH_HARD_COLS,
922
+ datatype=TYPES,
923
+ visible=False,
924
+ )
925
+
926
+
927
+ search_bar.submit(
928
+ update_table,
929
+ [
930
+ hidden_leaderboard_table_for_search,
931
  shown_columns,
932
+ search_bar,
933
  filter_columns_type,
934
  filter_domain_specific,
935
+ filter_columns_size
936
+ # filter_columns_architecture
937
+ ],
938
+ leaderboard_table,
939
+ )
940
+ for selector in [
941
+ shown_columns,
942
+ filter_columns_type,
943
+ filter_domain_specific,
944
+ filter_columns_size,
945
+ # deleted_models_visibility,
946
+ ]:
947
+ selector.change(
948
+ update_table,
949
+ [
950
+ hidden_leaderboard_table_for_search,
951
+ shown_columns,
952
+ search_bar,
953
+ filter_columns_type,
954
+ filter_domain_specific,
955
+ filter_columns_size
956
+ ],
957
+ leaderboard_table,
958
+ queue=True,
959
+ )
960
+
961
+ with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=5):
962
+ with gr.Row():
963
+ with gr.Column():
964
+ with gr.Row():
965
+ search_bar = gr.Textbox(
966
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
967
+ show_label=False,
968
+ elem_id="search-bar",
969
+ )
970
+ with gr.Row():
971
+ shown_columns = gr.CheckboxGroup(
972
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
973
+ value=[
974
+ c.name
975
+ for c in fields(AutoEvalColumn)
976
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
977
+ ],
978
+ label="Select columns to show",
979
+ elem_id="column-select",
980
+ interactive=True,
981
+ )
982
+ # with gr.Row():
983
+ # deleted_models_visibility = gr.Checkbox(
984
+ # value=False, label="Show gated/private/deleted models", interactive=True
985
+ # )
986
+ with gr.Column(min_width=320):
987
+ # with gr.Box(elem_id="box-filter"):
988
+ filter_columns_type = gr.CheckboxGroup(
989
+ label="Model Types",
990
+ choices=[t.to_str() for t in ModelType],
991
+ value=[t.to_str() for t in ModelType],
992
+ interactive=True,
993
+ elem_id="filter-columns-type",
994
+ )
995
+ # filter_columns_architecture = gr.CheckboxGroup(
996
+ # label="Architecture Types",
997
+ # choices=[i.value.name for i in ModelArch],
998
+ # value=[i.value.name for i in ModelArch],
999
+ # interactive=True,
1000
+ # elem_id="filter-columns-architecture",
1001
+ # )
1002
+ filter_domain_specific = gr.CheckboxGroup(
1003
+ label="Domain Specificity",
1004
+ choices=["🏥 Clinical models", "Generic models"],
1005
+ value=["🏥 Clinical models", "Generic models"],
1006
+ interactive=True,
1007
+ elem_id="filter-columns-type",
1008
+ )
1009
+ filter_columns_size = gr.CheckboxGroup(
1010
+ label="Model sizes (in billions of parameters)",
1011
+ choices=list(NUMERIC_INTERVALS.keys()),
1012
+ value=list(NUMERIC_INTERVALS.keys()),
1013
+ interactive=True,
1014
+ elem_id="filter-columns-size",
1015
+ )
1016
+
1017
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
1018
+
1019
+ leaderboard_table = gr.components.Dataframe(
1020
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1021
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1022
+ datatype=TYPES,
1023
+ elem_id="leaderboard-table",
1024
+ interactive=False,
1025
+ visible=True,
1026
+ )
1027
+
1028
+ # Dummy leaderboard for handling the case when the user uses backspace key
1029
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
1030
+ value=datasets_original_df[MED_SAFETY_COLS],
1031
+ headers=MED_SAFETY_COLS,
1032
+ datatype=TYPES,
1033
+ visible=False,
1034
+ )
1035
+
1036
+
1037
+ search_bar.submit(
1038
+ update_table,
1039
+ [
1040
+ hidden_leaderboard_table_for_search,
1041
+ shown_columns,
1042
+ search_bar,
1043
+ filter_columns_type,
1044
+ filter_domain_specific,
1045
+ filter_columns_size
1046
+ # filter_columns_architecture
1047
+ ],
1048
+ leaderboard_table,
1049
+ )
1050
+ for selector in [
1051
+ shown_columns,
1052
+ filter_columns_type,
1053
+ filter_domain_specific,
1054
+ filter_columns_size,
1055
+ # deleted_models_visibility,
1056
+ ]:
1057
+ selector.change(
1058
+ update_table,
1059
+ [
1060
+ hidden_leaderboard_table_for_search,
1061
+ shown_columns,
1062
+ search_bar,
1063
+ filter_columns_type,
1064
+ filter_domain_specific,
1065
+ filter_columns_size
1066
+ ],
1067
+ leaderboard_table,
1068
+ queue=True,
1069
+ )
1070
+ with gr.Accordion("💬 Generation templates", open=False):
1071
+ with gr.Accordion("Response generation", open=False):
1072
+ system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
1073
+ with gr.Accordion("Scoring Rubric", open=False):
1074
+ system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
1075
+
1076
+ with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=6):
1077
  with gr.Row():
1078
  with gr.Column():
1079
  with gr.Row():
 
1185
  queue=True,
1186
  )
1187
 
1188
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=7):
1189
  gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
1190
  gr.HTML(FIVE_PILLAR_DIAGRAM)
1191
  gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
 
1194
  # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
1195
  # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
1196
 
1197
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=8):
1198
  with gr.Column():
1199
  with gr.Row():
1200
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
src/about.py CHANGED
@@ -97,22 +97,50 @@ class SOAPColumns(Enum):
97
  # soap_column3 = SOAPColumn("brief", "score", "Conciseness")
98
 
99
  @dataclass
100
- class ClosedEndedArabicColumn:
101
  benchmark: str
102
  metric: str
103
  col_name: str
104
 
105
- class ClosedEndedArabicColumns(Enum):
106
- arabictask0 = ClosedEndedArabicColumn("MMLU-Arabic", "accuracy", "MMLU-Arabic")
107
- arabictask2 = ClosedEndedArabicColumn("MedMCQA-Arabic", "accuracy", "MedMCQA-Arabic")
108
- arabictask3 = ClosedEndedArabicColumn("MedQA-Arabic", "accuracy", "MedQA-Arabic")
109
- arabictask5 = ClosedEndedArabicColumn("PubMedQA-Arabic", "accuracy", "PubMedQA-Arabic")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  NUM_FEWSHOT = 0 # Change with your few shot
113
  # ---------------------------------------------------
114
 
115
-
116
  # Your leaderboard name
117
  TITLE = """<h1 align="center" id="space-title" style="color: red;"> [DEV Mode] </h1>"""
118
  # LOGO = """<img src="https://equalengineers.com/wp-content/uploads/2024/04/dummy-logo-5b.png" alt="Clinical X HF" width="500" height="333">"""
@@ -240,6 +268,10 @@ CROSS_EVALUATION_METRICS = """
240
  - **Overall Score**: The average of coverage, conformity, consistency, and the harmonic mean of coverage and conciseness (if both are positive, otherwise 0).
241
  """
242
 
 
 
 
 
243
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
244
  CITATION_BUTTON_TEXT = r"""
245
  @misc{kanithi2024mediccomprehensiveframeworkevaluating,
 
97
  # soap_column3 = SOAPColumn("brief", "score", "Conciseness")
98
 
99
  @dataclass
100
+ class HealthbenchColumn:
101
  benchmark: str
102
  metric: str
103
  col_name: str
104
 
105
+ class HealthbenchColumns(Enum):
106
+ healthbench_column0 = HealthbenchColumn("Overall Score", "score", "Overall Score")
107
+ healthbench_column2 = HealthbenchColumn("Responding under uncertainty", "score", "Responding under uncertainty")
108
+ healthbench_column3 = HealthbenchColumn("Health data tasks", "score", "Health data tasks")
109
+ healthbench_column4 = HealthbenchColumn("Global health", "score", "Global health")
110
+ healthbench_column5 = HealthbenchColumn("Expertise-tailored communication", "score", "Expertise-tailored communication")
111
+ healthbench_column6 = HealthbenchColumn("Context seeking", "score", "Context seeking")
112
+ healthbench_column7 = HealthbenchColumn("Emergency referrals", "score", "Emergency referrals")
113
+ healthbench_column8 = HealthbenchColumn("Response depth", "score", "Response depth")
114
+ healthbench_column9 = HealthbenchColumn("Axis: Completeness", "score", "Axis: Completeness")
115
+ healthbench_column10 = HealthbenchColumn("Axis: Context awareness", "score", "Axis: Context awareness")
116
+ healthbench_column11 = HealthbenchColumn("Axis: Accuracy", "score", "Axis: Accuracy")
117
+ healthbench_column12 = HealthbenchColumn("Axis: Instruction following", "score", "Axis: Instruction following")
118
+ healthbench_column13 = HealthbenchColumn("Axis: Communication quality", "score", "Axis: Communication quality")
119
+
120
+ @dataclass
121
+ class HealthbenchHardColumn:
122
+ benchmark: str
123
+ metric: str
124
+ col_name: str
125
 
126
+ class HealthbenchHardColumns(Enum):
127
+ healthbench_hard_column0 = HealthbenchHardColumn("Overall Score", "score", "Overall Score")
128
+ healthbench_hard_column2 = HealthbenchHardColumn("Responding under uncertainty", "score", "Responding under uncertainty")
129
+ healthbench_hard_column3 = HealthbenchHardColumn("Health data tasks", "score", "Health data tasks")
130
+ healthbench_hard_column4 = HealthbenchHardColumn("Global health", "score", "Global health")
131
+ healthbench_hard_column5 = HealthbenchHardColumn("Expertise-tailored communication", "score", "Expertise-tailored communication")
132
+ healthbench_hard_column6 = HealthbenchHardColumn("Context seeking", "score", "Context seeking")
133
+ healthbench_hard_column7 = HealthbenchHardColumn("Emergency referrals", "score", "Emergency referrals")
134
+ healthbench_hard_column8 = HealthbenchHardColumn("Response depth", "score", "Response depth")
135
+ healthbench_hard_column9 = HealthbenchHardColumn("Axis: Completeness", "score", "Axis: Completeness")
136
+ healthbench_hard_column10 = HealthbenchHardColumn("Axis: Context awareness", "score", "Axis: Context awareness")
137
+ healthbench_hard_column11 = HealthbenchHardColumn("Axis: Accuracy", "score", "Axis: Accuracy")
138
+ healthbench_hard_column12 = HealthbenchHardColumn("Axis: Instruction following", "score", "Axis: Instruction following")
139
+ healthbench_hard_column13 = HealthbenchHardColumn("Axis: Communication quality", "score", "Axis: Communication quality")
140
 
141
  NUM_FEWSHOT = 0 # Change with your few shot
142
  # ---------------------------------------------------
143
 
 
144
  # Your leaderboard name
145
  TITLE = """<h1 align="center" id="space-title" style="color: red;"> [DEV Mode] </h1>"""
146
  # LOGO = """<img src="https://equalengineers.com/wp-content/uploads/2024/04/dummy-logo-5b.png" alt="Clinical X HF" width="500" height="333">"""
 
268
  - **Overall Score**: The average of coverage, conformity, consistency, and the harmonic mean of coverage and conciseness (if both are positive, otherwise 0).
269
  """
270
 
271
+ HEALTHBENCH_METRICS = """
272
+ OpenAI HealthBench
273
+ """
274
+
275
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
276
  CITATION_BUTTON_TEXT = r"""
277
  @misc{kanithi2024mediccomprehensiveframeworkevaluating,
src/display/utils.py CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
4
  import pandas as pd
5
 
6
  # changes to be made here
7
- from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
@@ -32,6 +32,8 @@ class ColumnContent:
32
  aci_col: bool = False
33
  soap_col: bool = False
34
  closed_ended_arabic_col: bool = False
 
 
35
 
36
 
37
  ## Leaderboard columns
@@ -59,9 +61,18 @@ for column in ACIColumns:
59
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
60
  for column in SOAPColumns:
61
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
62
- # if PRIVATE_REPO:
63
- for column in ClosedEndedArabicColumns:
64
- auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, closed_ended_arabic_col=True, invariant=False)])
 
 
 
 
 
 
 
 
 
65
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
66
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
67
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -94,9 +105,7 @@ class EvalQueueColumn: # Queue column
94
  med_safety_status = ColumnContent("med_safety_status", "str", True)
95
  medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
96
  note_generation_status = ColumnContent("note_generation_status", "str", True)
97
- if PRIVATE_REPO:
98
- closed_ended_arabic_status = ColumnContent("closed_ended_arabic_status", "str", True)
99
-
100
  ## All the model information that we might need
101
  @dataclass
102
  class ModelDetails:
@@ -221,13 +230,9 @@ MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c
221
  MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
222
  ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
223
  SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
224
- # if PRIVATE_REPO:
225
- CLOSED_ENDED_ARABIC_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_arabic_col or c.invariant)]
226
- # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
227
- # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
228
- # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
229
- # MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
230
- # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.dataset_task_col]
231
 
232
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
233
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -243,9 +248,8 @@ MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
243
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
244
  ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
245
  SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
246
- # if PRIVATE_REPO:
247
- CLOSED_ENDED_ARABIC_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedArabicColumns]
248
- # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
249
 
250
  NUMERIC_INTERVALS = {
251
  "?": pd.Interval(-100, 0, closed="right"),
 
4
  import pandas as pd
5
 
6
  # changes to be made here
7
+ from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
 
32
  aci_col: bool = False
33
  soap_col: bool = False
34
  closed_ended_arabic_col: bool = False
35
+ healthbench_col: bool = False
36
+ healthbench_hard_col: bool = False
37
 
38
 
39
  ## Leaderboard columns
 
61
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
62
  for column in SOAPColumns:
63
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
64
+
65
+ for column in HealthbenchColumns:
66
+ if column.value.col_name.startswith("Axis"):
67
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_col=True, invariant=False)])
68
+ else:
69
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_col=True, invariant=False)])
70
+ for column in HealthbenchHardColumns:
71
+ if column.value.col_name.startswith("Axis"):
72
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_hard_col=True, invariant=False)])
73
+ else:
74
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_hard_col=True, invariant=False)])
75
+
76
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
77
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
78
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 
105
  med_safety_status = ColumnContent("med_safety_status", "str", True)
106
  medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
107
  note_generation_status = ColumnContent("note_generation_status", "str", True)
108
+
 
 
109
  ## All the model information that we might need
110
  @dataclass
111
  class ModelDetails:
 
230
  MEDICAL_SUMMARIZATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medical_summarization_col or c.invariant)]
231
  ACI_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.aci_col or c.invariant)]
232
  SOAP_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.soap_col or c.invariant)]
233
+ HEALTHBENCH_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.healthbench_col or c.invariant)]
234
+ HEALTHBENCH_HARD_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.healthbench_hard_col or c.invariant)]
235
+
 
 
 
 
236
 
237
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
238
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
248
  MEDICAL_SUMMARIZATION_BENCHMARK_COLS = [t.value.col_name for t in MedicalSummarizationColumns]
249
  ACI_BENCHMARK_COLS = [t.value.col_name for t in ACIColumns]
250
  SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
251
+ HEALTHBENCH_BENCHMARK_COLS = [t.value.col_name for t in HealthbenchColumns]
252
+ HEALTHBENCH_HARD_BENCHMARK_COLS = [t.value.col_name for t in HealthbenchHardColumns]
 
253
 
254
  NUMERIC_INTERVALS = {
255
  "?": pd.Interval(-100, 0, closed="right"),
src/leaderboard/read_evals.py CHANGED
@@ -9,11 +9,10 @@ import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
- from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
16
-
17
  @dataclass
18
  class EvalResult:
19
  """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
@@ -30,7 +29,8 @@ class EvalResult:
30
  medical_summarization_results: dict
31
  aci_results: dict
32
  soap_results: dict
33
- closed_ended_arabic_results: dict
 
34
  is_domain_specific: bool
35
  use_chat_template: bool
36
  # clinical_type_results:dict
@@ -167,39 +167,35 @@ class EvalResult:
167
  continue
168
  mean_acc = np.mean(accs) # * 100.0
169
  soap_results[task.benchmark] = mean_acc
170
- closed_ended_arabic_results = {}
171
- if PRIVATE_REPO and "closed-ended-arabic" in data["results"]:
172
- for task in ClosedEndedArabicColumns:
 
173
  task = task.value
174
- # We average all scores of a given metric (not all metrics are present in all files)
175
- try:
176
- accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended-arabic"].items() if task.benchmark == k])
177
- except:
178
- # breakpoint()
179
- accs = np.array([])
180
- if accs.size == 0 or any([acc is None for acc in accs]):
181
- continue
182
- mean_acc = np.mean(accs) # * 100.0
183
- closed_ended_arabic_results[task.benchmark] = mean_acc
184
- # if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
185
- # open_ended_results = {}
186
- # med_safety_results = {}
187
- # medical_summarization_results = {}
188
- # aci_results = {}
189
- # soap_results = {}
190
- # types_results = {}
191
- # for clinical_type in ClinicalTypes:
192
- # clinical_type = clinical_type.value
193
-
194
- # # We average all scores of a given metric (not all metrics are present in all files)
195
- # accs = np.array([v.get(clinical_type.metric, None) for k, v in data[evaluation_metric]["clinical_type_results"].items() if clinical_type.benchmark == k])
196
- # if accs.size == 0 or any([acc is None for acc in accs]):
197
- # continue
198
 
199
- # mean_acc = np.mean(accs) # * 100.0
200
- # types_results[clinical_type.benchmark] = mean_acc
201
- # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
202
- # breakpoint()
203
  return self(
204
  eval_name=result_key,
205
  full_model=full_model,
@@ -212,7 +208,8 @@ class EvalResult:
212
  medical_summarization_results=medical_summarization_results,
213
  aci_results=aci_results,
214
  soap_results=soap_results,
215
- closed_ended_arabic_results=closed_ended_arabic_results,
 
216
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
217
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
218
  precision=precision,
@@ -315,12 +312,15 @@ class EvalResult:
315
  for task in SOAPColumns:
316
  data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
317
  return data_dict
318
- if PRIVATE_REPO and subset == "closed_ended_arabic":
319
- average = sum([v for v in self.closed_ended_arabic_results.values() if v is not None]) / len(ClosedEndedArabicColumns)
320
- data_dict[AutoEvalColumn.average.name] = average
321
- if len(self.closed_ended_arabic_results) > 0:
322
- for task in ClosedEndedArabicColumns:
323
- data_dict[task.value.col_name] = self.closed_ended_arabic_results[task.value.benchmark]
 
 
 
324
  return data_dict
325
 
326
  def get_request_file_for_model(requests_path, model_name, precision):
 
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
+ from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
 
16
  @dataclass
17
  class EvalResult:
18
  """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
 
29
  medical_summarization_results: dict
30
  aci_results: dict
31
  soap_results: dict
32
+ healthbench_results: dict
33
+ healthbench_hard_results: dict
34
  is_domain_specific: bool
35
  use_chat_template: bool
36
  # clinical_type_results:dict
 
167
  continue
168
  mean_acc = np.mean(accs) # * 100.0
169
  soap_results[task.benchmark] = mean_acc
170
+
171
+ healthbench_results = {}
172
+ if "healthbench" in data["results"]:
173
+ for task in HealthbenchColumns:
174
  task = task.value
175
+ if task.benchmark == "Overall Score":
176
+ accs = data["results"]["healthbench"][task.benchmark]
177
+ healthbench_results[task.benchmark] = accs
178
+ elif task.benchmark.startswith("Axis"):
179
+ accs = data["results"]["healthbench"]["Axis Scores"][task.benchmark.replace("Axis: ", "")]
180
+ healthbench_results[task.benchmark] = accs
181
+ else:
182
+ accs = data["results"]["healthbench"]["Theme Scores"][task.benchmark]
183
+ healthbench_results[task.benchmark] = accs
184
+
185
+ healthbench_hard_results = {}
186
+ if "healthbench-hard" in data["results"]:
187
+ for task in HealthbenchHardColumns:
188
+ task = task.value
189
+ if task.benchmark == "Overall Score":
190
+ accs = data["results"]["healthbench-hard"][task.benchmark]
191
+ healthbench_hard_results[task.benchmark] = accs
192
+ elif task.benchmark.startswith("Axis"):
193
+ accs = data["results"]["healthbench-hard"]["Axis Scores"][task.benchmark.replace("Axis: ", "")]
194
+ healthbench_hard_results[task.benchmark] = accs
195
+ else:
196
+ accs = data["results"]["healthbench-hard"]["Theme Scores"][task.benchmark]
197
+ healthbench_hard_results[task.benchmark] = accs
 
198
 
 
 
 
 
199
  return self(
200
  eval_name=result_key,
201
  full_model=full_model,
 
208
  medical_summarization_results=medical_summarization_results,
209
  aci_results=aci_results,
210
  soap_results=soap_results,
211
+ healthbench_results=healthbench_results,
212
+ healthbench_hard_results=healthbench_hard_results,
213
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
214
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
215
  precision=precision,
 
312
  for task in SOAPColumns:
313
  data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
314
  return data_dict
315
+ if subset == "healthbench":
316
+ if len(self.healthbench_results) > 0:
317
+ for task in HealthbenchColumns:
318
+ data_dict[task.value.col_name] = self.healthbench_results[task.value.benchmark]
319
+ return data_dict
320
+ if subset == "healthbench_hard":
321
+ if len(self.healthbench_hard_results) > 0:
322
+ for task in HealthbenchHardColumns:
323
+ data_dict[task.value.col_name] = self.healthbench_hard_results[task.value.benchmark]
324
  return data_dict
325
 
326
  def get_request_file_for_model(requests_path, model_name, precision):
src/populate.py CHANGED
@@ -5,11 +5,10 @@ import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, ClosedEndedArabicColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
12
-
13
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
14
  """Creates a dataframe from all the individual experiment results"""
15
  raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
@@ -33,6 +32,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
33
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
34
  elif subset == "closed_ended_arabic":
35
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
 
 
 
36
  cols = list(set(df.columns).intersection(set(cols)))
37
  df = df[cols].round(decimals=2)
38
  # filter out if any of the benchmarks have not been produced
 
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
 
32
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
33
  elif subset == "closed_ended_arabic":
34
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
35
+ elif subset == "healthbench":
36
+ df = df.sort_values(by=["Overall Score"], ascending=False)
37
+ elif subset == "healthbench_hard":
38
+ df = df.sort_values(by=["Overall Score"], ascending=False)
39
  cols = list(set(df.columns).intersection(set(cols)))
40
  df = df[cols].round(decimals=2)
41
  # filter out if any of the benchmarks have not been produced