Paul Gavrikov committed
Commit · 072b3c3
Parent(s): 7a5e704

updating judge logic
judge.py CHANGED
@@ -314,6 +314,8 @@ def judge(results_data, question_index):

     accuracy_meters = defaultdict(AverageMeter)

+    accuracy_meters["total"] = AverageMeter()
+
     # process counting data and compute accuracy and MAE
     correct = 0
     total = 0
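AverageMeter is used throughout this diff but its definition is not part of the change. The sketch below is only an assumption of what such a running-average helper looks like, inferred from the update(value) calls and the v.sum / v.count access in the scores block at the end of the diff:

class AverageMeter:
    """Running-average tracker (hypothetical sketch, not the class shipped in this repo)."""

    def __init__(self):
        self.sum = 0.0    # running sum of all observed values
        self.count = 0    # number of observations

    def update(self, value):
        # accumulate one observation, e.g. 1/0 for correct/incorrect or a partial score
        self.sum += value
        self.count += 1

With this shape, v.sum / v.count in the scores dictionary is simply the mean of all values passed to update().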
@@ -344,7 +346,9 @@ def judge(results_data, question_index):
         total += 1

         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
-
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)
+
     # process OCR data and compute accuracy and ESD

     correct = 0
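Because accuracy_meters is a defaultdict(AverageMeter), the per-source-file and per-difficulty keys are created lazily on first update; only the "total" meter is registered explicitly above. A small illustration of the new update pattern, where the field names come from the diff but the record and score values are made up:

from collections import defaultdict

accuracy_meters = defaultdict(AverageMeter)   # assumes the AverageMeter sketch above
accuracy_meters["total"] = AverageMeter()

benchmark_truth = {"source_file": "counting.csv", "difficulty": "easy"}   # hypothetical record
correct_count = 1                                                         # hypothetical per-question score

accuracy_meters[benchmark_truth["source_file"]].update(correct_count)   # per-benchmark accuracy
accuracy_meters["total"].update(correct_count)                          # overall accuracy
accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)    # per-difficulty accuracy

print(sorted(accuracy_meters))   # ['counting.csv', 'easy', 'total']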
@@ -376,6 +380,8 @@ def judge(results_data, question_index):
         total += 1

         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)

     # process multiple choice data without binary options and compute accuracy

@@ -398,6 +404,8 @@ def judge(results_data, question_index):
         total += 1

         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)

     # process binary choice data and compute accuracy
     correct = 0
@@ -446,27 +454,21 @@ def judge(results_data, question_index):
         if is_correct and is_opposite_correct:
             correct += 1
             accuracy_meters[benchmark_truth["source_file"]].update(1)
+            accuracy_meters["total"].update(1)
+            accuracy_meters[benchmark_truth["difficulty"]].update(1)
         else:
             opposite_error_pairs.append((answer["question_id"], benchmark_truth["opposite_of"]))
             accuracy_meters[benchmark_truth["source_file"]].update(0)
+            accuracy_meters["total"].update(0)
+            accuracy_meters[benchmark_truth["difficulty"]].update(0)
         total += 1
-
-
-    df_preds = pd.DataFrame(results_data).set_index("question_id")
-    df_gt = pd.DataFrame.from_dict(question_index).T.set_index("question_id")
-    df = df_preds.join(df_gt)

     scores = {
         "is_complete": len(non_answered_questions) == 0,
         "is_excessive": len(excessive_answers) > 0,
         **dict([
             ("accuracy/" + k.replace(".csv", ""), (v.sum/v.count)) for k, v in accuracy_meters.items()
-        ]),
-        "accuracy/easy": df.query("difficulty == 'easy'")["judge/correct"].mean(),
-        "accuracy/medium": df.query("difficulty == 'medium'")["judge/correct"].mean(),
-        "accuracy/hard": df.query("difficulty == 'hard'")["judge/correct"].mean(),
-        "accuracy/total": df["judge/correct"].mean(),
-
+        ])
     }

     return results_data, scores
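The removed lines built a pandas join between predictions and ground truth and computed the difficulty-level accuracies from it; after this change the same keys ("accuracy/easy", "accuracy/medium", "accuracy/hard", "accuracy/total") fall out of the dictionary comprehension over accuracy_meters, since the difficulty labels and "total" are now meter keys themselves, and .replace(".csv", "") only affects the source-file keys. A toy continuation of the example above, with stand-in values for the completeness flags:

scores = {
    "is_complete": True,    # stand-in for len(non_answered_questions) == 0
    "is_excessive": False,  # stand-in for len(excessive_answers) > 0
    **dict([
        ("accuracy/" + k.replace(".csv", ""), v.sum / v.count)
        for k, v in accuracy_meters.items()
    ])
}
# -> {'is_complete': True, 'is_excessive': False,
#     'accuracy/total': 1.0, 'accuracy/counting': 1.0, 'accuracy/easy': 1.0}

One caveat, at least with the AverageMeter sketch above: a meter that never receives an update (for example the explicitly registered "total" meter on an empty result set) has count == 0, so v.sum / v.count would raise ZeroDivisionError; the diff does not guard against that.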