Paul Gavrikov committed
Commit 072b3c3 · 1 Parent(s): 7a5e704

updating judge logic

Files changed (1):
  1. judge.py (+14 -12)
judge.py CHANGED
@@ -314,6 +314,8 @@ def judge(results_data, question_index):
 
     accuracy_meters = defaultdict(AverageMeter)
 
+    accuracy_meters["total"] = AverageMeter()
+
     # process counting data and compute accuracy and MAE
     correct = 0
     total = 0
@@ -344,7 +346,9 @@ def judge(results_data, question_index):
         total += 1
 
         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
-
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)
+
     # process OCR data and compute accuracy and ESD
 
     correct = 0
@@ -376,6 +380,8 @@ def judge(results_data, question_index):
         total += 1
 
         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)
 
     # process multiple choice data without binary options and compute accuracy
 
@@ -398,6 +404,8 @@ def judge(results_data, question_index):
         total += 1
 
         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)
 
     # process binary choice data and compute accuracy
     correct = 0
@@ -446,27 +454,21 @@ def judge(results_data, question_index):
         if is_correct and is_opposite_correct:
             correct += 1
             accuracy_meters[benchmark_truth["source_file"]].update(1)
+            accuracy_meters["total"].update(1)
+            accuracy_meters[benchmark_truth["difficulty"]].update(1)
         else:
             opposite_error_pairs.append((answer["question_id"], benchmark_truth["opposite_of"]))
             accuracy_meters[benchmark_truth["source_file"]].update(0)
+            accuracy_meters["total"].update(0)
+            accuracy_meters[benchmark_truth["difficulty"]].update(0)
         total += 1
-
-
-    df_preds = pd.DataFrame(results_data).set_index("question_id")
-    df_gt = pd.DataFrame.from_dict(question_index).T.set_index("question_id")
-    df = df_preds.join(df_gt)
 
     scores = {
         "is_complete": len(non_answered_questions) == 0,
         "is_excessive": len(excessive_answers) > 0,
         **dict([
             ("accuracy/" + k.replace(".csv", ""), (v.sum/v.count)) for k, v in accuracy_meters.items()
-        ]),
-        "accuracy/easy": df.query("difficulty == 'easy'")["judge/correct"].mean(),
-        "accuracy/medium": df.query("difficulty == 'medium'")["judge/correct"].mean(),
-        "accuracy/hard": df.query("difficulty == 'hard'")["judge/correct"].mean(),
-        "accuracy/total": df["judge/correct"].mean(),
-
+        ])
     }
 
     return results_data, scores
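
The patch relies on an AverageMeter helper that is not part of this diff. The sketch below is a hypothetical reconstruction, not the actual class in the repository: it exposes only the members the scoring code uses (sum, count, update), and the source-file and difficulty key names are made up for illustration.

from collections import defaultdict

# Hypothetical sketch of the AverageMeter helper assumed by judge.py.
class AverageMeter:
    def __init__(self):
        self.sum = 0.0   # running sum of the values passed to update()
        self.count = 0   # number of updates recorded so far

    def update(self, value, n=1):
        # Record one observation, e.g. 1 for a correct answer, 0 for a wrong one.
        self.sum += value * n
        self.count += n


# Illustrative bookkeeping mirroring the patched loops (key names are examples):
accuracy_meters = defaultdict(AverageMeter)
accuracy_meters["total"] = AverageMeter()

accuracy_meters["counting.csv"].update(1)   # benchmark_truth["source_file"]
accuracy_meters["total"].update(1)
accuracy_meters["easy"].update(1)           # benchmark_truth["difficulty"]

scores = {("accuracy/" + k.replace(".csv", "")): v.sum / v.count
          for k, v in accuracy_meters.items()}
# -> {"accuracy/counting": 1.0, "accuracy/total": 1.0, "accuracy/easy": 1.0}

With the total and per-difficulty meters updated inline, the pandas join and the explicit easy/medium/hard queries removed by this commit are no longer needed: the existing dict expansion over accuracy_meters now yields accuracy/easy, accuracy/medium, accuracy/hard, and accuracy/total directly.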