Paul Gavrikov committed
Commit · 072b3c3
Parent(s): 7a5e704

updating judge logic
judge.py CHANGED
@@ -314,6 +314,8 @@ def judge(results_data, question_index):

     accuracy_meters = defaultdict(AverageMeter)

+    accuracy_meters["total"] = AverageMeter()
+
     # process counting data and compute accuracy and MAE
     correct = 0
     total = 0
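AverageMeter is used throughout this diff but its definition is not part of the change. The sketch below is only an assumption of what such a running-average helper looks like, inferred from the update(value) calls and the v.sum / v.count access in the scores block at the end of the diff:

class AverageMeter:
    """Running-average tracker (hypothetical sketch, not the class shipped in this repo)."""

    def __init__(self):
        self.sum = 0.0    # running sum of all observed values
        self.count = 0    # number of observations

    def update(self, value):
        # accumulate one observation, e.g. 1/0 for correct/incorrect or a partial score
        self.sum += value
        self.count += 1

With this shape, v.sum / v.count in the scores dictionary is simply the mean of all values passed to update().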
@@ -344,7 +346,9 @@ def judge(results_data, question_index):
         total += 1

         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
-
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)
+
     # process OCR data and compute accuracy and ESD

     correct = 0
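Because accuracy_meters is a defaultdict(AverageMeter), the per-source-file and per-difficulty keys are created lazily on first update; only the "total" meter is registered explicitly above. A small illustration of the new update pattern, where the field names come from the diff but the record and score values are made up:

from collections import defaultdict

accuracy_meters = defaultdict(AverageMeter)   # assumes the AverageMeter sketch above
accuracy_meters["total"] = AverageMeter()

benchmark_truth = {"source_file": "counting.csv", "difficulty": "easy"}   # hypothetical record
correct_count = 1                                                         # hypothetical per-question score

accuracy_meters[benchmark_truth["source_file"]].update(correct_count)   # per-benchmark accuracy
accuracy_meters["total"].update(correct_count)                          # overall accuracy
accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)    # per-difficulty accuracy

print(sorted(accuracy_meters))   # ['counting.csv', 'easy', 'total']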
@@ -376,6 +380,8 @@ def judge(results_data, question_index):
         total += 1

         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)

     # process multiple choice data without binary options and compute accuracy

@@ -398,6 +404,8 @@ def judge(results_data, question_index):
         total += 1

         accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+        accuracy_meters["total"].update(correct_count)
+        accuracy_meters[benchmark_truth["difficulty"]].update(correct_count)

     # process binary choice data and compute accuracy
     correct = 0
@@ -446,27 +454,21 @@ def judge(results_data, question_index):
         if is_correct and is_opposite_correct:
             correct += 1
             accuracy_meters[benchmark_truth["source_file"]].update(1)
+            accuracy_meters["total"].update(1)
+            accuracy_meters[benchmark_truth["difficulty"]].update(1)
         else:
             opposite_error_pairs.append((answer["question_id"], benchmark_truth["opposite_of"]))
             accuracy_meters[benchmark_truth["source_file"]].update(0)
+            accuracy_meters["total"].update(0)
+            accuracy_meters[benchmark_truth["difficulty"]].update(0)
         total += 1
-
-
-    df_preds = pd.DataFrame(results_data).set_index("question_id")
-    df_gt = pd.DataFrame.from_dict(question_index).T.set_index("question_id")
-    df = df_preds.join(df_gt)

     scores = {
         "is_complete": len(non_answered_questions) == 0,
         "is_excessive": len(excessive_answers) > 0,
         **dict([
             ("accuracy/" + k.replace(".csv", ""), (v.sum/v.count)) for k, v in accuracy_meters.items()
-        ]),
-        "accuracy/easy": df.query("difficulty == 'easy'")["judge/correct"].mean(),
-        "accuracy/medium": df.query("difficulty == 'medium'")["judge/correct"].mean(),
-        "accuracy/hard": df.query("difficulty == 'hard'")["judge/correct"].mean(),
-        "accuracy/total": df["judge/correct"].mean(),
-
+        ])
     }

     return results_data, scores
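The removed lines built a pandas join between predictions and ground truth and computed the difficulty-level accuracies from it; after this change the same keys ("accuracy/easy", "accuracy/medium", "accuracy/hard", "accuracy/total") fall out of the dictionary comprehension over accuracy_meters, since the difficulty labels and "total" are now meter keys themselves, and .replace(".csv", "") only affects the source-file keys. A toy continuation of the example above, with stand-in values for the completeness flags:

scores = {
    "is_complete": True,    # stand-in for len(non_answered_questions) == 0
    "is_excessive": False,  # stand-in for len(excessive_answers) > 0
    **dict([
        ("accuracy/" + k.replace(".csv", ""), v.sum / v.count)
        for k, v in accuracy_meters.items()
    ])
}
# -> {'is_complete': True, 'is_excessive': False,
#     'accuracy/total': 1.0, 'accuracy/counting': 1.0, 'accuracy/easy': 1.0}

One caveat, at least with the AverageMeter sketch above: a meter that never receives an update (for example the explicitly registered "total" meter on an empty result set) has count == 0, so v.sum / v.count would raise ZeroDivisionError; the diff does not guard against that.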