I want a website displaying cards with info for different evaluations. When I click a card, it flips and I get more info.
Browse files
It's LLM evaluation, think gsm8k, gpqa, etc. The cards contain info about the benchmarks themselves, for example: name, type of eval, dataset used, small description, and an example.
- README.md +7 -4
- index.html +145 -19
README.md
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: π
|
| 4 |
colorFrom: red
|
| 5 |
-
colorTo:
|
|
|
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: EvalFlip - AI Benchmark Universe π
|
|
|
|
| 3 |
colorFrom: red
|
| 4 |
+
colorTo: purple
|
| 5 |
+
emoji: 🐳
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
| 8 |
+
tags:
|
| 9 |
+
- deepsite-v3
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Welcome to your new DeepSite project!
|
| 13 |
+
This project was created with [DeepSite](https://huggingface.co/deepsite).
|
index.html
CHANGED
|
@@ -1,19 +1,145 @@
|
|
| 1 |
-
<!
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>EvalFlip - AI Benchmark Universe</title>
|
| 7 |
+
<link rel="stylesheet" href="style.css">
|
| 8 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 9 |
+
<script src="https://cdn.jsdelivr.net/npm/feather-icons/dist/feather.min.js"></script>
|
| 10 |
+
<script src="https://unpkg.com/feather-icons"></script>
|
| 11 |
+
<script src="components/navbar.js"></script>
|
| 12 |
+
<script src="components/footer.js"></script>
|
| 13 |
+
</head>
|
| 14 |
+
<body class="bg-gray-50 min-h-screen flex flex-col">
|
| 15 |
+
<custom-navbar></custom-navbar>
|
| 16 |
+
|
| 17 |
+
<main class="flex-grow container mx-auto px-4 py-8">
|
| 18 |
+
<h1 class="text-4xl font-bold text-center mb-12 text-gray-800">AI Benchmark Universe</h1>
|
| 19 |
+
|
| 20 |
+
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-8">
|
| 21 |
+
<!-- GSM8k Benchmark Card -->
|
| 22 |
+
<div class="eval-card" data-eval="gsm8k">
|
| 23 |
+
<div class="card-inner">
|
| 24 |
+
<div class="card-front bg-indigo-600 text-white p-6 rounded-xl shadow-xl flex flex-col justify-between">
|
| 25 |
+
<div>
|
| 26 |
+
<div class="flex items-center gap-2 mb-4">
|
| 27 |
+
<i data-feather="cpu" class="w-6 h-6"></i>
|
| 28 |
+
<h2 class="text-2xl font-bold">GSM8k</h2>
|
| 29 |
+
</div>
|
| 30 |
+
<p class="text-indigo-100 mb-4">Grade School Math with 8.5K problems</p>
|
| 31 |
+
<div class="flex flex-wrap gap-2">
|
| 32 |
+
<span class="badge">Mathematics</span>
|
| 33 |
+
<span class="badge">Reasoning</span>
|
| 34 |
+
</div>
|
| 35 |
+
</div>
|
| 36 |
+
<div class="mt-4 text-sm text-indigo-200 flex items-center gap-1">
|
| 37 |
+
<i data-feather="database"></i>
|
| 38 |
+
<span>8,500 examples</span>
|
| 39 |
+
</div>
|
| 40 |
+
</div>
|
| 41 |
+
<div class="card-back bg-white p-6 rounded-xl shadow-xl flex flex-col justify-between">
|
| 42 |
+
<div>
|
| 43 |
+
<h2 class="text-2xl font-bold text-indigo-600 mb-4">GSM8k Details</h2>
|
| 44 |
+
<p class="text-gray-700 mb-4">A benchmark for mathematical reasoning, testing grade school math problems requiring multi-step reasoning.</p>
|
| 45 |
+
<div class="space-y-3">
|
| 46 |
+
<div>
|
| 47 |
+
<h3 class="font-semibold text-gray-800">Evaluation Type:</h3>
|
| 48 |
+
<p class="text-gray-600">Mathematics, Reasoning</p>
|
| 49 |
+
</div>
|
| 50 |
+
<div>
|
| 51 |
+
<h3 class="font-semibold text-gray-800">Example:</h3>
|
| 52 |
+
<p class="text-gray-600">"John has 5 apples. He gives 2 to Mary and buys 3 more. How many does he have now?"</p>
|
| 53 |
+
</div>
|
| 54 |
+
</div>
|
| 55 |
+
</div>
|
| 56 |
+
<a href="#" class="text-indigo-600 hover:text-indigo-800 text-sm flex items-center gap-1">
|
| 57 |
+
<i data-feather="external-link"></i>
|
| 58 |
+
<span>View Full Details</span>
|
| 59 |
+
</a>
|
| 60 |
+
</div>
|
| 61 |
+
</div>
|
| 62 |
+
</div>
|
| 63 |
+
|
| 64 |
+
<!-- GPQA Benchmark Card -->
|
| 65 |
+
<div class="eval-card" data-eval="gpqa">
|
| 66 |
+
<div class="card-inner">
|
| 67 |
+
<div class="card-front bg-emerald-600 text-white p-6 rounded-xl shadow-xl flex flex-col justify-between">
|
| 68 |
+
<div>
|
| 69 |
+
<div class="flex items-center gap-2 mb-4">
|
| 70 |
+
<i data-feather="book" class="w-6 h-6"></i>
|
| 71 |
+
<h2 class="text-2xl font-bold">GPQA</h2>
|
| 72 |
+
</div>
|
| 73 |
+
<p class="text-emerald-100 mb-4">General Purpose Question Answering</p>
|
| 74 |
+
<div class="flex flex-wrap gap-2">
|
| 75 |
+
<span class="badge">QA</span>
|
| 76 |
+
<span class="badge">Knowledge</span>
|
| 77 |
+
</div>
|
| 78 |
+
</div>
|
| 79 |
+
<div class="mt-4 text-sm text-emerald-200 flex items-center gap-1">
|
| 80 |
+
<i data-feather="database"></i>
|
| 81 |
+
<span>Multi-domain</span>
|
| 82 |
+
</div>
|
| 83 |
+
</div>
|
| 84 |
+
<div class="card-back bg-white p-6 rounded-xl shadow-xl flex flex-col justify-between">
|
| 85 |
+
<div>
|
| 86 |
+
<h2 class="text-2xl font-bold text-emerald-600 mb-4">GPQA Details</h2>
|
| 87 |
+
<p class="text-gray-700 mb-4">A comprehensive benchmark testing general knowledge across multiple domains.</p>
|
| 88 |
+
<div class="space-y-3">
|
| 89 |
+
<div>
|
| 90 |
+
<h3 class="font-semibold text-gray-800">Evaluation Type:</h3>
|
| 91 |
+
<p class="text-gray-600">Question Answering</p>
|
| 92 |
+
</div>
|
| 93 |
+
<div>
|
| 94 |
+
<h3 class="font-semibold text-gray-800">Example:</h3>
|
| 95 |
+
<p class="text-gray-600">"What is the capital of France?"</p>
|
| 96 |
+
</div>
|
| 97 |
+
</div>
|
| 98 |
+
</div>
|
| 99 |
+
<a href="#" class="text-emerald-600 hover:text-emerald-800 text-sm flex items-center gap-1">
|
| 100 |
+
<i data-feather="external-link"></i>
|
| 101 |
+
<span>View Full Details</span>
|
| 102 |
+
</a>
|
| 103 |
+
</div>
|
| 104 |
+
</div>
|
| 105 |
+
</div>
|
| 106 |
+
|
| 107 |
+
<!-- MMLU Benchmark Card -->
|
| 108 |
+
<div class="eval-card" data-eval="mmlu">
|
| 109 |
+
<div class="card-inner">
|
| 110 |
+
<div class="card-front bg-amber-600 text-white p-6 rounded-xl shadow-xl flex flex-col justify-between">
|
| 111 |
+
<div>
|
| 112 |
+
<div class="flex items-center gap-2 mb-4">
|
| 113 |
+
<i data-feather="layers" class="w-6 h-6"></i>
|
| 114 |
+
<h2 class="text-2xl font-bold">MMLU</h2>
|
| 115 |
+
</div>
|
| 116 |
+
<p class="text-amber-100 mb-4">Massive Multitask Language Understanding</p>
|
| 117 |
+
<div class="flex flex-wrap gap-2">
|
| 118 |
+
<span class="badge">ML</span>
|
| 119 |
+
<span class="badge">Multitask</span>
|
| 120 |
+
</div>
|
| 121 |
+
</div>
|
| 122 |
+
<div class="mt-4 text-sm text-amber-200 flex items-center gap-1">
|
| 123 |
+
<i data-feather="database"></i>
|
| 124 |
+
<span>57 subjects</span>
|
| 125 |
+
</div>
|
| 126 |
+
</div>
|
| 127 |
+
<div class="card-back bg-white p-6 rounded-xl shadow-xl flex flex-col justify-between">
|
| 128 |
+
<div>
|
| 129 |
+
<h2 class="text-2xl font-bold text-amber-600 mb-4">MMLU Details</h2>
|
| 130 |
+
<p class="text-gray-700 mb-4">Tests understanding across 57 subjects including STEM, humanities and more.</p>
|
| 131 |
+
<div class="space-y-3">
|
| 132 |
+
<div>
|
| 133 |
+
<h3 class="font-semibold text-gray-800">Evaluation Type:</h3>
|
| 134 |
+
<p class="text-gray-600">Multitask Understanding</p>
|
| 135 |
+
</div>
|
| 136 |
+
<div>
|
| 137 |
+
<h3 class="font-semibold text-gray-800">Example:</h3>
|
| 138 |
+
<p class="text-gray-600">"Explain the concept of entropy in thermodynamics."</p>
|
| 139 |
+
</div>
|
| 140 |
+
</div>
|
| 141 |
+
</div>
|
| 142 |
+
<a href="#" class="text
|
| 143 |
+
<script src="https://huggingface.co/deepsite/deepsite-badge.js"></script>
|
| 144 |
+
</body>
|
| 145 |
+
</html>
|