yzy666 committed on
Commit
17ea755
·
1 Parent(s): 29d237e

Update space

Browse files
Files changed (43) hide show
  1. .gitignore +0 -2
  2. app.py +40 -31
  3. eval-queue/.gitattributes +55 -0
  4. eval-queue/svbench/.gitattributes +55 -0
  5. eval-queue/svbench/Flash-VStream.json +14 -0
  6. eval-queue/svbench/GPT-4V.json +14 -0
  7. eval-queue/svbench/GPT-4o.json +14 -0
  8. eval-queue/svbench/Gemini 1.5 Pro.json +14 -0
  9. eval-queue/svbench/InternLM-XC2.5.json +14 -0
  10. eval-queue/svbench/InternVL2.json +14 -0
  11. eval-queue/svbench/LLaVA-NeXT-Video.json +14 -0
  12. eval-queue/svbench/MiniCPM-V 2.6.json +14 -0
  13. eval-queue/svbench/MovieChat.json +14 -0
  14. eval-queue/svbench/Qwen2-VL.json +14 -0
  15. eval-queue/svbench/ShareGPT4Video.json +14 -0
  16. eval-queue/svbench/TimeChat.json +14 -0
  17. eval-queue/svbench/VILA.json +14 -0
  18. eval-queue/svbench/Video-ChatGPT.json +14 -0
  19. eval-queue/svbench/Video-LLaVA.json +14 -0
  20. eval-queue/svbench/VideoLLaMA2.json +14 -0
  21. eval-results/.gitattributes +55 -0
  22. eval-results/svbench/.gitattributes +55 -0
  23. eval-results/svbench/Flash-VStream/results_Flash-VStream.json +45 -0
  24. eval-results/svbench/GPT-4V/results_GPT-4V.json +45 -0
  25. eval-results/svbench/GPT-4o/results_GPT-4o.json +45 -0
  26. eval-results/svbench/Gemini 1.5 Pro/results_Gemini 1.5 Pro.json +45 -0
  27. eval-results/svbench/InternLM-XC2.5/results_InternLM-XC2.5.json +45 -0
  28. eval-results/svbench/InternVL2/results_InternVL2.json +45 -0
  29. eval-results/svbench/LLaVA-NeXT-Video/results_LLaVA-NeXT-Video.json +45 -0
  30. eval-results/svbench/MiniCPM-V 2.6/results_MiniCPM-V 2.6.json +45 -0
  31. eval-results/svbench/MovieChat/results_MovieChat.json +45 -0
  32. eval-results/svbench/Qwen2-VL/results_Qwen2-VL.json +45 -0
  33. eval-results/svbench/ShareGPT4Video/results_ShareGPT4Video.json +45 -0
  34. eval-results/svbench/TimeChat/results_TimeChat.json +45 -0
  35. eval-results/svbench/VILA/results_VILA.json +45 -0
  36. eval-results/svbench/Video-ChatGPT/results_Video-ChatGPT.json +45 -0
  37. eval-results/svbench/Video-LLaVA/results_Video-LLaVA.json +45 -0
  38. eval-results/svbench/VideoLLaMA2/results_VideoLLaMA2.json +45 -0
  39. src/about.py +21 -3
  40. src/display/utils.py +41 -26
  41. src/envs.py +2 -2
  42. src/leaderboard/read_evals.py +26 -23
  43. src/populate.py +2 -2
.gitignore CHANGED
@@ -6,8 +6,6 @@ __pycache__/
6
  *ipynb
7
  .vscode/
8
 
9
- eval-queue/
10
- eval-results/
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
 
6
  *ipynb
7
  .vscode/
8
 
 
 
9
  eval-queue-bk/
10
  eval-results-bk/
11
  logs/
app.py CHANGED
@@ -27,29 +27,33 @@ from src.display.utils import (
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
-
 
31
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
35
  ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
 
52
  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 
53
 
54
  (
55
  finished_eval_queue_df,
@@ -60,6 +64,12 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
60
  def init_leaderboard(dataframe):
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
 
 
 
 
 
63
  return Leaderboard(
64
  value=dataframe,
65
  datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -68,27 +78,26 @@ def init_leaderboard(dataframe):
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
  ],
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
89
  )
90
 
91
-
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
 
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
+ import pdb
31
+ import os
32
 
33
  def restart_space():
34
  API.restart_space(repo_id=REPO_ID)
35
 
36
  ### Space initialisation
37
+ # try:
38
+ # print("EVAL_REQUESTS_PATH:",EVAL_REQUESTS_PATH)
39
+ # snapshot_download(
40
+ # repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
41
+ # )
42
+ # except Exception:
43
+ # restart_space()
44
+ # try:
45
+ # print("EVAL_RESULTS_PATH:",EVAL_RESULTS_PATH)
46
+ # snapshot_download(
47
+ # repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
48
+ # )
49
+ # except Exception:
50
+ # restart_space()
51
+
52
+ # eval_results_path = os.path.join(EVAL_RESULTS_PATH, "svbench")
53
+ # eval_requests_path = os.path.join(EVAL_REQUESTS_PATH, "svbench")
54
  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
55
+ print("EVAL_RESULTS_PATH:",EVAL_RESULTS_PATH)
56
+ # print("LEADERBOARD_DF:",LEADERBOARD_DF)
57
 
58
  (
59
  finished_eval_queue_df,
 
64
  def init_leaderboard(dataframe):
65
  if dataframe is None or dataframe.empty:
66
  raise ValueError("Leaderboard DataFrame is empty or None.")
67
+
68
+ # Check for None in filter_columns
69
+ filter_columns = [
70
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
71
+ ]
72
+
73
  return Leaderboard(
74
  value=dataframe,
75
  datatype=[c.type for c in fields(AutoEvalColumn)],
 
78
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
79
  label="Select Columns to Display:",
80
  ),
81
+ search_columns=[AutoEvalColumn.model.name],
82
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
83
  filter_columns=[
84
+ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
85
+ # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
86
+ # ColumnFilter(
87
+ # AutoEvalColumn.params.name,
88
+ # type="slider",
89
+ # min=0.01,
90
+ # max=150,
91
+ # label="Select the number of parameters (B)",
92
+ # ),
93
+ # ColumnFilter(
94
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
95
+ # ),
96
  ],
97
  bool_checkboxgroup_label="Hide models",
98
  interactive=False,
99
  )
100
 
 
101
  demo = gr.Blocks(css=custom_css)
102
  with demo:
103
  gr.HTML(TITLE)
eval-queue/.gitattributes ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-queue/svbench/.gitattributes ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-queue/svbench/Flash-VStream.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Flash-VStream",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/GPT-4V.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "GPT-4V",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/GPT-4o.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "GPT-4o",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/Gemini 1.5 Pro.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Gemini 1.5 Pro",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/InternLM-XC2.5.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "InternLM-XC2.5",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/InternVL2.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "InternVL2",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "ImageLLM",
10
+ "likes": 0,
11
+ "params": 8,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/LLaVA-NeXT-Video.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "LLaVA-NeXT-Video",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/MiniCPM-V 2.6.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "MiniCPM-V 2.6",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "ImageLLM",
10
+ "likes": 0,
11
+ "params": 8,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/MovieChat.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "MovieChat",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/Qwen2-VL.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen2-VL",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "ImageLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/ShareGPT4Video.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "ShareGPT4Video",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 8,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/TimeChat.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "TimeChat",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/VILA.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "VILA",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "ImageLLM",
10
+ "likes": 0,
11
+ "params": 8,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/Video-ChatGPT.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Video-ChatGPT",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/Video-LLaVA.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Video-LLaVA",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-queue/svbench/VideoLLaMA2.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "VideoLLaMA2",
3
+ "base_model": "",
4
+ "revision": "float16",
5
+ "precision": "",
6
+ "weight_type": "",
7
+ "status": "FINISHED",
8
+ "submitted_time": "",
9
+ "model_type": "VideoLLM",
10
+ "likes": 0,
11
+ "params": 7,
12
+ "license": "",
13
+ "private": false
14
+ }
eval-results/.gitattributes ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-results/svbench/.gitattributes ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-results/svbench/Flash-VStream/results_Flash-VStream.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Flash-VStream",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.3754
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.4474
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.5102
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.4795
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.3794
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.4272
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.3571
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.4424
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.4849
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.3895
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.3900
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.3880
43
+ }
44
+ }
45
+ }
eval-results/svbench/GPT-4V/results_GPT-4V.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "GPT-4V",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.5603
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.6261
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.6909
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.6536
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.5373
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.6030
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.5637
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.6141
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.6580
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.5918
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.5716
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.5793
43
+ }
44
+ }
45
+ }
eval-results/svbench/GPT-4o/results_GPT-4o.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "GPT-4o",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.5826
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.6476
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.7075
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.6768
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.5582
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.6257
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.5799
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.6352
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.6772
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.6018
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.5925
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.5997
43
+ }
44
+ }
45
+ }
eval-results/svbench/Gemini 1.5 Pro/results_Gemini 1.5 Pro.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Gemini 1.5 Pro",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.4907
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.5615
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.6224
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.5836
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.4772
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.5368
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.4935
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.5577
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.6041
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.5289
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.5111
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.5155
43
+ }
44
+ }
45
+ }
eval-results/svbench/InternLM-XC2.5/results_InternLM-XC2.5.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "InternLM-XC2.5",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.4651
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.5316
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.5984
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.5294
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.4587
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.5071
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.5262
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.5855
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.6289
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.5398
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.5439
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.5439
43
+ }
44
+ }
45
+ }
eval-results/svbench/InternVL2/results_InternVL2.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "InternVL2",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.4053
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.4677
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.5238
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.4697
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.4035
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.4448
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.3892
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.4542
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.5045
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.4153
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.4235
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.4162
43
+ }
44
+ }
45
+ }
eval-results/svbench/LLaVA-NeXT-Video/results_LLaVA-NeXT-Video.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "LLaVA-NeXT-Video",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.3771
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.4459
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.5205
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.4180
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.3658
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.4140
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.3429
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.3968
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.4765
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.3533
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.3668
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.3612
43
+ }
44
+ }
45
+ }
eval-results/svbench/MiniCPM-V 2.6/results_MiniCPM-V 2.6.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "MiniCPM-V 2.6",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.5170
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.5950
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.6533
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.6172
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.5009
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.5663
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.4644
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.5273
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.5835
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.5348
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.4832
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.4967
43
+ }
44
+ }
45
+ }
eval-results/svbench/MovieChat/results_MovieChat.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "MovieChat",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.2036
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.2374
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.2897
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.228
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.2051
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.2272
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.1892
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.2238
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.2677
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.2046
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.2098
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.1964
43
+ }
44
+ }
45
+ }
eval-results/svbench/Qwen2-VL/results_Qwen2-VL.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Qwen2-VL",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.5047
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.5771
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.6346
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.6077
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.4944
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.5529
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.4838
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.5517
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.5991
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.5204
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.5142
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.5139
43
+ }
44
+ }
45
+ }
eval-results/svbench/ShareGPT4Video/results_ShareGPT4Video.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "ShareGPT4Video",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.3626
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.4368
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.5012
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.4733
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.3725
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.4176
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.3314
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.4048
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.4601
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.3815
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.3781
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.3710
43
+ }
44
+ }
45
+ }
eval-results/svbench/TimeChat/results_TimeChat.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "TimeChat",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.3109
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.3857
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.4552
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.4337
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.3110
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.3624
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.2714
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.3442
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.3978
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.3680
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.3171
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.3115
43
+ }
44
+ }
45
+ }
eval-results/svbench/VILA/results_VILA.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "VILA",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.4323
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.4930
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.5559
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.5247
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.4127
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.4707
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.3819
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.4427
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.4918
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.4129
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.4055
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.4038
43
+ }
44
+ }
45
+ }
eval-results/svbench/Video-ChatGPT/results_Video-ChatGPT.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Video-ChatGPT",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.2801
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.3404
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.4089
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.3566
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.2959
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.3224
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.2284
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.2844
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.3393
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.2631
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.2643
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.2502
43
+ }
44
+ }
45
+ }
eval-results/svbench/Video-LLaVA/results_Video-LLaVA.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "Video-LLaVA",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.3185
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.3838
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.4493
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.4154
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.3280
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.3649
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.2695
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.3368
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.3900
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.3183
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.3153
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.2989
43
+ }
44
+ }
45
+ }
eval-results/svbench/VideoLLaMA2/results_VideoLLaMA2.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "torch.float16",
4
+ "model_name": "VideoLLaMA2",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "Dialogue_SA": {
9
+ "acc": 0.4250
10
+ },
11
+ "Dialogue_CC": {
12
+ "acc": 0.4988
13
+ },
14
+ "Dialogue_LC": {
15
+ "acc": 0.5596
16
+ },
17
+ "Dialogue_TU": {
18
+ "acc": 0.5223
19
+ },
20
+ "Dialogue_IC": {
21
+ "acc": 0.4140
22
+ },
23
+ "Dialogue_OS": {
24
+ "acc": 0.4710
25
+ },
26
+ "Streaming_SA": {
27
+ "acc": 0.3895
28
+ },
29
+ "Streaming_CC": {
30
+ "acc": 0.4611
31
+ },
32
+ "Streaming_LC": {
33
+ "acc": 0.5177
34
+ },
35
+ "Streaming_TU": {
36
+ "acc": 0.4369
37
+ },
38
+ "Streaming_IC": {
39
+ "acc": 0.4222
40
+ },
41
+ "Streaming_OS": {
42
+ "acc": 0.4277
43
+ }
44
+ }
45
+ }
src/about.py CHANGED
@@ -12,8 +12,18 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 
 
 
 
 
 
 
 
 
17
 
18
  NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
@@ -25,7 +35,9 @@ TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
 
 
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
@@ -69,4 +81,10 @@ If everything is done, check you can launch the EleutherAIHarness on your model
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
 
72
  """
 
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ task0 = Task("Dialogue_SA", "acc", "Dialogue_SA")
16
+ task1 = Task("Dialogue_CC", "acc", "Dialogue_CC")
17
+ task2 = Task("Dialogue_LC", "acc", "Dialogue_LC")
18
+ task3 = Task("Dialogue_TU", "acc", "Dialogue_TU")
19
+ task4 = Task("Dialogue_IC", "acc", "Dialogue_IC")
20
+ task5 = Task("Dialogue_OS", "acc", "Dialogue_OS")
21
+ task6 = Task("Streaming_SA", "acc", "Streaming_SA")
22
+ task7 = Task("Streaming_CC", "acc", "Streaming_CC")
23
+ task8 = Task("Streaming_LC", "acc", "Streaming_LC")
24
+ task9 = Task("Streaming_TU", "acc", "Streaming_TU")
25
+ task10 = Task("Streaming_IC", "acc", "Streaming_IC")
26
+ task11 = Task("Streaming_OS", "acc", "Streaming_OS")
27
 
28
  NUM_FEWSHOT = 0 # Change with your few shot
29
  # ---------------------------------------------------
 
35
 
36
  # What does your leaderboard evaluate?
37
  INTRODUCTION_TEXT = """
38
+ SVBench is a benchmark specifically designed to evaluate the performance of Large Vision-Language Models (LVLMs) in long-context streaming video understanding tasks. This benchmark comprehensively assesses the models' capabilities in handling streaming videos through its unique temporal multi-turn question-answering chains. To facilitate research and development, SVBench provides a detailed leaderboard showcasing the performance results of over a dozen models on this benchmark. By ranking the models based on their performance on SVBench, users can quickly identify models that excel in specific tasks, thereby guiding subsequent research and applications.
39
+ Detailed information about SVBench and the leaderboard can be accessed via the following link: [SVBench Benchmark](https://yzy-bupt.github.io/SVBench). The paper is available at: [SVBench Paper](https://arxiv.org/abs/2502.10810). Leaderboard submissions can be made through the following link: [Leaderboard Submission](https://docs.google.com/forms/d/e/1FAIpQLSfz62pGaIdKjmDbOP0vw74dXSiG-2ILJI7gdugdx4pfWSc42Q/viewform). Additionally, the related dataset is hosted on the Hugging Face platform, and researchers can access it at [SVBench Dataset](https://huggingface.co/datasets/yzy666/SVBench) for further experiments and model development.
40
+ This leaderboard not only provides a fair competitive environment for current models but also serves as an important reference standard for future model improvements and innovations.
41
  """
42
 
43
  # Which evaluations are you running? how can people reproduce what you have?
 
81
 
82
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
83
  CITATION_BUTTON_TEXT = r"""
84
+ @article{yang2025svbench,
85
+ title={SVBench: A Benchmark with Temporal Multi-Turn Dialogues for Streaming Video Understanding},
86
+ author={Yang, Zhenyu and Hu, Yuhang and Du, Zemin and Xue, Dizhan and Qian, Shengsheng and Wu, Jiahong and Yang, Fan and Dong, Weiming and Xu, Changsheng},
87
+ journal={arXiv preprint arXiv:2502.10810},
88
+ year={2025}
89
+ }
90
  """
src/display/utils.py CHANGED
@@ -23,7 +23,7 @@ class ColumnContent:
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
@@ -31,14 +31,14 @@ for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -47,11 +47,11 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
47
  @dataclass(frozen=True)
48
  class EvalQueueColumn: # Queue column
49
  model = ColumnContent("model", "markdown", True)
50
- revision = ColumnContent("revision", "str", True)
51
- private = ColumnContent("private", "bool", True)
52
- precision = ColumnContent("precision", "str", True)
53
- weight_type = ColumnContent("weight_type", "str", "Original")
54
- status = ColumnContent("status", "str", True)
55
 
56
  ## All the model information that we might need
57
  @dataclass
@@ -60,12 +60,9 @@ class ModelDetails:
60
  display_name: str = ""
61
  symbol: str = "" # emoji
62
 
63
-
64
  class ModelType(Enum):
65
- PT = ModelDetails(name="pretrained", symbol="🟢")
66
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
  Unknown = ModelDetails(name="", symbol="?")
70
 
71
  def to_str(self, separator=" "):
@@ -73,16 +70,34 @@ class ModelType(Enum):
73
 
74
  @staticmethod
75
  def from_str(type):
76
- if "fine-tuned" in type or "🔶" in type:
77
- return ModelType.FT
78
- if "pretrained" in type or "🟢" in type:
79
- return ModelType.PT
80
- if "RL-tuned" in type or "🟦" in type:
81
- return ModelType.RL
82
- if "instruction-tuned" in type or "⭕" in type:
83
- return ModelType.IFT
84
  return ModelType.Unknown
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  class WeightType(Enum):
87
  Adapter = ModelDetails("Adapter")
88
  Original = ModelDetails("Original")
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
+ # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
+ # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
+ # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
+ # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
+ # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
+ # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
+ # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
47
  @dataclass(frozen=True)
48
  class EvalQueueColumn: # Queue column
49
  model = ColumnContent("model", "markdown", True)
50
+ # revision = ColumnContent("revision", "str", True)
51
+ # private = ColumnContent("private", "bool", True)
52
+ # precision = ColumnContent("precision", "str", True)
53
+ # weight_type = ColumnContent("weight_type", "str", "Original")
54
+ # status = ColumnContent("status", "str", True)
55
 
56
  ## All the model information that we might need
57
  @dataclass
 
60
  display_name: str = ""
61
  symbol: str = "" # emoji
62
 
 
63
  class ModelType(Enum):
64
+ VideoLLM = ModelDetails(name="VideoLLM", symbol="🎥")
65
+ ImageLLM = ModelDetails(name="ImageLLM", symbol="🖼️")
 
 
66
  Unknown = ModelDetails(name="", symbol="?")
67
 
68
  def to_str(self, separator=" "):
 
70
 
71
  @staticmethod
72
  def from_str(type):
73
+ if "VideoLLM" in type or "🎥" in type:
74
+ return ModelType.VideoLLM
75
+ if "ImageLLM" in type or "🖼️" in type:
76
+ return ModelType.ImageLLM
 
 
 
 
77
  return ModelType.Unknown
78
 
79
+ # class ModelType(Enum):
80
+ # PT = ModelDetails(name="pretrained", symbol="🟢")
81
+ # FT = ModelDetails(name="fine-tuned", symbol="🔶")
82
+ # IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
83
+ # RL = ModelDetails(name="RL-tuned", symbol="🟦")
84
+ # Unknown = ModelDetails(name="", symbol="?")
85
+
86
+ # def to_str(self, separator=" "):
87
+ # return f"{self.value.symbol}{separator}{self.value.name}"
88
+
89
+ # @staticmethod
90
+ # def from_str(type):
91
+ # if "fine-tuned" in type or "🔶" in type:
92
+ # return ModelType.FT
93
+ # if "pretrained" in type or "🟢" in type:
94
+ # return ModelType.PT
95
+ # if "RL-tuned" in type or "🟦" in type:
96
+ # return ModelType.RL
97
+ # if "instruction-tuned" in type or "⭕" in type:
98
+ # return ModelType.IFT
99
+ # return ModelType.Unknown
100
+
101
  class WeightType(Enum):
102
  Adapter = ModelDetails("Adapter")
103
  Original = ModelDetails("Original")
src/envs.py CHANGED
@@ -17,8 +17,8 @@ RESULTS_REPO = f"{OWNER}/results"
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
 
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue/svbench")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results/svbench")
22
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
src/leaderboard/read_evals.py CHANGED
@@ -48,7 +48,7 @@ class EvalResult:
48
  org_and_model = org_and_model.split("/", 1)
49
 
50
  if len(org_and_model) == 1:
51
- org = None
52
  model = org_and_model[0]
53
  result_key = f"{model}_{precision.value.name}"
54
  else:
@@ -93,37 +93,42 @@ class EvalResult:
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
-
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
 
101
  self.model_type = ModelType.from_str(request.get("model_type", ""))
102
  self.weight_type = WeightType[request.get("weight_type", "Original")]
103
  self.license = request.get("license", "?")
104
  self.likes = request.get("likes", 0)
105
- self.num_params = request.get("params", 0)
106
  self.date = request.get("submitted_time", "")
107
- except Exception:
108
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
 
 
 
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
  AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
  AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
  AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
  }
128
 
129
  for task in Tasks:
@@ -134,11 +139,9 @@ class EvalResult:
134
 
135
  def get_request_file_for_model(requests_path, model_name, precision):
136
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
- request_files = os.path.join(
138
- requests_path,
139
- f"{model_name}_eval_request_*.json",
140
- )
141
- request_files = glob.glob(request_files)
142
 
143
  # Select correct request file (precision)
144
  request_file = ""
@@ -146,6 +149,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
146
  for tmp_request_file in request_files:
147
  with open(tmp_request_file, "r") as f:
148
  req_content = json.load(f)
 
149
  if (
150
  req_content["status"] in ["FINISHED"]
151
  and req_content["precision"] == precision.split(".")[-1]
@@ -164,10 +168,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
164
  continue
165
 
166
  # Sort the files by date
167
- try:
168
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
- except dateutil.parser._parser.ParserError:
170
- files = [files[-1]]
171
 
172
  for file in files:
173
  model_result_filepaths.append(os.path.join(root, file))
@@ -177,7 +181,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
177
  # Creation of result
178
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
  eval_result.update_with_request_file(requests_path)
180
-
181
  # Store results of same eval together
182
  eval_name = eval_result.eval_name
183
  if eval_name in eval_results.keys():
 
48
  org_and_model = org_and_model.split("/", 1)
49
 
50
  if len(org_and_model) == 1:
51
+ org = "svbench"
52
  model = org_and_model[0]
53
  result_key = f"{model}_{precision.value.name}"
54
  else:
 
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
96
+ request_file = get_request_file_for_model(requests_path, self.model, self.precision.value.name)
97
+ print("requests_path:",requests_path)
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
101
+ print(f"Request file content: {request}") # 调试输出
102
  self.model_type = ModelType.from_str(request.get("model_type", ""))
103
  self.weight_type = WeightType[request.get("weight_type", "Original")]
104
  self.license = request.get("license", "?")
105
  self.likes = request.get("likes", 0)
106
+ self.num_params = request.get("params", "")
107
  self.date = request.get("submitted_time", "")
108
+ except FileNotFoundError:
109
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
110
+ except json.JSONDecodeError:
111
+ print(f"Error decoding JSON from request file for {self.org}/{self.model} with precision {self.precision.value.name}")
112
+ except Exception as e:
113
+ print(f"An unexpected error occurred: {e}")
114
 
115
  def to_dict(self):
116
  """Converts the Eval Result to a dict compatible with our dataframe display"""
117
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
118
  data_dict = {
119
  "eval_name": self.eval_name, # not a column, just a save name,
120
+ # AutoEvalColumn.precision.name: self.precision.value.name,
121
  AutoEvalColumn.model_type.name: self.model_type.value.name,
122
+ # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
123
+ # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
124
+ # AutoEvalColumn.architecture.name: self.architecture,
125
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
126
+ # AutoEvalColumn.revision.name: self.revision,
127
  AutoEvalColumn.average.name: average,
128
+ # AutoEvalColumn.license.name: self.license,
129
+ # AutoEvalColumn.likes.name: self.likes,
130
  AutoEvalColumn.params.name: self.num_params,
131
+ # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
132
  }
133
 
134
  for task in Tasks:
 
139
 
140
  def get_request_file_for_model(requests_path, model_name, precision):
141
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
142
+ request_files_pattern = os.path.join(requests_path, f"{model_name}.json")
143
+ print(f"Looking for request files with pattern: {request_files_pattern}") # 调试输出
144
+ request_files = glob.glob(request_files_pattern)
 
 
145
 
146
  # Select correct request file (precision)
147
  request_file = ""
 
149
  for tmp_request_file in request_files:
150
  with open(tmp_request_file, "r") as f:
151
  req_content = json.load(f)
152
+ print(f"Checking request file: {tmp_request_file}, Content: {req_content}") # 调试输出
153
  if (
154
  req_content["status"] in ["FINISHED"]
155
  and req_content["precision"] == precision.split(".")[-1]
 
168
  continue
169
 
170
  # Sort the files by date
171
+ # try:
172
+ # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
173
+ # except dateutil.parser._parser.ParserError:
174
+ # files = [files[-1]]
175
 
176
  for file in files:
177
  model_result_filepaths.append(os.path.join(root, file))
 
181
  # Creation of result
182
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
183
  eval_result.update_with_request_file(requests_path)
 
184
  # Store results of same eval together
185
  eval_name = eval_result.eval_name
186
  if eval_name in eval_results.keys():
src/populate.py CHANGED
@@ -1,6 +1,6 @@
1
  import json
2
  import os
3
-
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -34,7 +34,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
34
  data = json.load(fp)
35
 
36
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
 
39
  all_evals.append(data)
40
  elif ".md" not in entry:
 
1
  import json
2
  import os
3
+ import pprint
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
 
34
  data = json.load(fp)
35
 
36
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
 
39
  all_evals.append(data)
40
  elif ".md" not in entry: