diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index e2fdd3c78bfef7d609c3b755310e6439737447f2..0000000000000000000000000000000000000000 --- a/.gitattributes +++ /dev/null @@ -1,35 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index db709e7c46f64068870faaf7cad388a6982ee2be..4a852626186fa83059af39e006ee1c8567a94ed6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,6 @@ -auto_evals/ venv/ __pycache__/ .env .ipynb_checkpoints *ipynb -.vscode/ - -eval-queue-bk/ -eval-results-bk/ -logs/ +.vscode/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 0710dad252bda2ac9fd5b7e4e2e4dc0afeff43cf..0000000000000000000000000000000000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -default_language_version: - python: python3 - -ci: - autofix_prs: true - autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' - autoupdate_schedule: quarterly - -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 - hooks: - - id: check-yaml - - id: check-case-conflict - - id: detect-private-key - - id: check-added-large-files - args: ['--maxkb=1000'] - - id: requirements-txt-fixer - - id: end-of-file-fixer - - id: trailing-whitespace - - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - name: Format imports - - - repo: https://github.com/psf/black - rev: 22.12.0 - hooks: - - id: black - name: Format code - additional_dependencies: ['click==8.0.2'] - - - repo: https://github.com/charliermarsh/ruff-pre-commit - # Ruff version. - rev: 'v0.0.267' - hooks: - - id: ruff diff --git a/Makefile b/Makefile deleted file mode 100644 index b5685772804c8af4235a8504dc6752bfc9ae5d1d..0000000000000000000000000000000000000000 --- a/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -.PHONY: style quality - - -style: - python -m black --line-length 119 . - python -m isort . - ruff check --fix . - - -quality: - python -m black --check --line-length 119 . - python -m isort --check-only . - ruff check . diff --git a/README.md b/README.md deleted file mode 100644 index 5020670dbf6c009a78c7524147ed400f59bc8388..0000000000000000000000000000000000000000 --- a/README.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: SVBench -emoji: 🥇 -colorFrom: green -colorTo: indigo -sdk: gradio -app_file: app.py -pinned: true -license: apache-2.0 -short_description: Leaderboard for SVBench -sdk_version: 5.19.0 ---- - -# Start the configuration - -Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks). - -Results files should have the following format and be stored as JSON files: -```json -{ - "config": { - "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit - "model_name": "path of the model on the hub: org/model", - "model_sha": "revision on the hub", - }, - "results": { - "task_name": { - "metric_name": score, - }, - "task_name2": { - "metric_name": score, - } - } -} -``` - -Request files are created automatically by this tool. - -If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results, and eval-results-bk folders.
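For illustration, here is a minimal sketch of how one results file in the format above could be flattened into a single leaderboard row. The helper name and the `task.metric` column naming are assumptions for this example, not the repository's actual reader; the real parsing logic lives in `src/leaderboard/read_evals.py` and `src/populate.py`:

```python
import json
from pathlib import Path


def flatten_result_file(path: str) -> dict:
    """Illustrative sketch: turn one results JSON file into a flat leaderboard row."""
    data = json.loads(Path(path).read_text())
    row = {
        "model": data["config"]["model_name"],
        "revision": data["config"].get("model_sha", ""),
        "dtype": data["config"].get("model_dtype", ""),
    }
    # One column per (task, metric) pair, e.g. "Dialogue_SA.acc" -> 0.3754
    for task_name, metrics in data["results"].items():
        for metric_name, score in metrics.items():
            row[f"{task_name}.{metric_name}"] = float(score)
    return row


# Example (a results path that exists in this repo's eval-results tree):
# flatten_result_file("eval-results/svbench/GPT-4o/results_GPT-4o.json")
```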
- -# Code logic for more complex edits - -You'll find -- the main table's column names and properties in `src/display/utils.py` -- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py` -- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py` \ No newline at end of file diff --git a/app.py b/app.py index 624a69ec2ee0453a3f3ce14c496bb1960abcd4bb..fc80f1f03b0923f461e46a6d668bbdd7854adcbe 100644 --- a/app.py +++ b/app.py @@ -1,213 +1,119 @@ +__all__ = ['block'] +import os + import gradio as gr -from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns import pandas as pd -from apscheduler.schedulers.background import BackgroundScheduler -from huggingface_hub import snapshot_download - -from src.about import ( - CITATION_BUTTON_LABEL, - CITATION_BUTTON_TEXT, - EVALUATION_QUEUE_TEXT, - INTRODUCTION_TEXT, - LLM_BENCHMARKS_TEXT, - TITLE, -) -from src.display.css_html_js import custom_css -from src.display.utils import ( - BENCHMARK_COLS, - COLS, - EVAL_COLS, - EVAL_TYPES, - AutoEvalColumn, - ModelType, - fields, - WeightType, - Precision -) -from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN -from src.populate import get_evaluation_queue_df, get_leaderboard_df -from src.submission.submit import add_new_eval -import pdb -import os -def restart_space(): - API.restart_space(repo_id=REPO_ID) - -### Space initialisation -# try: -# print("EVAL_REQUESTS_PATH:",EVAL_REQUESTS_PATH) -# snapshot_download( -# repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN -# ) -# except Exception: -# restart_space() -# try: -# print("EVAL_RESULTS_PATH:",EVAL_RESULTS_PATH) -# snapshot_download( -# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN -# ) -# except Exception: -# restart_space() - -# eval_results_path = os.path.join(EVAL_RESULTS_PATH, "svbench") -# eval_requests_path = os.path.join(EVAL_REQUESTS_PATH, "svbench") -LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) -print("EVAL_RESULTS_PATH:",EVAL_RESULTS_PATH) -# print("LEADERBOARD_DF:",LEADERBOARD_DF) - -( - finished_eval_queue_df, - running_eval_queue_df, - pending_eval_queue_df, -) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) - -def init_leaderboard(dataframe): - if dataframe is None or dataframe.empty: - raise ValueError("Leaderboard DataFrame is empty or None.") - - # Check for None in filter_columns - filter_columns = [ - ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"), - ] - - return Leaderboard( - value=dataframe, - datatype=[c.type for c in fields(AutoEvalColumn)], - select_columns=SelectColumns( - default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], - cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], - label="Select Columns to Display:", - ), - search_columns=[AutoEvalColumn.model.name], - hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden], - filter_columns=[ - # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"), - # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"), - # ColumnFilter( - #
AutoEvalColumn.params.name, - # type="slider", - # min=0.01, - # max=150, - # label="Select the number of parameters (B)", - # ), - # ColumnFilter( - # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True - # ), - ], - bool_checkboxgroup_label="Hide models", +from constants import * + +global data_component, filter_component + +def get_baseline_df(): + df = pd.read_csv(CSV_DIR) + df['Average'] = ((df['Streaming_OS'] + df['Dialogue_OS']) / 2).round(2) + df = df.sort_values(by="Average", ascending=False) + present_columns = ['Model'] + checkbox_group.value + df = df[present_columns] + return df + +def get_all_df(): + df = pd.read_csv(CSV_DIR) + df['Average'] = ((df['Streaming_OS'] + df['Dialogue_OS']) / 2).round(2) + df = df.sort_values(by="Average", ascending=False) + return df + +def on_filter_model_size_method_change(selected_columns): + updated_data = get_all_df() + + # columns: + selected_columns = [item for item in TASK_INFO if item in selected_columns] + present_columns = ['Model'] + selected_columns + updated_data = updated_data[present_columns] + updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False) + updated_headers = present_columns + update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers] + filter_component = gr.components.Dataframe( + value=updated_data, + headers=updated_headers, + type="pandas", + datatype=update_datatype, interactive=False, + visible=True, ) -demo = gr.Blocks(css=custom_css) -with demo: - gr.HTML(TITLE) - gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") + return filter_component + +def search_model(query): + df = get_all_df() + filtered_df = df[df['Model'].str.contains(query, case=False)] + return filtered_df +block = gr.Blocks() + +with block: + gr.Markdown( + LEADERBORAD_INTRODUCTION + ) with gr.Tabs(elem_classes="tab-buttons") as tabs: - with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0): - leaderboard = init_leaderboard(LEADERBOARD_DF) - - with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2): - gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") - - with gr.TabItem("🚀 Submit here! 
", elem_id="llm-benchmark-tab-table", id=3): - with gr.Column(): - with gr.Row(): - gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") - - with gr.Column(): - with gr.Accordion( - f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", - open=False, - ): - with gr.Row(): - finished_eval_table = gr.components.Dataframe( - value=finished_eval_queue_df, - headers=EVAL_COLS, - datatype=EVAL_TYPES, - row_count=5, - ) - with gr.Accordion( - f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", - open=False, - ): - with gr.Row(): - running_eval_table = gr.components.Dataframe( - value=running_eval_queue_df, - headers=EVAL_COLS, - datatype=EVAL_TYPES, - row_count=5, - ) - - with gr.Accordion( - f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", - open=False, - ): - with gr.Row(): - pending_eval_table = gr.components.Dataframe( - value=pending_eval_queue_df, - headers=EVAL_COLS, - datatype=EVAL_TYPES, - row_count=5, - ) - with gr.Row(): - gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text") - - with gr.Row(): - with gr.Column(): - model_name_textbox = gr.Textbox(label="Model name") - revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") - model_type = gr.Dropdown( - choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], - label="Model type", - multiselect=False, - value=None, - interactive=True, + with gr.TabItem("📊 SVBench", elem_id="svbench-tab-table", id=1): + with gr.Accordion("Citation", open=False): + citation_button = gr.Textbox( + value=CITATION_BUTTON_TEXT, + label=CITATION_BUTTON_LABEL, + elem_id="citation-button", + lines=10, ) + + gr.Markdown( + TABLE_INTRODUCTION + ) - with gr.Column(): - precision = gr.Dropdown( - choices=[i.value.name for i in Precision if i != Precision.Unknown], - label="Precision", - multiselect=False, - value="float16", - interactive=True, - ) - weight_type = gr.Dropdown( - choices=[i.value.name for i in WeightType], - label="Weights type", - multiselect=False, - value="Original", - interactive=True, - ) - base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") - - submit_button = gr.Button("Submit Eval") - submission_result = gr.Markdown() - submit_button.click( - add_new_eval, - [ - model_name_textbox, - base_model_name_textbox, - revision_name_textbox, - precision, - weight_type, - model_type, - ], - submission_result, + # selection for column part: + checkbox_group = gr.CheckboxGroup( + choices=TASK_INFO, + value=AVG_INFO, + label="Evaluation Dimension", + interactive=True, ) - with gr.Row(): - with gr.Accordion("📙 Citation", open=False): - citation_button = gr.Textbox( - value=CITATION_BUTTON_TEXT, - label=CITATION_BUTTON_LABEL, - lines=20, - elem_id="citation-button", - show_copy_button=True, + search_box = gr.Textbox( + label="Search Model", + placeholder="Enter model name", + interactive=True, + ) + + data_component = gr.components.Dataframe( + value=get_baseline_df, + headers=['Model', 'Type', 'Size'] + AVG_INFO, + type="pandas", + datatype=DATA_TITILE_TYPE, + interactive=False, + visible=True, ) -scheduler = BackgroundScheduler() -scheduler.add_job(restart_space, "interval", seconds=1800) -scheduler.start() -demo.queue(default_concurrency_limit=40).launch() \ No newline at end of file + checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component) + search_box.change(fn=search_model, inputs=[search_box], outputs=data_component) + + # table 2 + with gr.TabItem("📝 About", 
elem_id="svbench-tab-table", id=2): + gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text") + + # table 3 + with gr.TabItem("🚀 Submit here! ", elem_id="-tab-table", id=3): + gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text") + + + + def refresh_data(): + value1 = get_baseline_df() + return value1 + + with gr.Row(): + data_run = gr.Button("Refresh") + with gr.Row(): + result_download = gr.Button("Download Leaderboard") + file_download = gr.File(label="download the csv of leaderboard.", visible=False) + data_run.click(on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component) + result_download.click(lambda: (CSV_DIR, gr.update(visible=True)), inputs=None, outputs=[file_download, file_download]) + +block.launch() \ No newline at end of file diff --git a/constants.py b/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..234c8d230264f7aa745ba1a4cf538baf7bb60b5e --- /dev/null +++ b/constants.py @@ -0,0 +1,42 @@ +import os +# this is .py for store constants +MODEL_INFO = ["Model"] +TASK_INFO = ["F/FPS", "Type", "Size","Dialogue_SA", "Dialogue_CC", "Dialogue_LC", "Dialogue_TU", "Dialogue_IC", "Dialogue_OS", "Streaming_SA", "Streaming_CC", "Streaming_LC", "Streaming_TU", "Streaming_IC", "Streaming_OS", "Average"] +AVG_INFO = ["Type", "Size", "F/FPS", "Dialogue_OS", "Streaming_OS","Average"] + +DATA_TITILE_TYPE = ['str', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'] + +CSV_DIR = "./svbench.csv" + +COLUMN_NAMES = MODEL_INFO + TASK_INFO + +LEADERBORAD_INTRODUCTION = """# SVBench Leaderboard + + Welcome to the leaderboard of the SVBench! 🏆 + + SVBench is a benchmark specifically designed to evaluate the performance of Large Vision-Language Models (LVLMs) in long-context streaming video understanding tasks. This benchmark comprehensively assesses the models' capabilities in handling streaming videos through its unique temporal multi-turn question-answering chains. To facilitate research and development, SVBench provides a detailed leaderboard showcasing the performance results of over a dozen models on this benchmark. By ranking the models based on their performance on SVBench, users can quickly identify models that excel in specific tasks, thereby guiding subsequent research and applications. + Detailed information about SVBench and the leaderboard can be accessed via the following link: [SVBench Benchmark](https://yzy-bupt.github.io/SVBench). The paper is available at: [SVBench Paper](https://arxiv.org/abs/2502.10810). Additionally, the related dataset is hosted on the Hugging Face platform, and researchers can access it at [SVBench Dataset](https://huggingface.co/datasets/yzy666/SVBench) for further experiments and model development. + This leaderboard not only provides a fair competitive environment for current models but also serves as an important reference standard for future model improvements and innovations. + """ + +SUBMIT_INTRODUCTION = """ +# Leaderboard submissions can be made through the following link: [Leaderboard Submission](https://docs.google.com/forms/d/e/1FAIpQLSfz62pGaIdKjmDbOP0vw74dXSiG-2ILJI7gdugdx4pfWSc42Q/viewform). 
+""" +CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" +CITATION_BUTTON_TEXT = """ +@article{yang2025svbench, + title={SVBench: A Benchmark with Temporal Multi-Turn Dialogues for Streaming Video Understanding}, + author={Yang, Zhenyu and Hu, Yuhang and Du, Zemin and Xue, Dizhan and Qian, Shengsheng and Wu, Jiahong and Yang, Fan and Dong, Weiming and Xu, Changsheng}, + journal={arXiv preprint arXiv:2502.10810}, + year={2025} +} +""" + +TABLE_INTRODUCTION = """ + """ + +LEADERBORAD_INFO = """ + Despite the significant advancements of Large Vision-Language Models (LVLMs) on established benchmarks, there remains a notable gap in suitable evaluation regarding their applicability in the emerging domain of long-context streaming video understanding. Current benchmarks for video understanding typically emphasize isolated single-instance text inputs and fail to evaluate the capacity to sustain temporal reasoning throughout the entire duration of video streams. To address these limitations, we introduce SVBench, a pioneering benchmark with temporal multi-turn question-answering chains specifically designed to thoroughly assess the capabilities of streaming video understanding of current LVLMs. We design a semi-automated annotation pipeline to obtain 49,979 Question-Answer (QA) pairs of 1,353 streaming videos, which includes generating QA chains that represent a series of consecutive multi-turn dialogues over video segments and constructing temporal linkages between successive QA chains. Our experimental results, obtained from 14 models in dialogue and streaming evaluations, reveal that while the closed-source GPT-4o outperforms others, most open-source LVLMs struggle with long-context streaming video understanding. We also construct a StreamingChat model, which significantly outperforms open-source LVLMs on our SVBench and achieves comparable performance on diverse vision-language benchmarks. We expect SVBench to advance the research of streaming video understanding by providing a comprehensive and in-depth analysis of current LVLMs. 
+""" + + diff --git a/eval-queue/.gitattributes b/eval-queue/.gitattributes deleted file mode 100644 index 28df5f900b358436f0267334b3e3e9af33f917ba..0000000000000000000000000000000000000000 --- a/eval-queue/.gitattributes +++ /dev/null @@ -1,55 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.lz4 filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -# Audio files - uncompressed -*.pcm filter=lfs diff=lfs merge=lfs -text -*.sam filter=lfs diff=lfs merge=lfs -text -*.raw filter=lfs diff=lfs merge=lfs -text -# Audio files - compressed -*.aac filter=lfs diff=lfs merge=lfs -text -*.flac filter=lfs diff=lfs merge=lfs -text -*.mp3 filter=lfs diff=lfs merge=lfs -text -*.ogg filter=lfs diff=lfs merge=lfs -text -*.wav filter=lfs diff=lfs merge=lfs -text -# Image files - uncompressed -*.bmp filter=lfs diff=lfs merge=lfs -text -*.gif filter=lfs diff=lfs merge=lfs -text -*.png filter=lfs diff=lfs merge=lfs -text -*.tiff filter=lfs diff=lfs merge=lfs -text -# Image files - compressed -*.jpg filter=lfs diff=lfs merge=lfs -text -*.jpeg filter=lfs diff=lfs merge=lfs -text -*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/eval-queue/svbench/.gitattributes b/eval-queue/svbench/.gitattributes deleted file mode 100644 index 28df5f900b358436f0267334b3e3e9af33f917ba..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/.gitattributes +++ /dev/null @@ -1,55 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.lz4 filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz 
filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -# Audio files - uncompressed -*.pcm filter=lfs diff=lfs merge=lfs -text -*.sam filter=lfs diff=lfs merge=lfs -text -*.raw filter=lfs diff=lfs merge=lfs -text -# Audio files - compressed -*.aac filter=lfs diff=lfs merge=lfs -text -*.flac filter=lfs diff=lfs merge=lfs -text -*.mp3 filter=lfs diff=lfs merge=lfs -text -*.ogg filter=lfs diff=lfs merge=lfs -text -*.wav filter=lfs diff=lfs merge=lfs -text -# Image files - uncompressed -*.bmp filter=lfs diff=lfs merge=lfs -text -*.gif filter=lfs diff=lfs merge=lfs -text -*.png filter=lfs diff=lfs merge=lfs -text -*.tiff filter=lfs diff=lfs merge=lfs -text -# Image files - compressed -*.jpg filter=lfs diff=lfs merge=lfs -text -*.jpeg filter=lfs diff=lfs merge=lfs -text -*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/eval-queue/svbench/Flash-VStream.json b/eval-queue/svbench/Flash-VStream.json deleted file mode 100644 index 3cef90a28288e007abe79b6bc374d411161ca571..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/Flash-VStream.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "Flash-VStream", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/GPT-4V.json b/eval-queue/svbench/GPT-4V.json deleted file mode 100644 index abad57297674b2ea9ea86eba74f39047e29dbb99..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/GPT-4V.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "GPT-4V", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "", - "likes": 0, - "params": 0, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/GPT-4o.json b/eval-queue/svbench/GPT-4o.json deleted file mode 100644 index fa82dbcccbdec26a24f4b1c89bbf9f03379b684d..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/GPT-4o.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "GPT-4o", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "", - "likes": 0, - "params": 0, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/Gemini 1.5 Pro.json b/eval-queue/svbench/Gemini 1.5 Pro.json deleted file mode 100644 index bd8c83437c27de23e3b26311e4e6c53710695ccb..0000000000000000000000000000000000000000 --- 
a/eval-queue/svbench/Gemini 1.5 Pro.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "Gemini 1.5 Pro", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "", - "likes": 0, - "params": 0, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/InternLM-XC2.5.json b/eval-queue/svbench/InternLM-XC2.5.json deleted file mode 100644 index 67e3b24ba4d996fa3728073ff3944515489bdf51..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/InternLM-XC2.5.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "InternLM-XC2.5", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/InternVL2.json b/eval-queue/svbench/InternVL2.json deleted file mode 100644 index a50e7213673a82b9638d3956433bd16113ea5177..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/InternVL2.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "InternVL2", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "ImageLLM", - "likes": 0, - "params": 8, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/LLaVA-NeXT-Video.json b/eval-queue/svbench/LLaVA-NeXT-Video.json deleted file mode 100644 index c7907a8de3b69a76ade07a94ac6c259718ea7159..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/LLaVA-NeXT-Video.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "LLaVA-NeXT-Video", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/MiniCPM-V 2.6.json b/eval-queue/svbench/MiniCPM-V 2.6.json deleted file mode 100644 index cb289dcac2a81962156b3dcbf12c5400c80fa459..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/MiniCPM-V 2.6.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "MiniCPM-V 2.6", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "ImageLLM", - "likes": 0, - "params": 8, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/MovieChat.json b/eval-queue/svbench/MovieChat.json deleted file mode 100644 index 3b02f3f274d6fa4aade9f7432b8e1b434183a768..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/MovieChat.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "MovieChat", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/Qwen2-VL.json b/eval-queue/svbench/Qwen2-VL.json deleted file mode 100644 index 159f0f7d4a77d3983ac0cbbd8b5a2373702c6578..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/Qwen2-VL.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "Qwen2-VL", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - 
"status": "FINISHED", - "submitted_time": "", - "model_type": "ImageLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/ShareGPT4Video.json b/eval-queue/svbench/ShareGPT4Video.json deleted file mode 100644 index ced05e66dedd670f9aa693f6e16bd73611c8eaed..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/ShareGPT4Video.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "ShareGPT4Video", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 8, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/TimeChat.json b/eval-queue/svbench/TimeChat.json deleted file mode 100644 index 85e3f6feb91818e059808670fbc743e0ffa90ffe..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/TimeChat.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "TimeChat", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/VILA.json b/eval-queue/svbench/VILA.json deleted file mode 100644 index 92e15069f96180cd5b603c2dc1a174eb339375b1..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/VILA.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "VILA", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "ImageLLM", - "likes": 0, - "params": 8, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/Video-ChatGPT.json b/eval-queue/svbench/Video-ChatGPT.json deleted file mode 100644 index f38cb876f7420fa3c3dc522923d026bcc7576cbc..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/Video-ChatGPT.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "Video-ChatGPT", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/Video-LLaVA.json b/eval-queue/svbench/Video-LLaVA.json deleted file mode 100644 index b555cdff24df450c20fbb2ece93cd6457e491dd4..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/Video-LLaVA.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "Video-LLaVA", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-queue/svbench/VideoLLaMA2.json b/eval-queue/svbench/VideoLLaMA2.json deleted file mode 100644 index 23f9933b413f9c33635cf19ba1f3619c0178687c..0000000000000000000000000000000000000000 --- a/eval-queue/svbench/VideoLLaMA2.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model": "VideoLLaMA2", - "base_model": "", - "revision": "float16", - "precision": "", - "weight_type": "", - "status": "FINISHED", - "submitted_time": "", - "model_type": "VideoLLM", - "likes": 0, - "params": 7, - "license": "", - "private": false -} \ No newline at end of file diff --git a/eval-results/.gitattributes 
b/eval-results/.gitattributes deleted file mode 100644 index 28df5f900b358436f0267334b3e3e9af33f917ba..0000000000000000000000000000000000000000 --- a/eval-results/.gitattributes +++ /dev/null @@ -1,55 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.lz4 filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -# Audio files - uncompressed -*.pcm filter=lfs diff=lfs merge=lfs -text -*.sam filter=lfs diff=lfs merge=lfs -text -*.raw filter=lfs diff=lfs merge=lfs -text -# Audio files - compressed -*.aac filter=lfs diff=lfs merge=lfs -text -*.flac filter=lfs diff=lfs merge=lfs -text -*.mp3 filter=lfs diff=lfs merge=lfs -text -*.ogg filter=lfs diff=lfs merge=lfs -text -*.wav filter=lfs diff=lfs merge=lfs -text -# Image files - uncompressed -*.bmp filter=lfs diff=lfs merge=lfs -text -*.gif filter=lfs diff=lfs merge=lfs -text -*.png filter=lfs diff=lfs merge=lfs -text -*.tiff filter=lfs diff=lfs merge=lfs -text -# Image files - compressed -*.jpg filter=lfs diff=lfs merge=lfs -text -*.jpeg filter=lfs diff=lfs merge=lfs -text -*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/eval-results/svbench/.gitattributes b/eval-results/svbench/.gitattributes deleted file mode 100644 index 28df5f900b358436f0267334b3e3e9af33f917ba..0000000000000000000000000000000000000000 --- a/eval-results/svbench/.gitattributes +++ /dev/null @@ -1,55 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.lz4 filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text 
-*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -# Audio files - uncompressed -*.pcm filter=lfs diff=lfs merge=lfs -text -*.sam filter=lfs diff=lfs merge=lfs -text -*.raw filter=lfs diff=lfs merge=lfs -text -# Audio files - compressed -*.aac filter=lfs diff=lfs merge=lfs -text -*.flac filter=lfs diff=lfs merge=lfs -text -*.mp3 filter=lfs diff=lfs merge=lfs -text -*.ogg filter=lfs diff=lfs merge=lfs -text -*.wav filter=lfs diff=lfs merge=lfs -text -# Image files - uncompressed -*.bmp filter=lfs diff=lfs merge=lfs -text -*.gif filter=lfs diff=lfs merge=lfs -text -*.png filter=lfs diff=lfs merge=lfs -text -*.tiff filter=lfs diff=lfs merge=lfs -text -# Image files - compressed -*.jpg filter=lfs diff=lfs merge=lfs -text -*.jpeg filter=lfs diff=lfs merge=lfs -text -*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/eval-results/svbench/Flash-VStream/results_Flash-VStream.json b/eval-results/svbench/Flash-VStream/results_Flash-VStream.json deleted file mode 100644 index e98b149a1b5ef1cf19262078451dcec52f481428..0000000000000000000000000000000000000000 --- a/eval-results/svbench/Flash-VStream/results_Flash-VStream.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "Flash-VStream", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.3754 - }, - "Dialogue_CC": { - "acc": 0.4474 - }, - "Dialogue_LC": { - "acc": 0.5102 - }, - "Dialogue_TU": { - "acc": 0.4795 - }, - "Dialogue_IC": { - "acc": 0.3794 - }, - "Dialogue_OS": { - "acc": 0.4272 - }, - "Streaming_SA": { - "acc": 0.3571 - }, - "Streaming_CC": { - "acc": 0.4424 - }, - "Streaming_LC": { - "acc": 0.4849 - }, - "Streaming_TU": { - "acc": 0.3895 - }, - "Streaming_IC": { - "acc": 0.3900 - }, - "Streaming_OS": { - "acc": 0.3880 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/GPT-4V/results_GPT-4V.json b/eval-results/svbench/GPT-4V/results_GPT-4V.json deleted file mode 100644 index 915548b6933cd579e86c2d69e83829e654384725..0000000000000000000000000000000000000000 --- a/eval-results/svbench/GPT-4V/results_GPT-4V.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "GPT-4V", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.5603 - }, - "Dialogue_CC": { - "acc": 0.6261 - }, - "Dialogue_LC": { - "acc": 0.6909 - }, - "Dialogue_TU": { - "acc": 0.6536 - }, - "Dialogue_IC": { - "acc": 0.5373 - }, - "Dialogue_OS": { - "acc": 0.6030 - }, - "Streaming_SA": { - "acc": 0.5637 - }, - "Streaming_CC": { - "acc": 0.6141 - }, - "Streaming_LC": { - "acc": 0.6580 - }, - "Streaming_TU": { - "acc": 0.5918 - }, - "Streaming_IC": { - "acc": 0.5716 - }, - "Streaming_OS": { - "acc": 
0.5793 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/GPT-4o/results_GPT-4o.json b/eval-results/svbench/GPT-4o/results_GPT-4o.json deleted file mode 100644 index c3547fd6ab798bfc0fc874d2ddc130d78b404a70..0000000000000000000000000000000000000000 --- a/eval-results/svbench/GPT-4o/results_GPT-4o.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "GPT-4o", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.5826 - }, - "Dialogue_CC": { - "acc": 0.6476 - }, - "Dialogue_LC": { - "acc": 0.7075 - }, - "Dialogue_TU": { - "acc": 0.6768 - }, - "Dialogue_IC": { - "acc": 0.5582 - }, - "Dialogue_OS": { - "acc": 0.6257 - }, - "Streaming_SA": { - "acc": 0.5799 - }, - "Streaming_CC": { - "acc": 0.6352 - }, - "Streaming_LC": { - "acc": 0.6772 - }, - "Streaming_TU": { - "acc": 0.6018 - }, - "Streaming_IC": { - "acc": 0.5925 - }, - "Streaming_OS": { - "acc": 0.5997 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/Gemini 1.5 Pro/results_Gemini 1.5 Pro.json b/eval-results/svbench/Gemini 1.5 Pro/results_Gemini 1.5 Pro.json deleted file mode 100644 index 2d7c5945888957b95deef3e620e47be486ef78b7..0000000000000000000000000000000000000000 --- a/eval-results/svbench/Gemini 1.5 Pro/results_Gemini 1.5 Pro.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "Gemini 1.5 Pro", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.4907 - }, - "Dialogue_CC": { - "acc": 0.5615 - }, - "Dialogue_LC": { - "acc": 0.6224 - }, - "Dialogue_TU": { - "acc": 0.5836 - }, - "Dialogue_IC": { - "acc": 0.4772 - }, - "Dialogue_OS": { - "acc": 0.5368 - }, - "Streaming_SA": { - "acc": 0.4935 - }, - "Streaming_CC": { - "acc": 0.5577 - }, - "Streaming_LC": { - "acc": 0.6041 - }, - "Streaming_TU": { - "acc": 0.5289 - }, - "Streaming_IC": { - "acc": 0.5111 - }, - "Streaming_OS": { - "acc": 0.5155 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/InternLM-XC2.5/results_InternLM-XC2.5.json b/eval-results/svbench/InternLM-XC2.5/results_InternLM-XC2.5.json deleted file mode 100644 index 5ab6dfc177d5a657350c45cfdb76878caf0ddddf..0000000000000000000000000000000000000000 --- a/eval-results/svbench/InternLM-XC2.5/results_InternLM-XC2.5.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "InternLM-XC2.5", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.4651 - }, - "Dialogue_CC": { - "acc": 0.5316 - }, - "Dialogue_LC": { - "acc": 0.5984 - }, - "Dialogue_TU": { - "acc": 0.5294 - }, - "Dialogue_IC": { - "acc": 0.4587 - }, - "Dialogue_OS": { - "acc": 0.5071 - }, - "Streaming_SA": { - "acc": 0.5262 - }, - "Streaming_CC": { - "acc": 0.5855 - }, - "Streaming_LC": { - "acc": 0.6289 - }, - "Streaming_TU": { - "acc": 0.5398 - }, - "Streaming_IC": { - "acc": 0.5439 - }, - "Streaming_OS": { - "acc": 0.5439 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/InternVL2/results_InternVL2.json b/eval-results/svbench/InternVL2/results_InternVL2.json deleted file mode 100644 index bd2f259a27ea007f15cd2b3f6bdb5f2698dee6b5..0000000000000000000000000000000000000000 --- a/eval-results/svbench/InternVL2/results_InternVL2.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "InternVL2", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.4053 - }, - "Dialogue_CC": { - "acc": 0.4677 - }, - "Dialogue_LC": { - "acc": 0.5238 
- }, - "Dialogue_TU": { - "acc": 0.4697 - }, - "Dialogue_IC": { - "acc": 0.4035 - }, - "Dialogue_OS": { - "acc": 0.4448 - }, - "Streaming_SA": { - "acc": 0.3892 - }, - "Streaming_CC": { - "acc": 0.4542 - }, - "Streaming_LC": { - "acc": 0.5045 - }, - "Streaming_TU": { - "acc": 0.4153 - }, - "Streaming_IC": { - "acc": 0.4235 - }, - "Streaming_OS": { - "acc": 0.4162 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/LLaVA-NeXT-Video/results_LLaVA-NeXT-Video.json b/eval-results/svbench/LLaVA-NeXT-Video/results_LLaVA-NeXT-Video.json deleted file mode 100644 index 9834c8e471835b50eb5a09a794e16780e8c76639..0000000000000000000000000000000000000000 --- a/eval-results/svbench/LLaVA-NeXT-Video/results_LLaVA-NeXT-Video.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "LLaVA-NeXT-Video", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.3771 - }, - "Dialogue_CC": { - "acc": 0.4459 - }, - "Dialogue_LC": { - "acc": 0.5205 - }, - "Dialogue_TU": { - "acc": 0.4180 - }, - "Dialogue_IC": { - "acc": 0.3658 - }, - "Dialogue_OS": { - "acc": 0.4140 - }, - "Streaming_SA": { - "acc": 0.3429 - }, - "Streaming_CC": { - "acc": 0.3968 - }, - "Streaming_LC": { - "acc": 0.4765 - }, - "Streaming_TU": { - "acc": 0.3533 - }, - "Streaming_IC": { - "acc": 0.3668 - }, - "Streaming_OS": { - "acc": 0.3612 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/MiniCPM-V 2.6/results_MiniCPM-V 2.6.json b/eval-results/svbench/MiniCPM-V 2.6/results_MiniCPM-V 2.6.json deleted file mode 100644 index 6970f3f16453cbd3e1e020b767e1786bcb81cf2a..0000000000000000000000000000000000000000 --- a/eval-results/svbench/MiniCPM-V 2.6/results_MiniCPM-V 2.6.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "MiniCPM-V 2.6", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.5170 - }, - "Dialogue_CC": { - "acc": 0.5950 - }, - "Dialogue_LC": { - "acc": 0.6533 - }, - "Dialogue_TU": { - "acc": 0.6172 - }, - "Dialogue_IC": { - "acc": 0.5009 - }, - "Dialogue_OS": { - "acc": 0.5663 - }, - "Streaming_SA": { - "acc": 0.4644 - }, - "Streaming_CC": { - "acc": 0.5273 - }, - "Streaming_LC": { - "acc": 0.5835 - }, - "Streaming_TU": { - "acc": 0.5348 - }, - "Streaming_IC": { - "acc": 0.4832 - }, - "Streaming_OS": { - "acc": 0.4967 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/MovieChat/results_MovieChat.json b/eval-results/svbench/MovieChat/results_MovieChat.json deleted file mode 100644 index 6871d9e7d77f4ce3ba01e86a62058d3884201797..0000000000000000000000000000000000000000 --- a/eval-results/svbench/MovieChat/results_MovieChat.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "MovieChat", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.2036 - }, - "Dialogue_CC": { - "acc": 0.2374 - }, - "Dialogue_LC": { - "acc": 0.2897 - }, - "Dialogue_TU": { - "acc": 0.228 - }, - "Dialogue_IC": { - "acc": 0.2051 - }, - "Dialogue_OS": { - "acc": 0.2272 - }, - "Streaming_SA": { - "acc": 0.1892 - }, - "Streaming_CC": { - "acc": 0.2238 - }, - "Streaming_LC": { - "acc": 0.2677 - }, - "Streaming_TU": { - "acc": 0.2046 - }, - "Streaming_IC": { - "acc": 0.2098 - }, - "Streaming_OS": { - "acc": 0.1964 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/Qwen2-VL/results_Qwen2-VL.json b/eval-results/svbench/Qwen2-VL/results_Qwen2-VL.json deleted file mode 100644 index 
34dd7f1383d77a3ad8cbec306b9dba99e6100ab9..0000000000000000000000000000000000000000 --- a/eval-results/svbench/Qwen2-VL/results_Qwen2-VL.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "Qwen2-VL", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.5047 - }, - "Dialogue_CC": { - "acc": 0.5771 - }, - "Dialogue_LC": { - "acc": 0.6346 - }, - "Dialogue_TU": { - "acc": 0.6077 - }, - "Dialogue_IC": { - "acc": 0.4944 - }, - "Dialogue_OS": { - "acc": 0.5529 - }, - "Streaming_SA": { - "acc": 0.4838 - }, - "Streaming_CC": { - "acc": 0.5517 - }, - "Streaming_LC": { - "acc": 0.5991 - }, - "Streaming_TU": { - "acc": 0.5204 - }, - "Streaming_IC": { - "acc": 0.5142 - }, - "Streaming_OS": { - "acc": 0.5139 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/ShareGPT4Video/results_ShareGPT4Video.json b/eval-results/svbench/ShareGPT4Video/results_ShareGPT4Video.json deleted file mode 100644 index 2c923ab0702146170c34c9d8b26b31992bc2b8cf..0000000000000000000000000000000000000000 --- a/eval-results/svbench/ShareGPT4Video/results_ShareGPT4Video.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "ShareGPT4Video", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.3626 - }, - "Dialogue_CC": { - "acc": 0.4368 - }, - "Dialogue_LC": { - "acc": 0.5012 - }, - "Dialogue_TU": { - "acc": 0.4733 - }, - "Dialogue_IC": { - "acc": 0.3725 - }, - "Dialogue_OS": { - "acc": 0.4176 - }, - "Streaming_SA": { - "acc": 0.3314 - }, - "Streaming_CC": { - "acc": 0.4048 - }, - "Streaming_LC": { - "acc": 0.4601 - }, - "Streaming_TU": { - "acc": 0.3815 - }, - "Streaming_IC": { - "acc": 0.3781 - }, - "Streaming_OS": { - "acc": 0.3710 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/TimeChat/results_TimeChat.json b/eval-results/svbench/TimeChat/results_TimeChat.json deleted file mode 100644 index 51e6f423b8e86421ce3932eb7b1d67290906e9e1..0000000000000000000000000000000000000000 --- a/eval-results/svbench/TimeChat/results_TimeChat.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "TimeChat", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.3109 - }, - "Dialogue_CC": { - "acc": 0.3857 - }, - "Dialogue_LC": { - "acc": 0.4552 - }, - "Dialogue_TU": { - "acc": 0.4337 - }, - "Dialogue_IC": { - "acc": 0.3110 - }, - "Dialogue_OS": { - "acc": 0.3624 - }, - "Streaming_SA": { - "acc": 0.2714 - }, - "Streaming_CC": { - "acc": 0.3442 - }, - "Streaming_LC": { - "acc": 0.3978 - }, - "Streaming_TU": { - "acc": 0.3680 - }, - "Streaming_IC": { - "acc": 0.3171 - }, - "Streaming_OS": { - "acc": 0.3115 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/VILA/results_VILA.json b/eval-results/svbench/VILA/results_VILA.json deleted file mode 100644 index 4b131f31aed04b443989c35dc8820527679672d8..0000000000000000000000000000000000000000 --- a/eval-results/svbench/VILA/results_VILA.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "VILA", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.4323 - }, - "Dialogue_CC": { - "acc": 0.4930 - }, - "Dialogue_LC": { - "acc": 0.5559 - }, - "Dialogue_TU": { - "acc": 0.5247 - }, - "Dialogue_IC": { - "acc": 0.4127 - }, - "Dialogue_OS": { - "acc": 0.4707 - }, - "Streaming_SA": { - "acc": 0.3819 - }, - "Streaming_CC": { - "acc": 0.4427 - }, - "Streaming_LC": { - "acc": 0.4918 - }, - 
"Streaming_TU": { - "acc": 0.4129 - }, - "Streaming_IC": { - "acc": 0.4055 - }, - "Streaming_OS": { - "acc": 0.4038 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/Video-ChatGPT/results_Video-ChatGPT.json b/eval-results/svbench/Video-ChatGPT/results_Video-ChatGPT.json deleted file mode 100644 index 4f4bb7b582b7ceb985a75616aca440e051791425..0000000000000000000000000000000000000000 --- a/eval-results/svbench/Video-ChatGPT/results_Video-ChatGPT.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "Video-ChatGPT", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.2801 - }, - "Dialogue_CC": { - "acc": 0.3404 - }, - "Dialogue_LC": { - "acc": 0.4089 - }, - "Dialogue_TU": { - "acc": 0.3566 - }, - "Dialogue_IC": { - "acc": 0.2959 - }, - "Dialogue_OS": { - "acc": 0.3224 - }, - "Streaming_SA": { - "acc": 0.2284 - }, - "Streaming_CC": { - "acc": 0.2844 - }, - "Streaming_LC": { - "acc": 0.3393 - }, - "Streaming_TU": { - "acc": 0.2631 - }, - "Streaming_IC": { - "acc": 0.2643 - }, - "Streaming_OS": { - "acc": 0.2502 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/Video-LLaVA/results_Video-LLaVA.json b/eval-results/svbench/Video-LLaVA/results_Video-LLaVA.json deleted file mode 100644 index db27183a421cc6d82b790ac05ea2b3b27247a238..0000000000000000000000000000000000000000 --- a/eval-results/svbench/Video-LLaVA/results_Video-LLaVA.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "Video-LLaVA", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.3185 - }, - "Dialogue_CC": { - "acc": 0.3838 - }, - "Dialogue_LC": { - "acc": 0.4493 - }, - "Dialogue_TU": { - "acc": 0.4154 - }, - "Dialogue_IC": { - "acc": 0.3280 - }, - "Dialogue_OS": { - "acc": 0.3649 - }, - "Streaming_SA": { - "acc": 0.2695 - }, - "Streaming_CC": { - "acc": 0.3368 - }, - "Streaming_LC": { - "acc": 0.3900 - }, - "Streaming_TU": { - "acc": 0.3183 - }, - "Streaming_IC": { - "acc": 0.3153 - }, - "Streaming_OS": { - "acc": 0.2989 - } - } -} \ No newline at end of file diff --git a/eval-results/svbench/VideoLLaMA2/results_VideoLLaMA2.json b/eval-results/svbench/VideoLLaMA2/results_VideoLLaMA2.json deleted file mode 100644 index 269338aff4ae62837971dacc515c796a534725a0..0000000000000000000000000000000000000000 --- a/eval-results/svbench/VideoLLaMA2/results_VideoLLaMA2.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "config": { - "model_dtype": "torch.float16", - "model_name": "VideoLLaMA2", - "model_sha": "" - }, - "results": { - "Dialogue_SA": { - "acc": 0.4250 - }, - "Dialogue_CC": { - "acc": 0.4988 - }, - "Dialogue_LC": { - "acc": 0.5596 - }, - "Dialogue_TU": { - "acc": 0.5223 - }, - "Dialogue_IC": { - "acc": 0.4140 - }, - "Dialogue_OS": { - "acc": 0.4710 - }, - "Streaming_SA": { - "acc": 0.3895 - }, - "Streaming_CC": { - "acc": 0.4611 - }, - "Streaming_LC": { - "acc": 0.5177 - }, - "Streaming_TU": { - "acc": 0.4369 - }, - "Streaming_IC": { - "acc": 0.4222 - }, - "Streaming_OS": { - "acc": 0.4277 - } - } -} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 3b4737924b5a7d81c962a4e28b66ac6cdcc3b004..0000000000000000000000000000000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,13 +0,0 @@ -[tool.ruff] -# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 
-select = ["E", "F"] -ignore = ["E501"] # line too long (black is taking care of this) -line-length = 119 -fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] - -[tool.isort] -profile = "black" -line_length = 119 - -[tool.black] -line-length = 119 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 3cacab3e9afab55f2ce3493ac25d7a0ea5c96255..0000000000000000000000000000000000000000 --- a/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -APScheduler -black -datasets -gradio -gradio[oauth] -gradio_leaderboard==0.0.13 -gradio_client -huggingface-hub>=0.18.0 -matplotlib -numpy -pandas -python-dateutil -tqdm -transformers -tokenizers>=0.15.0 -sentencepiece \ No newline at end of file diff --git a/src/about.py b/src/about.py deleted file mode 100644 index 8803293869d1b562ae687693612922cc6c96d0c9..0000000000000000000000000000000000000000 --- a/src/about.py +++ /dev/null @@ -1,90 +0,0 @@ -from dataclasses import dataclass -from enum import Enum - -@dataclass -class Task: - benchmark: str - metric: str - col_name: str - - -# Select your tasks here -# --------------------------------------------------- -class Tasks(Enum): - # task_key in the json file, metric_key in the json file, name to display in the leaderboard - task0 = Task("Dialogue_SA", "acc", "Dialogue_SA") - task1 = Task("Dialogue_CC", "acc", "Dialogue_CC") - task2 = Task("Dialogue_LC", "acc", "Dialogue_LC") - task3 = Task("Dialogue_TU", "acc", "Dialogue_TU") - task4 = Task("Dialogue_IC", "acc", "Dialogue_IC") - task5 = Task("Dialogue_OS", "acc", "Dialogue_OS") - task6 = Task("Streaming_SA", "acc", "Streaming_SA") - task7 = Task("Streaming_CC", "acc", "Streaming_CC") - task8 = Task("Streaming_LC", "acc", "Streaming_LC") - task9 = Task("Streaming_TU", "acc", "Streaming_TU") - task10 = Task("Streaming_IC", "acc", "Streaming_IC") - task11 = Task("Streaming_OS", "acc", "Streaming_OS") - -NUM_FEWSHOT = 0 # Change with your few shot -# --------------------------------------------------- - - - -# Your leaderboard name -TITLE = """
-def styled_error(error): - return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>" - - -def styled_warning(warn): - return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>" - - -def styled_message(message): - return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>
" - - -def has_no_nan_values(df, columns): - return df[columns].notna().all(axis=1) - - -def has_nan_values(df, columns): - return df[columns].isna().any(axis=1) diff --git a/src/display/utils.py b/src/display/utils.py deleted file mode 100644 index 0d36924c8d2559f2a0876852e95b2540eb14dffb..0000000000000000000000000000000000000000 --- a/src/display/utils.py +++ /dev/null @@ -1,125 +0,0 @@ -from dataclasses import dataclass, make_dataclass -from enum import Enum - -import pandas as pd - -from src.about import Tasks - -def fields(raw_class): - return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] - - -# These classes are for user facing column names, -# to avoid having to change them all around the code -# when a modif is needed -@dataclass -class ColumnContent: - name: str - type: str - displayed_by_default: bool - hidden: bool = False - never_hidden: bool = False - -## Leaderboard columns -auto_eval_column_dict = [] -# Init -# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)]) -auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) -#Scores -auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]) -for task in Tasks: - auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) -# Model information -auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)]) -# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]) -# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]) -# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)]) -# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)]) -auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]) -# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]) -# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) -# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) - -# We use make dataclass to dynamically fill the scores from Tasks -AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) - -## For the queue columns in the submission tab -@dataclass(frozen=True) -class EvalQueueColumn: # Queue column - model = ColumnContent("model", "markdown", True) - # revision = ColumnContent("revision", "str", True) - # private = ColumnContent("private", "bool", True) - # precision = ColumnContent("precision", "str", True) - # weight_type = ColumnContent("weight_type", "str", "Original") - # status = ColumnContent("status", "str", True) - -## All the model information that we might need -@dataclass -class ModelDetails: - name: str - display_name: str = "" - symbol: str = "" # emoji - -class ModelType(Enum): - VideoLLM = ModelDetails(name="VideoLLM", symbol="🎥") - ImageLLM = ModelDetails(name="ImageLLM", symbol="🖼️") - Unknown = ModelDetails(name="", symbol="?") - - def to_str(self, separator=" "): - return f"{self.value.symbol}{separator}{self.value.name}" - - @staticmethod - def from_str(type): - if "VideoLLM" in type or "🎥" in type: - return 
ModelType.VideoLLM - if "ImageLLM" in type or "🖼️" in type: - return ModelType.ImageLLM - return ModelType.Unknown - -# class ModelType(Enum): -# PT = ModelDetails(name="pretrained", symbol="🟢") -# FT = ModelDetails(name="fine-tuned", symbol="🔶") -# IFT = ModelDetails(name="instruction-tuned", symbol="⭕") -# RL = ModelDetails(name="RL-tuned", symbol="🟦") -# Unknown = ModelDetails(name="", symbol="?") - -# def to_str(self, separator=" "): -# return f"{self.value.symbol}{separator}{self.value.name}" - -# @staticmethod -# def from_str(type): -# if "fine-tuned" in type or "🔶" in type: -# return ModelType.FT -# if "pretrained" in type or "🟢" in type: -# return ModelType.PT -# if "RL-tuned" in type or "🟦" in type: -# return ModelType.RL -# if "instruction-tuned" in type or "⭕" in type: -# return ModelType.IFT -# return ModelType.Unknown - -class WeightType(Enum): - Adapter = ModelDetails("Adapter") - Original = ModelDetails("Original") - Delta = ModelDetails("Delta") - -class Precision(Enum): - float16 = ModelDetails("float16") - bfloat16 = ModelDetails("bfloat16") - Unknown = ModelDetails("?") - - def from_str(precision): - if precision in ["torch.float16", "float16"]: - return Precision.float16 - if precision in ["torch.bfloat16", "bfloat16"]: - return Precision.bfloat16 - return Precision.Unknown - -# Column selection -COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] - -EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] -EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] - -BENCHMARK_COLS = [t.value.col_name for t in Tasks] - diff --git a/src/envs.py b/src/envs.py deleted file mode 100644 index a81f3863ebb9d432671f3f07026723f7ece5f891..0000000000000000000000000000000000000000 --- a/src/envs.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -from huggingface_hub import HfApi - -# Info to change for your repository -# ---------------------------------- -TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org - -OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format! -# ---------------------------------- - -REPO_ID = f"{OWNER}/leaderboard" -QUEUE_REPO = f"{OWNER}/requests" -RESULTS_REPO = f"{OWNER}/results" - -# If you setup a cache later, just change HF_HOME -CACHE_PATH=os.getenv("HF_HOME", ".") - -# Local caches -EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue/svbench") -EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results/svbench") -EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk") -EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk") - -API = HfApi(token=TOKEN) diff --git a/src/leaderboard/read_evals.py b/src/leaderboard/read_evals.py deleted file mode 100644 index bd06afae89a59505bc0248ff09006085665d27d1..0000000000000000000000000000000000000000 --- a/src/leaderboard/read_evals.py +++ /dev/null @@ -1,199 +0,0 @@ -import glob -import json -import math -import os -from dataclasses import dataclass - -import dateutil -import numpy as np - -from src.display.formatting import make_clickable_model -from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType -from src.submission.check_validity import is_model_on_hub - - -@dataclass -class EvalResult: - """Represents one full evaluation. Built from a combination of the result and request file for a given run. 
- """ - eval_name: str # org_model_precision (uid) - full_model: str # org/model (path on hub) - org: str - model: str - revision: str # commit hash, "" if main - results: dict - precision: Precision = Precision.Unknown - model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ... - weight_type: WeightType = WeightType.Original # Original or Adapter - architecture: str = "Unknown" - license: str = "?" - likes: int = 0 - num_params: int = 0 - date: str = "" # submission date of request file - still_on_hub: bool = False - - @classmethod - def init_from_json_file(self, json_filepath): - """Inits the result from the specific model result file""" - with open(json_filepath) as fp: - data = json.load(fp) - - config = data.get("config") - - # Precision - precision = Precision.from_str(config.get("model_dtype")) - - # Get model and org - org_and_model = config.get("model_name", config.get("model_args", None)) - org_and_model = org_and_model.split("/", 1) - - if len(org_and_model) == 1: - org = "svbench" - model = org_and_model[0] - result_key = f"{model}_{precision.value.name}" - else: - org = org_and_model[0] - model = org_and_model[1] - result_key = f"{org}_{model}_{precision.value.name}" - full_model = "/".join(org_and_model) - - still_on_hub, _, model_config = is_model_on_hub( - full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False - ) - architecture = "?" - if model_config is not None: - architectures = getattr(model_config, "architectures", None) - if architectures: - architecture = ";".join(architectures) - - # Extract results available in this file (some results are split in several files) - results = {} - for task in Tasks: - task = task.value - - # We average all scores of a given metric (not all metrics are present in all files) - accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k]) - if accs.size == 0 or any([acc is None for acc in accs]): - continue - - mean_acc = np.mean(accs) * 100.0 - results[task.benchmark] = mean_acc - - return self( - eval_name=result_key, - full_model=full_model, - org=org, - model=model, - results=results, - precision=precision, - revision= config.get("model_sha", ""), - still_on_hub=still_on_hub, - architecture=architecture - ) - - def update_with_request_file(self, requests_path): - """Finds the relevant request file for the current model and updates info with it""" - request_file = get_request_file_for_model(requests_path, self.model, self.precision.value.name) - print("requests_path:",requests_path) - try: - with open(request_file, "r") as f: - request = json.load(f) - print(f"Request file content: {request}") # 调试输出 - self.model_type = ModelType.from_str(request.get("model_type", "")) - self.weight_type = WeightType[request.get("weight_type", "Original")] - self.license = request.get("license", "?") - self.likes = request.get("likes", 0) - self.num_params = request.get("params", "") - self.date = request.get("submitted_time", "") - except FileNotFoundError: - print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}") - except json.JSONDecodeError: - print(f"Error decoding JSON from request file for {self.org}/{self.model} with precision {self.precision.value.name}") - except Exception as e: - print(f"An unexpected error occurred: {e}") - - def to_dict(self): - """Converts the Eval Result to a dict compatible with our dataframe display""" - average = sum([v for v in self.results.values() if v is not None]) / 
len(Tasks) - data_dict = { - "eval_name": self.eval_name, # not a column, just a save name - # AutoEvalColumn.precision.name: self.precision.value.name, - AutoEvalColumn.model_type.name: self.model_type.value.name, - # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, - # AutoEvalColumn.weight_type.name: self.weight_type.value.name, - # AutoEvalColumn.architecture.name: self.architecture, - AutoEvalColumn.model.name: make_clickable_model(self.full_model), - # AutoEvalColumn.revision.name: self.revision, - AutoEvalColumn.average.name: average, - # AutoEvalColumn.license.name: self.license, - # AutoEvalColumn.likes.name: self.likes, - AutoEvalColumn.params.name: self.num_params, - # AutoEvalColumn.still_on_hub.name: self.still_on_hub, - } - - for task in Tasks: - data_dict[task.value.col_name] = self.results[task.value.benchmark] - - return data_dict - - -def get_request_file_for_model(requests_path, model_name, precision): - """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED""" - request_files_pattern = os.path.join(requests_path, f"{model_name}.json") - print(f"Looking for request files with pattern: {request_files_pattern}") # debug output - request_files = glob.glob(request_files_pattern) - - # Select correct request file (precision) - request_file = "" - request_files = sorted(request_files, reverse=True) - for tmp_request_file in request_files: - with open(tmp_request_file, "r") as f: - req_content = json.load(f) - print(f"Checking request file: {tmp_request_file}, Content: {req_content}") # debug output - if ( - req_content["status"] in ["FINISHED"] - and req_content["precision"] == precision.split(".")[-1] - ): - request_file = tmp_request_file - return request_file - - -def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]: - """From the path of the results folder root, extract all needed info for results""" - model_result_filepaths = [] - - for root, _, files in os.walk(results_path): - # We should only have json files in model results - if len(files) == 0 or any([not f.endswith(".json") for f in files]): - continue - - # Sort the files by date - # try: - # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]) - # except dateutil.parser._parser.ParserError: - # files = [files[-1]] - - for file in files: - model_result_filepaths.append(os.path.join(root, file)) - - eval_results = {} - for model_result_filepath in model_result_filepaths: - # Creation of result - eval_result = EvalResult.init_from_json_file(model_result_filepath) - eval_result.update_with_request_file(requests_path) - # Store results of same eval together - eval_name = eval_result.eval_name - if eval_name in eval_results: - eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None}) - else: - eval_results[eval_name] = eval_result - - results = [] - for v in eval_results.values(): - try: - v.to_dict() # we test if the dict version is complete - results.append(v) - except KeyError: # not all eval values present - continue - - return results diff --git a/src/populate.py b/src/populate.py deleted file mode 100644 index 8804f2ee908d901c032c80f94a7e35c121d83196..0000000000000000000000000000000000000000 --- a/src/populate.py +++ /dev/null @@ -1,58 +0,0 @@ -import json -import os -import pprint -import pandas as pd - -from src.display.formatting import has_no_nan_values, make_clickable_model -from src.display.utils import AutoEvalColumn, EvalQueueColumn -from
src.leaderboard.read_evals import get_raw_eval_results - - -def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame: - """Creates a dataframe from all the individual experiment results""" - raw_data = get_raw_eval_results(results_path, requests_path) - all_data_json = [v.to_dict() for v in raw_data] - - df = pd.DataFrame.from_records(all_data_json) - df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False) - df = df[cols].round(decimals=2) - - # filter out if any of the benchmarks have not been produced - df = df[has_no_nan_values(df, benchmark_cols)] - return df - - -def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: - """Creates the different dataframes for the evaluation queue requests""" - entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")] - all_evals = [] - - for entry in entries: - if ".json" in entry: - file_path = os.path.join(save_path, entry) - with open(file_path) as fp: - data = json.load(fp) - - data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) - # data[EvalQueueColumn.revision.name] = data.get("revision", "main") - - all_evals.append(data) - elif ".md" not in entry: - # this is a folder - sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")] - for sub_entry in sub_entries: - file_path = os.path.join(save_path, entry, sub_entry) - with open(file_path) as fp: - data = json.load(fp) - - data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) - data[EvalQueueColumn.revision.name] = data.get("revision", "main") - all_evals.append(data) - - pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]] - running_list = [e for e in all_evals if e["status"] == "RUNNING"] - finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"] - df_pending = pd.DataFrame.from_records(pending_list, columns=cols) - df_running = pd.DataFrame.from_records(running_list, columns=cols) - df_finished = pd.DataFrame.from_records(finished_list, columns=cols) - return df_finished[cols], df_running[cols], df_pending[cols] diff --git a/src/submission/check_validity.py b/src/submission/check_validity.py deleted file mode 100644 index d06ee4c444178e369214fbf33d82e81c6f087850..0000000000000000000000000000000000000000 --- a/src/submission/check_validity.py +++ /dev/null @@ -1,99 +0,0 @@ -import json -import os -import re -from collections import defaultdict -from datetime import datetime, timedelta, timezone - -import huggingface_hub -from huggingface_hub import ModelCard -from huggingface_hub.hf_api import ModelInfo -from transformers import AutoConfig -from transformers.models.auto.tokenization_auto import AutoTokenizer - -def check_model_card(repo_id: str) -> tuple[bool, str]: - """Checks if the model card and license exist and have been filled""" - try: - card = ModelCard.load(repo_id) - except huggingface_hub.utils.EntryNotFoundError: - return False, "Please add a model card to your model to explain how you trained/fine-tuned it." - - # Enforce license metadata - if card.data.license is None: - if not ("license_name" in card.data and "license_link" in card.data): - return False, ( - "License not found. Please add a license to your model card using the `license` metadata or a" - " `license_name`/`license_link` pair."
- ) - - # Enforce card content - if len(card.text) < 200: - return False, "Please add a description to your model card; it is too short." - - return True, "" - -def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, object]: - """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses.""" - try: - config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) - if test_tokenizer: - try: - AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) - except ValueError as e: - return ( - False, - f"uses a tokenizer which is not in a transformers release: {e}", - None - ) - except Exception: - return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None) - return True, None, config - - except ValueError: - return ( - False, - "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.", - None - ) - - except Exception: - return False, "was not found on hub!", None - - -def get_model_size(model_info: ModelInfo, precision: str): - """Gets the model size from the safetensors metadata, or 0 if it cannot be determined.""" - try: - model_size = round(model_info.safetensors["total"] / 1e9, 3) - except (AttributeError, TypeError): - return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py - - size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1 - model_size = size_factor * model_size - return model_size - -def get_model_arch(model_info: ModelInfo): - """Gets the model architecture from the configuration""" - return model_info.config.get("architectures", "Unknown") - -def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict]: - """Gathers the set of already submitted models (and submission dates per organisation) to avoid duplicates""" - depth = 1 - file_names = [] - users_to_submission_dates = defaultdict(list) - - for root, _, files in os.walk(requested_models_dir): - current_depth = root.count(os.sep) - requested_models_dir.count(os.sep) - if current_depth == depth: - for file in files: - if not file.endswith(".json"): - continue - with open(os.path.join(root, file), "r") as f: - info = json.load(f) - file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}") - - # Select organisation - if info["model"].count("/") == 0 or "submitted_time" not in info: - continue - organisation, _ = info["model"].split("/") - users_to_submission_dates[organisation].append(info["submitted_time"]) - - return set(file_names), users_to_submission_dates diff --git a/src/submission/submit.py b/src/submission/submit.py deleted file mode 100644 index cac6ea48e803a0af42dabe5226191c769dbec71d..0000000000000000000000000000000000000000 --- a/src/submission/submit.py +++ /dev/null @@ -1,119 +0,0 @@ -import json -import os -from datetime import datetime, timezone - -from src.display.formatting import styled_error, styled_message, styled_warning -from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO -from src.submission.check_validity import ( - already_submitted_models, - check_model_card, - get_model_size, - is_model_on_hub, -) - -REQUESTED_MODELS = None -USERS_TO_SUBMISSION_DATES = None - -def add_new_eval( - model: str, -
base_model: str, - revision: str, - precision: str, - weight_type: str, - model_type: str, -): - global REQUESTED_MODELS - global USERS_TO_SUBMISSION_DATES - if not REQUESTED_MODELS: - REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH) - - user_name = "" - model_path = model - if "/" in model: - user_name = model.split("/")[0] - model_path = model.split("/")[1] - - precision = precision.split(" ")[0] - current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - - if model_type is None or model_type == "": - return styled_error("Please select a model type.") - - # Default to the main branch if no revision was given - if revision == "": - revision = "main" - - # Is the model on the hub? - if weight_type in ["Delta", "Adapter"]: - base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True) - if not base_model_on_hub: - return styled_error(f'Base model "{base_model}" {error}') - - if weight_type != "Adapter": - model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True) - if not model_on_hub: - return styled_error(f'Model "{model}" {error}') - - # Is the model info correctly filled? - try: - model_info = API.model_info(repo_id=model, revision=revision) - except Exception: - return styled_error("Could not get your model information. Please fill it out properly.") - - model_size = get_model_size(model_info=model_info, precision=precision) - - # Were the model card and license filled? - try: - license = model_info.cardData["license"] - except Exception: - return styled_error("Please select a license for your model.") - - modelcard_OK, error_msg = check_model_card(model) - if not modelcard_OK: - return styled_error(error_msg) - - # Seems good, creating the eval - print("Adding new eval") - - eval_entry = { - "model": model, - "base_model": base_model, - "revision": revision, - "precision": precision, - "weight_type": weight_type, - "status": "PENDING", - "submitted_time": current_time, - "model_type": model_type, - "likes": model_info.likes, - "params": model_size, - "license": license, - "private": False, - } - - # Check for duplicate submission - if f"{model}_{revision}_{precision}" in REQUESTED_MODELS: - return styled_warning("This model has already been submitted.") - - print("Creating eval file") - OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" - os.makedirs(OUT_DIR, exist_ok=True) - out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json" - - with open(out_path, "w") as f: - f.write(json.dumps(eval_entry)) - - print("Uploading eval file") - API.upload_file( - path_or_fileobj=out_path, - path_in_repo=out_path.split("eval-queue/")[1], - repo_id=QUEUE_REPO, - repo_type="dataset", - commit_message=f"Add {model} to eval queue", - ) - - # Remove the local file - os.remove(out_path) - - return styled_message( - "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
- ) diff --git a/svbench.csv b/svbench.csv new file mode 100644 index 0000000000000000000000000000000000000000..e6e7aa16393ed287cf2161f1a0acb78871e6c8a3 --- /dev/null +++ b/svbench.csv @@ -0,0 +1,17 @@ +Model,Type,Size,F/FPS,Dialogue_SA,Dialogue_CC,Dialogue_LC,Dialogue_TU,Dialogue_IC,Dialogue_OS,Streaming_SA,Streaming_CC,Streaming_LC,Streaming_TU,Streaming_IC,Streaming_OS,Average +MovieChat,VideoLLM,7B,2048,20.36,23.74,28.97,22.8,20.51,22.72,18.92,22.38,26.77,20.46,20.98,19.64,21.18 +Video-ChatGPT,VideoLLM,7B,100,28.01,34.04,40.89,35.66,29.59,32.24,22.84,28.44,33.93,26.31,26.43,25.02,28.63 +Video-LLaVA,VideoLLM,7B,8,31.85,38.38,44.93,41.54,32.8,36.49,26.95,33.68,39.0,31.83,31.53,29.89,33.19 +TimeChat,VideoLLM,7B,16,31.09,38.57,45.52,43.37,31.1,36.24,27.14,34.42,39.78,36.8,31.71,31.15,33.7 +LLaVA-NeXT-Video,VideoLLM,7B,16,37.71,44.59,52.05,41.8,36.58,41.4,34.29,39.68,47.65,35.33,36.68,36.12,38.76 +ShareGPT4Video,VideoLLM,8B,16,36.26,43.68,50.12,47.33,37.25,41.76,33.14,40.48,46.01,38.15,37.81,37.1,39.43 +Flash-VStream,VideoLLM,7B,8,37.54,44.74,51.02,47.95,37.94,42.72,35.71,44.24,48.49,38.95,39.0,38.8,40.76 +InternVL2,ImageLLM,8B,8,40.53,46.77,52.38,46.97,40.35,44.48,38.92,45.42,50.45,41.53,42.35,41.62,43.05 +VILA,ImageLLM,8B,8,43.23,49.3,55.59,52.47,41.27,47.07,38.19,44.27,49.18,41.29,40.55,40.38,43.73 +VideoLLaMA2,VideoLLM,7B,8,42.5,49.88,55.96,52.23,41.4,47.1,38.95,46.11,51.77,43.69,42.22,42.77,44.94 +InternLM-XC2.5,VideoLLM,7B,32,46.51,53.16,59.84,52.94,45.87,50.71,52.62,58.55,62.89,53.98,54.39,54.39,52.55 +MiniCPM-V 2.6,ImageLLM,8B,64,51.7,59.5,65.33,61.72,50.09,56.63,46.44,52.73,58.35,53.48,48.32,49.67,53.15 +Qwen2-VL,ImageLLM,7B,8,50.47,57.71,63.46,60.77,49.44,55.29,48.38,55.17,59.91,52.04,51.42,51.39,53.34 +Gemini 1.5 Pro,-,-,1fps,49.07,56.15,62.24,58.36,47.72,53.68,49.35,55.77,60.41,52.89,51.11,51.55,52.62 +GPT-4V,-,-,10,56.03,62.61,69.09,65.36,53.73,60.3,56.37,61.41,65.8,59.18,57.16,57.93,59.12 +GPT-4o,-,-,25,58.26,64.76,70.75,67.68,55.82,62.57,57.99,63.52,67.72,60.18,59.25,59.97,61.27
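The new `svbench.csv` flattens the per-model JSON result files deleted above into a single table: one row per model, the twelve Dialogue/Streaming task scores as percentages, plus Type, Size, the frame count or sampling rate fed to the model (`F/FPS`), and the overall Average. A minimal sketch of how a consumer could load and rank it, assuming the Space now reads the CSV directly; the `load_leaderboard` helper name and the direct-read approach are illustrative assumptions, not code from this repo:

```python
import pandas as pd

# Non-score columns, taken from the CSV header above.
META_COLS = ["Model", "Type", "Size", "F/FPS"]

def load_leaderboard(csv_path: str = "svbench.csv") -> pd.DataFrame:
    """Load the flat results table the way get_leaderboard_df treated the
    JSON files: round the scores and sort by Average, best first."""
    df = pd.read_csv(csv_path)
    score_cols = [c for c in df.columns if c not in META_COLS]
    df[score_cols] = df[score_cols].round(2)  # scores are already percentages
    return df.sort_values(by="Average", ascending=False).reset_index(drop=True)

if __name__ == "__main__":
    # On the data above this prints GPT-4o, GPT-4V, then Qwen2-VL.
    print(load_leaderboard().head(3))
```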