from pathlib import Path

from leaderboard_tab import (
    create_leaderboard_tab,
    search_leaderboard,
    update_columns_to_show,
)
from utils import load_json_results
# Constants
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation

The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:

### Web Search Dataset Metrics
- **MRR (Mean Reciprocal Rank)**: Measures ranking quality by focusing on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates ranking quality across all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5

### Model Requirements
- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with the `sentence-transformers` library

### Evaluation Process
1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
   - MRR for the position of the first relevant result
   - nDCG for overall ranking quality
   - Recall@5 for top-results accuracy
3. The three metrics are averaged to calculate the overall score
4. Models are ranked by their overall score

### How to Prepare Your Model
- Ensure your model is publicly available on the Hugging Face Hub (we don't support private model evaluations yet)
- The model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
"""
# Global variables
retrieval_df = None


def load_retrieval_leaderboard():
    """Load and prepare the retrieval leaderboard data."""
    global retrieval_df

    # Prepare the retrieval dataframe and prepend a 1-based Rank column
    dataframe_path = Path(__file__).parent / "results" / "retrieval_results.json"
    retrieval_df = load_json_results(
        dataframe_path, True, "Average Score", drop_cols=["Revision", "Task"]
    )
    retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
    return retrieval_df

def retrieval_search_leaderboard(model_name, columns_to_show):
    """Search function for the retrieval leaderboard."""
    return search_leaderboard(retrieval_df, model_name, columns_to_show)


def update_retrieval_columns_to_show(columns_to_show):
    """Update the displayed columns for the retrieval leaderboard."""
    return update_columns_to_show(retrieval_df, columns_to_show)

def create_retrieval_tab():
    """Create the complete retrieval leaderboard tab."""
    global retrieval_df

    # Load data if not already loaded
    if retrieval_df is None:
        retrieval_df = load_retrieval_leaderboard()

    # Default columns to show
    default_columns = [
        "Rank",
        "Model",
        "Average Score",
        "Model Size (MB)",
        "Context Length",
        "Embedding Dimension",
        "Web Search Dataset",
        "Islamic Knowledge Dataset",
    ]

    # Create and return the tab
    return create_leaderboard_tab(
        df=retrieval_df,
        initial_columns_to_show=default_columns,
        search_function=retrieval_search_leaderboard,
        update_function=update_retrieval_columns_to_show,
        about_section=RETRIEVAL_ABOUT_SECTION,
        task_type="Retriever",
    )
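

# Hedged usage sketch, not part of the original file: how this tab might be
# mounted in a Gradio app. It assumes `create_leaderboard_tab` builds its
# components inside the active Blocks context (the usual Gradio pattern);
# the tab label is a placeholder.
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Tab("Retrieval Leaderboard"):
            create_retrieval_tab()
    demo.launch()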