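The full processing script walks the local data directory, collects every submission's `question.json`, and pushes the assembled dataset to the Hugging Face Hub: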
```python
import os
import json
import pandas as pd
from pathlib import Path

from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push to Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face API token used to authenticate the push
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    # List to store all records
    all_records = []

    # Walk through all subdirectories in data_dir
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    # Read the JSON file
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    # Get the folder path for this record
                    folder_path = os.path.dirname(file_path)

                    # Fix image paths to include the full path
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]
                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten the author_info dictionary into top-level author_* keys
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    # Add the record
                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_records)

    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Ensure all required columns exist with default values
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }
    for col, default_value in required_columns.items():
        if col not in df.columns:
            if isinstance(default_value, list):
                # A bare `df[col] = []` raises on a non-empty frame,
                # so build one empty list per row instead
                df[col] = [list(default_value) for _ in range(len(df))]
            else:
                df[col] = default_value

    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Convert DataFrame to a dict of lists (the format Dataset.from_dict expects)
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    # Create the Dataset directly from the dict
    dataset = Dataset.from_dict(dataset_dict, features=features)

    # Push to the Hub
    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")
    return dataset
```
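For reference, the walk above expects each submission folder to contain a `question.json` whose fields match the columns defined in the function. The sketch below writes one such file; the field names are taken from the code, while the folder layout and values are illustrative assumptions:

```python
# Illustrative sketch only: field names mirror what process_and_push_dataset
# reads; the folder layout and values are made-up examples.
import json
from pathlib import Path

sample = {
    "custom_id": "submission_0001",
    "question": "Which panel shows the anomaly?",
    "question_images": ["question_1.png"],  # resolved relative to this folder
    "question_categories": ["vision"],
    "final_answer": "Panel B",
    "rationale_text": "Only panel B deviates from the baseline.",
    "rationale_images": ["rationale_1.png"],  # image files must exist for the Hub push
    "image_attribution": "Author-created",
    # Optional keys: subquestions_1_text / subquestions_1_answer through
    # subquestions_5_*, which otherwise default to empty strings.
    "author_info": {  # flattened into author_name / author_email_address / author_institution
        "name": "Jane Doe",
        "email_address": "jane@example.com",
        "institution": "Example University",
    },
}

folder = Path("submissions/submission_0001")
folder.mkdir(parents=True, exist_ok=True)
(folder / "question.json").write_text(json.dumps(sample, indent=2), encoding="utf-8")
```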
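And a minimal usage sketch, assuming the submissions live under `submissions/`, the target repo is `your-username/your-dataset`, and the token is read from an `HF_TOKEN` environment variable (all hypothetical names):

```python
import os

dataset = process_and_push_dataset(
    data_dir="submissions",                 # directory scanned by os.walk above
    hub_repo="your-username/your-dataset",  # hypothetical target repo
    token=os.environ["HF_TOKEN"],           # hypothetical token source
    private=True,
)
```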