llm-instruction-following-code / test_comprehensive_20_verified.py
#!/usr/bin/env python3
"""
Comprehensive Test Script - Run 20 Verified Tests on Selected Models
All tests use exact match evaluation for deterministic scoring
"""
import os
import sys
import json
import time
import pandas as pd
from datetime import datetime
from pathlib import Path
import requests
from typing import Dict, List, Any, Optional
# Add current directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
# Import configuration
import config
# Ensure directories exist
config.ensure_dirs()
# API Configuration - read the key from config
OPENROUTER_API_KEY = config.OPENROUTER_API_KEY
OPENROUTER_BASE_URL = config.OPENROUTER_BASE_URL

if not OPENROUTER_API_KEY or OPENROUTER_API_KEY == "None":
    # Fall back to the environment if config does not provide a key.
    # Do not hard-code real API keys in source control.
    OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

print(f"Using API Key: {OPENROUTER_API_KEY[:20]}...")
# Model selection options
MODEL_SELECTION = "all" # Options: "all", "paper_100", "top_10", "quick_3"
# Import models based on selection
if MODEL_SELECTION == "paper_100":
    try:
        from models_paper_100_20251013_195518 import PAPER_100_MODELS
        MODELS_TO_TEST = PAPER_100_MODELS
        print(f"Using Paper 100 models ({len(MODELS_TO_TEST)} models)")
    except ImportError:
        print("Warning: Could not import Paper 100 models, using quick test set")
        MODELS_TO_TEST = [
            "openai/gpt-4o-mini",
            "anthropic/claude-3.5-sonnet",
            "google/gemini-2.0-flash-exp:free"
        ]
elif MODEL_SELECTION == "quick_3":
    MODELS_TO_TEST = [
        "openai/gpt-4o-mini",
        "anthropic/claude-3.5-sonnet",
        "google/gemini-2.0-flash-exp:free"
    ]
elif MODEL_SELECTION == "top_10":
    MODELS_TO_TEST = [
        "openai/gpt-4o-mini",
        "openai/gpt-4-turbo",
        "anthropic/claude-3.5-sonnet",
        "anthropic/claude-3-opus",
        "google/gemini-2.0-flash-exp:free",
        "meta-llama/llama-3.1-70b-instruct",
        "mistralai/mistral-large",
        "qwen/qwen-2.5-72b-instruct",
        "deepseek/deepseek-chat",
        "x-ai/grok-2"
    ]
else:
    # Import all verified models - try the newest file first, then fall back
    imported = False
    # Find the newest verified-models file; the timestamped filenames sort chronologically
    import glob
    import importlib
    model_files = sorted(glob.glob("models_verified_working_v2_*.py"), reverse=True)
    if model_files:
        newest_file = model_files[0]
        module_name = newest_file[:-3]  # Strip the .py extension
        try:
            # importlib is safer than exec() for importing a module named at runtime
            MODELS_TO_TEST = importlib.import_module(module_name).VERIFIED_MODELS
            print(f"Using all verified models from {module_name} ({len(MODELS_TO_TEST)} models)")
            imported = True
        except (ImportError, AttributeError):
            pass
    if not imported:
        # Fall back to a known file
        try:
            from models_verified_working_v2_20251008_134157 import VERIFIED_MODELS as MODELS_TO_TEST
            print(f"Using all verified models (fallback) ({len(MODELS_TO_TEST)} models)")
        except ImportError:
            print("Warning: Could not import verified models, using quick test set")
            MODELS_TO_TEST = [
                "openai/gpt-4o-mini",
                "anthropic/claude-3.5-sonnet",
                "google/gemini-2.0-flash-exp:free"
            ]
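# Each model list imported above is assumed to be a flat list of OpenRouter
# model slugs in "provider/model" form, matching the hard-coded sets above,
# e.g. (illustrative):
#
#   # models_verified_working_v2_<timestamp>.py
#   VERIFIED_MODELS = [
#       "openai/gpt-4o-mini",
#       "anthropic/claude-3.5-sonnet",
#       ...
#   ]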


def load_test_questions():
    """Load the 20 verified test questions"""
    questions_file = Path(__file__).parent / 'questions_20_verified.json'
    if not questions_file.exists():
        print(f"Error: questions_20_verified.json not found at {questions_file}")
        sys.exit(1)
    with open(questions_file, 'r') as f:
        data = json.load(f)
    return data.get('tests', [])
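# questions_20_verified.json is expected to look roughly like the sketch below.
# The prompts themselves are specific to the benchmark; the fields shown are the
# ones this script reads (id, name, category, prompt, expected_answer):
#
#   {
#     "tests": [
#       {
#         "id": 1,
#         "name": "Example exact-match test",
#         "category": "formatting",
#         "prompt": "Reply with exactly the word: hello",
#         "expected_answer": "hello"
#       }
#     ]
#   }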


def test_model(model_id: str, test_question: Dict) -> Dict[str, Any]:
    """Test a single model with a single question"""
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/LLM_Instruction_Test",
        "X-Title": "LLM Instruction Following Research"
    }
    payload = {
        "model": model_id,
        "messages": [
            {"role": "user", "content": test_question['prompt']}
        ],
        "temperature": 0.0,
        "max_tokens": 500
    }
    start_time = time.time()
    try:
        response = requests.post(OPENROUTER_BASE_URL, json=payload, headers=headers, timeout=30)
        response_time = time.time() - start_time
        if response.status_code == 200:
            data = response.json()
            content = data.get('choices', [{}])[0].get('message', {}).get('content', '')
            usage = data.get('usage', {})
            return {
                'status': 'success',
                'response': content,
                'response_time': response_time,
                'input_tokens': usage.get('prompt_tokens', 0),
                'output_tokens': usage.get('completion_tokens', 0),
                'total_tokens': usage.get('total_tokens', 0)
            }
        else:
            return {
                'status': 'error',
                'error': f"HTTP {response.status_code}: {response.text[:200]}",
                'response_time': response_time
            }
    except requests.exceptions.Timeout:
        return {
            'status': 'timeout',
            'error': 'Request timed out after 30 seconds',
            'response_time': 30.0
        }
    except Exception as e:
        return {
            'status': 'error',
            'error': str(e),
            'response_time': time.time() - start_time
        }
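# The parsing above assumes the OpenAI-compatible chat-completions shape that
# OpenRouter returns; roughly (illustrative, fields not read here omitted):
#
#   {
#     "choices": [{"message": {"role": "assistant", "content": "..."}}],
#     "usage": {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15}
#   }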


def evaluate_response(response: str, expected: str) -> bool:
    """Evaluate a response by exact match (whitespace-stripped, case-sensitive)"""
    if not response:
        return False
    # Strip surrounding whitespace from both sides, then require an exact match
    return response.strip() == expected.strip()
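# Exact match is deliberately strict: surrounding whitespace is ignored, but
# case, punctuation, and any extra wording cause a failure. For example:
#
#   evaluate_response("  42\n", "42")    -> True
#   evaluate_response("42.", "42")       -> False
#   evaluate_response("Paris", "paris")  -> False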


def run_comprehensive_test():
    """Run all 20 tests on selected models"""
    # Load test questions
    test_questions = load_test_questions()
    if not test_questions:
        print("Error: No test questions loaded!")
        sys.exit(1)
    total_tests = len(MODELS_TO_TEST) * len(test_questions)
    results = []
    test_count = 0
    print(f"\n{'='*80}")
    print(f"COMPREHENSIVE EVALUATION: {len(MODELS_TO_TEST)} MODELS × {len(test_questions)} TESTS")
    print(f"{'='*80}")
    print(f"Total tests to run: {total_tests}")
    print(f"Estimated time: {total_tests * 2 / 60:.1f} minutes")
    print(f"API Key configured: {OPENROUTER_API_KEY[:20]}...")
    print(f"{'='*80}\n")
    start_time = time.time()
    for model_idx, model_id in enumerate(MODELS_TO_TEST, 1):
        print(f"\n[{model_idx}/{len(MODELS_TO_TEST)}] Testing: {model_id}")
        model_results = []
        for test in test_questions:
            test_count += 1
            print(f" Test {test['id']}: {test['name'][:30]}...", end=' ')
            # Run the test
            result = test_model(model_id, test)
            # Evaluate the response
            passed = False
            if result['status'] == 'success':
                passed = evaluate_response(result.get('response', ''), test['expected_answer'])
            # Store result
            test_result = {
                'model': model_id,
                'test_id': test['id'],
                'test_name': test['name'],
                'category': test.get('category', ''),
                'passed': passed,
                'status': result['status'],
                'response': result.get('response', '')[:500] if result.get('response') else '',
                'expected': test['expected_answer'][:100],
                'response_time': result.get('response_time', 0),
                'input_tokens': result.get('input_tokens', 0),
                'output_tokens': result.get('output_tokens', 0),
                'error': result.get('error', '')
            }
            results.append(test_result)
            model_results.append(passed)
            # Print result
            if passed:
                print("✅ PASS")
            elif result['status'] == 'success':
                print("❌ FAIL")
            else:
                print(f"⚠️ {result['status'].upper()}")
            # Rate limiting protection
            time.sleep(0.5)
        # Model summary
        passed_count = sum(model_results)
        pass_rate = (passed_count / len(test_questions)) * 100
        print(f" Model Score: {passed_count}/{len(test_questions)} ({pass_rate:.1f}%)")
        # Progress update
        elapsed = time.time() - start_time
        tests_done = test_count
        tests_remaining = total_tests - tests_done
        if tests_done > 0:
            avg_time = elapsed / tests_done
            eta = tests_remaining * avg_time
            print(f" Progress: {tests_done}/{total_tests} | ETA: {eta/60:.1f} min")
    # Save results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    save_results(results, test_questions, timestamp)
    # Print summary
    print_summary(results)
    total_time = time.time() - start_time
    print(f"\n✅ Testing complete in {total_time/60:.1f} minutes")
    return results


def save_results(results: List[Dict], test_questions: List[Dict], timestamp: str):
    """Save results to Excel and JSON"""
    # Create DataFrame
    df = pd.DataFrame(results)
    # Create pivot table for overview
    pivot = df.pivot_table(
        index='model',
        columns='test_id',
        values='passed',
        aggfunc='first'
    )
    # Calculate summary statistics
    model_scores = df.groupby('model')['passed'].mean() * 100
    test_scores = df.groupby('test_id')['passed'].mean() * 100
    category_scores = df.groupby('category')['passed'].mean() * 100
    # Save to Excel
    excel_file = f'comprehensive_20_tests_results_{timestamp}.xlsx'
    with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
        # Overview sheet - pass/fail matrix
        pivot.to_excel(writer, sheet_name='Overview')
        # Model rankings
        model_scores.sort_values(ascending=False).to_frame('Pass Rate (%)').to_excel(
            writer, sheet_name='Model Rankings'
        )
        # Test difficulty
        test_difficulty = pd.DataFrame({
            'Test ID': test_scores.index,
            'Pass Rate (%)': test_scores.values
        })
        test_difficulty = test_difficulty.merge(
            pd.DataFrame(test_questions)[['id', 'name', 'category']],
            left_on='Test ID', right_on='id'
        )
        test_difficulty.sort_values('Pass Rate (%)').to_excel(
            writer, sheet_name='Test Difficulty', index=False
        )
        # Category performance
        category_scores.sort_values().to_frame('Pass Rate (%)').to_excel(
            writer, sheet_name='Category Performance'
        )
        # All results
        df.to_excel(writer, sheet_name='All Results', index=False)
        # Test descriptions
        test_desc_df = pd.DataFrame(test_questions)
        test_desc_df.to_excel(writer, sheet_name='Test Descriptions', index=False)
    print(f"\n📊 Excel results saved to: {excel_file}")
    # Also save JSON for programmatic access
    json_file = f'comprehensive_20_tests_results_{timestamp}.json'
    with open(json_file, 'w') as f:
        json.dump({
            'metadata': {
                'timestamp': timestamp,
                'total_models': len(MODELS_TO_TEST),
                'total_tests': len(test_questions),
                'total_results': len(results),
                'model_selection': MODEL_SELECTION
            },
            'summary': {
                'overall_pass_rate': float(df['passed'].mean() * 100),
                'best_model': str(model_scores.idxmax()),
                'best_model_score': float(model_scores.max()),
                'hardest_test': int(test_scores.idxmin()),
                'hardest_test_pass_rate': float(test_scores.min())
            },
            'results': results
        }, f, indent=2)
    print(f"📄 JSON results saved to: {json_file}")


def print_summary(results: List[Dict]):
    """Print summary statistics"""
    df = pd.DataFrame(results)
    print(f"\n{'='*80}")
    print("SUMMARY STATISTICS")
    print(f"{'='*80}")
    # Overall pass rate
    overall_pass_rate = df['passed'].mean() * 100
    print(f"\nOverall Pass Rate: {overall_pass_rate:.1f}%")
    # Success rate (tests that didn't error/timeout)
    success_rate = (df['status'] == 'success').mean() * 100
    print(f"API Success Rate: {success_rate:.1f}%")
    # Top models
    model_scores = df.groupby('model')['passed'].mean()
    top_models = model_scores.nlargest(5)
    print("\nTop 5 Models:")
    for model, score in top_models.items():
        passed = df[(df['model'] == model) & df['passed']].shape[0]
        total = df[df['model'] == model].shape[0]
        print(f" {score*100:5.1f}% ({passed}/{total}) - {model}")
    # Hardest tests
    test_scores = df.groupby('test_name')['passed'].mean()
    hardest = test_scores.nsmallest(5)
    print("\nHardest Tests (lowest pass rate):")
    for test, score in hardest.items():
        print(f" {score*100:5.1f}% - {test[:50]}...")
    # Easiest tests
    easiest = test_scores.nlargest(5)
    print("\nEasiest Tests (highest pass rate):")
    for test, score in easiest.items():
        print(f" {score*100:5.1f}% - {test[:50]}...")
    # Category performance
    if 'category' in df.columns and df['category'].notna().any():
        category_scores = df.groupby('category')['passed'].mean()
        print("\nPerformance by Category:")
        for category, score in category_scores.sort_values().items():
            if category:  # Skip empty categories
                tests_in_cat = df[df['category'] == category]['test_id'].nunique()
                print(f" {score*100:5.1f}% - {category} ({tests_in_cat} tests)")


if __name__ == "__main__":
    try:
        # Verify API key is configured
        if not OPENROUTER_API_KEY or len(OPENROUTER_API_KEY) < 20:
            print("Error: OPENROUTER_API_KEY not properly configured")
            print("Please set the API key in config.py or as an environment variable")
            sys.exit(1)
        # Run the comprehensive test
        results = run_comprehensive_test()
    except KeyboardInterrupt:
        print("\n\n⚠️ Test interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)