"""
Comprehensive Test Script - Run 20 Verified Tests on Selected Models

All tests use exact-match evaluation for deterministic scoring.
"""

import os
import sys
import json
import time
import pandas as pd
from datetime import datetime
from pathlib import Path
import requests
from typing import Dict, List, Any, Optional

# Make sure the script's own directory is importable, then load the local config module
sys.path.insert(0, str(Path(__file__).parent))

import config

config.ensure_dirs()

OPENROUTER_API_KEY = config.OPENROUTER_API_KEY
OPENROUTER_BASE_URL = config.OPENROUTER_BASE_URL

# Fall back to a hardcoded key if config.py does not provide one
if not OPENROUTER_API_KEY or OPENROUTER_API_KEY == "None":
    OPENROUTER_API_KEY = "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"

print(f"Using API Key: {OPENROUTER_API_KEY[:20]}...")
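
# Assumed interface of config.py (not shown in this file): it is expected to
# expose OPENROUTER_API_KEY (str), OPENROUTER_BASE_URL (str, the chat
# completions endpoint, e.g. "https://openrouter.ai/api/v1/chat/completions"),
# and ensure_dirs(), which creates any output directories the script needs.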

# Which model list to evaluate: "paper_100", "quick_3", "top_10", or "all"
MODEL_SELECTION = "all"

if MODEL_SELECTION == "paper_100":
    try:
        from models_paper_100_20251013_195518 import PAPER_100_MODELS
        MODELS_TO_TEST = PAPER_100_MODELS
        print(f"Using Paper 100 models ({len(MODELS_TO_TEST)} models)")
    except ImportError:
        print("Warning: Could not import Paper 100 models, using quick test set")
        MODELS_TO_TEST = [
            "openai/gpt-4o-mini",
            "anthropic/claude-3.5-sonnet",
            "google/gemini-2.0-flash-exp:free"
        ]
elif MODEL_SELECTION == "quick_3":
    MODELS_TO_TEST = [
        "openai/gpt-4o-mini",
        "anthropic/claude-3.5-sonnet",
        "google/gemini-2.0-flash-exp:free"
    ]
elif MODEL_SELECTION == "top_10":
    MODELS_TO_TEST = [
        "openai/gpt-4o-mini",
        "openai/gpt-4-turbo",
        "anthropic/claude-3.5-sonnet",
        "anthropic/claude-3-opus",
        "google/gemini-2.0-flash-exp:free",
        "meta-llama/llama-3.1-70b-instruct",
        "mistralai/mistral-large",
        "qwen/qwen-2.5-72b-instruct",
        "deepseek/deepseek-chat",
        "x-ai/grok-2"
    ]
else:
    # "all": load the newest verified-models module, with two fallbacks
    imported = False

    import glob
    import importlib

    model_files = sorted(glob.glob("models_verified_working_v2_*.py"), reverse=True)
    if model_files:
        newest_file = model_files[0]
        module_name = newest_file[:-3]  # strip the ".py" extension
        try:
            module = importlib.import_module(module_name)
            MODELS_TO_TEST = module.VERIFIED_MODELS
            print(f"Using all verified models from {module_name} ({len(MODELS_TO_TEST)} models)")
            imported = True
        except (ImportError, AttributeError):
            pass

    if not imported:
        try:
            from models_verified_working_v2_20251008_134157 import VERIFIED_MODELS as MODELS_TO_TEST
            print(f"Using all verified models (fallback) ({len(MODELS_TO_TEST)} models)")
        except ImportError:
            print("Warning: Could not import verified models, using quick test set")
            MODELS_TO_TEST = [
                "openai/gpt-4o-mini",
                "anthropic/claude-3.5-sonnet",
                "google/gemini-2.0-flash-exp:free"
            ]
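
# Model identifiers above are OpenRouter slugs ("provider/model", sometimes
# with a variant suffix such as ":free"); they are passed through unchanged as
# the "model" field of each API request below.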


def load_test_questions():
    """Load the 20 verified test questions"""
    questions_file = Path(__file__).parent / 'questions_20_verified.json'
    if not questions_file.exists():
        print(f"Error: questions_20_verified.json not found at {questions_file}")
        sys.exit(1)

    with open(questions_file, 'r') as f:
        data = json.load(f)
    return data.get('tests', [])
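
# For reference, the field accesses in this script imply that
# questions_20_verified.json looks roughly like the sketch below. This is an
# inferred example, not the actual file contents; only the keys used here
# ('tests', 'id', 'name', 'category', 'prompt', 'expected_answer') are assumed.
#
# {
#   "tests": [
#     {
#       "id": 1,
#       "name": "Example exact-match test",
#       "category": "formatting",
#       "prompt": "Reply with exactly the word OK and nothing else.",
#       "expected_answer": "OK"
#     }
#   ]
# }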


def test_model(model_id: str, test_question: Dict) -> Dict[str, Any]:
    """Test a single model with a single question"""
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/LLM_Instruction_Test",
        "X-Title": "LLM Instruction Following Research"
    }

    payload = {
        "model": model_id,
        "messages": [
            {"role": "user", "content": test_question['prompt']}
        ],
        "temperature": 0.0,
        "max_tokens": 500
    }

    start_time = time.time()
    try:
        response = requests.post(OPENROUTER_BASE_URL, json=payload, headers=headers, timeout=30)
        response_time = time.time() - start_time

        if response.status_code == 200:
            data = response.json()
            content = data.get('choices', [{}])[0].get('message', {}).get('content', '')
            usage = data.get('usage', {})

            return {
                'status': 'success',
                'response': content,
                'response_time': response_time,
                'input_tokens': usage.get('prompt_tokens', 0),
                'output_tokens': usage.get('completion_tokens', 0),
                'total_tokens': usage.get('total_tokens', 0)
            }
        else:
            return {
                'status': 'error',
                'error': f"HTTP {response.status_code}: {response.text[:200]}",
                'response_time': response_time
            }

    except requests.exceptions.Timeout:
        return {
            'status': 'timeout',
            'error': 'Request timed out after 30 seconds',
            'response_time': 30.0
        }
    except Exception as e:
        return {
            'status': 'error',
            'error': str(e),
            'response_time': time.time() - start_time
        }
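
# Note on the parsing above: OpenRouter returns an OpenAI-style chat completion
# body, assumed here to look roughly like
#   {"choices": [{"message": {"content": "..."}}],
#    "usage": {"prompt_tokens": N, "completion_tokens": N, "total_tokens": N}}
# The chained .get() calls keep a malformed body from raising an exception.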


def evaluate_response(response: str, expected: str) -> bool:
    """Evaluate response using exact match (whitespace-stripped on both sides)"""
    if not response:
        return False

    response_clean = response.strip()
    expected_clean = expected.strip()

    return response_clean == expected_clean
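
# Example: evaluate_response("  42\n", "42") is True, while
# evaluate_response("42.", "42") is False (no partial credit).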


def run_comprehensive_test():
    """Run all 20 tests on selected models"""
    test_questions = load_test_questions()

    if not test_questions:
        print("Error: No test questions loaded!")
        sys.exit(1)

    total_tests = len(MODELS_TO_TEST) * len(test_questions)
    results = []
    test_count = 0

    print(f"\n{'='*80}")
    print(f"COMPREHENSIVE EVALUATION: {len(MODELS_TO_TEST)} MODELS × {len(test_questions)} TESTS")
    print(f"{'='*80}")
    print(f"Total tests to run: {total_tests}")
    # Rough estimate assuming ~2 seconds per test (request latency plus the 0.5 s pause below)
    print(f"Estimated time: {total_tests * 2 / 60:.1f} minutes")
    print(f"API Key configured: {OPENROUTER_API_KEY[:20]}...")
    print(f"{'='*80}\n")

    start_time = time.time()

    for model_idx, model_id in enumerate(MODELS_TO_TEST, 1):
        print(f"\n[{model_idx}/{len(MODELS_TO_TEST)}] Testing: {model_id}")
        model_results = []

        for test in test_questions:
            test_count += 1
            print(f" Test {test['id']}: {test['name'][:30]}...", end=' ')

            result = test_model(model_id, test)

            passed = False
            if result['status'] == 'success':
                passed = evaluate_response(result.get('response', ''), test['expected_answer'])

            test_result = {
                'model': model_id,
                'test_id': test['id'],
                'test_name': test['name'],
                'category': test.get('category', ''),
                'passed': passed,
                'status': result['status'],
                'response': result.get('response', '')[:500] if result.get('response') else '',
                'expected': test['expected_answer'][:100],
                'response_time': result.get('response_time', 0),
                'input_tokens': result.get('input_tokens', 0),
                'output_tokens': result.get('output_tokens', 0),
                'error': result.get('error', '')
            }

            results.append(test_result)
            model_results.append(passed)

            if passed:
                print("✅ PASS")
            elif result['status'] == 'success':
                print("❌ FAIL")
            else:
                print(f"⚠️ {result['status'].upper()}")

            # Short pause between requests to stay under rate limits
            time.sleep(0.5)

        # Per-model score
        passed_count = sum(model_results)
        pass_rate = (passed_count / len(test_questions)) * 100
        print(f" Model Score: {passed_count}/{len(test_questions)} ({pass_rate:.1f}%)")

        # Overall progress and ETA after each model
        elapsed = time.time() - start_time
        tests_done = test_count
        tests_remaining = total_tests - tests_done
        if tests_done > 0:
            avg_time = elapsed / tests_done
            eta = tests_remaining * avg_time
            print(f" Progress: {tests_done}/{total_tests} | ETA: {eta/60:.1f} min")

    # Persist results, then print the summary
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    save_results(results, test_questions, timestamp)

    print_summary(results)

    total_time = time.time() - start_time
    print(f"\n✅ Testing complete in {total_time/60:.1f} minutes")

    return results


def save_results(results: List[Dict], test_questions: List[Dict], timestamp: str):
    """Save results to Excel and JSON"""
    df = pd.DataFrame(results)

    # Model x test matrix of pass/fail values
    pivot = df.pivot_table(
        index='model',
        columns='test_id',
        values='passed',
        aggfunc='first'
    )

    # Aggregate pass rates
    model_scores = df.groupby('model')['passed'].mean() * 100
    test_scores = df.groupby('test_id')['passed'].mean() * 100
    category_scores = df.groupby('category')['passed'].mean() * 100

    excel_file = f'comprehensive_20_tests_results_{timestamp}.xlsx'
    with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
        pivot.to_excel(writer, sheet_name='Overview')

        model_scores.sort_values(ascending=False).to_frame('Pass Rate (%)').to_excel(
            writer, sheet_name='Model Rankings'
        )

        test_difficulty = pd.DataFrame({
            'Test ID': test_scores.index,
            'Pass Rate (%)': test_scores.values
        })
        test_difficulty = test_difficulty.merge(
            pd.DataFrame(test_questions)[['id', 'name', 'category']],
            left_on='Test ID', right_on='id'
        )
        test_difficulty.sort_values('Pass Rate (%)').to_excel(
            writer, sheet_name='Test Difficulty', index=False
        )

        category_scores.sort_values().to_frame('Pass Rate (%)').to_excel(
            writer, sheet_name='Category Performance'
        )

        df.to_excel(writer, sheet_name='All Results', index=False)

        test_desc_df = pd.DataFrame(test_questions)
        test_desc_df.to_excel(writer, sheet_name='Test Descriptions', index=False)

    print(f"\n📊 Excel results saved to: {excel_file}")

    json_file = f'comprehensive_20_tests_results_{timestamp}.json'
    with open(json_file, 'w') as f:
        json.dump({
            'metadata': {
                'timestamp': timestamp,
                'total_models': len(MODELS_TO_TEST),
                'total_tests': len(test_questions),
                'total_results': len(results),
                'model_selection': MODEL_SELECTION
            },
            'summary': {
                'overall_pass_rate': float(df['passed'].mean() * 100),
                'best_model': str(model_scores.idxmax()),
                'best_model_score': float(model_scores.max()),
                'hardest_test': int(test_scores.idxmin()),
                'hardest_test_pass_rate': float(test_scores.min())
            },
            'results': results
        }, f, indent=2)

    print(f"📄 JSON results saved to: {json_file}")
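
# The Excel workbook written above contains six sheets: Overview (model x test
# pass/fail matrix), Model Rankings, Test Difficulty, Category Performance,
# All Results (one row per model/test pair), and Test Descriptions.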


def print_summary(results: List[Dict]):
    """Print summary statistics"""
    df = pd.DataFrame(results)

    print(f"\n{'='*80}")
    print("SUMMARY STATISTICS")
    print(f"{'='*80}")

    overall_pass_rate = df['passed'].mean() * 100
    print(f"\nOverall Pass Rate: {overall_pass_rate:.1f}%")

    success_rate = (df['status'] == 'success').mean() * 100
    print(f"API Success Rate: {success_rate:.1f}%")

    model_scores = df.groupby('model')['passed'].mean()
    top_models = model_scores.nlargest(5)
    print("\nTop 5 Models:")
    for model, score in top_models.items():
        passed = df[(df['model'] == model) & df['passed']].shape[0]
        total = df[df['model'] == model].shape[0]
        print(f" {score*100:5.1f}% ({passed}/{total}) - {model}")

    test_scores = df.groupby('test_name')['passed'].mean()
    hardest = test_scores.nsmallest(5)
    print("\nHardest Tests (lowest pass rate):")
    for test, score in hardest.items():
        print(f" {score*100:5.1f}% - {test[:50]}...")

    easiest = test_scores.nlargest(5)
    print("\nEasiest Tests (highest pass rate):")
    for test, score in easiest.items():
        print(f" {score*100:5.1f}% - {test[:50]}...")

    if 'category' in df.columns and df['category'].notna().any():
        category_scores = df.groupby('category')['passed'].mean()
        print("\nPerformance by Category:")
        for category, score in category_scores.sort_values().items():
            if category:
                tests_in_cat = df[df['category'] == category]['test_id'].nunique()
                print(f" {score*100:5.1f}% - {category} ({tests_in_cat} tests)")


if __name__ == "__main__":
    try:
        # Basic sanity check on the API key before running anything
        if not OPENROUTER_API_KEY or len(OPENROUTER_API_KEY) < 20:
            print("Error: OPENROUTER_API_KEY not properly configured")
            print("Please set the API key in config.py or as an environment variable")
            sys.exit(1)

        results = run_comprehensive_test()

    except KeyboardInterrupt:
        print("\n\n⚠️ Test interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
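
# Typical usage (assuming config.py supplies the API key and
# questions_20_verified.json sits next to this script; substitute the actual
# filename this script is saved under):
#
#   python comprehensive_test_script.py
#
# The run prints per-model progress and writes two timestamped files to the
# working directory: comprehensive_20_tests_results_<timestamp>.xlsx and .json.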