llm-instruction-following-code / test_comprehensive_20_verified.py
#!/usr/bin/env python3
"""
Comprehensive Test Script - Run 20 Verified Tests on Selected Models
All tests use exact match evaluation for deterministic scoring
"""
import os
import sys
import json
import time
import pandas as pd
from datetime import datetime
from pathlib import Path
import requests
from typing import Dict, List, Any, Optional
# Add current directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
# Import configuration
import config
# Ensure directories exist
config.ensure_dirs()
# API Configuration - read the key from config
OPENROUTER_API_KEY = config.OPENROUTER_API_KEY
OPENROUTER_BASE_URL = config.OPENROUTER_BASE_URL

if not OPENROUTER_API_KEY or OPENROUTER_API_KEY == "None":
    # Fall back to the environment if config does not provide a key.
    # Do not hard-code real API keys in source control.
    OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

print(f"Using API Key: {OPENROUTER_API_KEY[:20]}...")
# Model selection options
MODEL_SELECTION = "all" # Options: "all", "paper_100", "top_10", "quick_3"
# Import models based on selection
if MODEL_SELECTION == "paper_100":
    try:
        from models_paper_100_20251013_195518 import PAPER_100_MODELS
        MODELS_TO_TEST = PAPER_100_MODELS
        print(f"Using Paper 100 models ({len(MODELS_TO_TEST)} models)")
    except ImportError:
        print("Warning: Could not import Paper 100 models, using quick test set")
        MODELS_TO_TEST = [
            "openai/gpt-4o-mini",
            "anthropic/claude-3.5-sonnet",
            "google/gemini-2.0-flash-exp:free"
        ]
elif MODEL_SELECTION == "quick_3":
    MODELS_TO_TEST = [
        "openai/gpt-4o-mini",
        "anthropic/claude-3.5-sonnet",
        "google/gemini-2.0-flash-exp:free"
    ]
elif MODEL_SELECTION == "top_10":
    MODELS_TO_TEST = [
        "openai/gpt-4o-mini",
        "openai/gpt-4-turbo",
        "anthropic/claude-3.5-sonnet",
        "anthropic/claude-3-opus",
        "google/gemini-2.0-flash-exp:free",
        "meta-llama/llama-3.1-70b-instruct",
        "mistralai/mistral-large",
        "qwen/qwen-2.5-72b-instruct",
        "deepseek/deepseek-chat",
        "x-ai/grok-2"
    ]
else:
    # Import all verified models - try the newest file first, then fall back
    imported = False
    # Find the newest verified-models file; the timestamped filenames sort chronologically
    import glob
    import importlib
    model_files = sorted(glob.glob("models_verified_working_v2_*.py"), reverse=True)
    if model_files:
        newest_file = model_files[0]
        module_name = newest_file[:-3]  # Strip the .py extension
        try:
            # importlib is safer than exec() for importing a module named at runtime
            MODELS_TO_TEST = importlib.import_module(module_name).VERIFIED_MODELS
            print(f"Using all verified models from {module_name} ({len(MODELS_TO_TEST)} models)")
            imported = True
        except (ImportError, AttributeError):
            pass
    if not imported:
        # Fall back to a known file
        try:
            from models_verified_working_v2_20251008_134157 import VERIFIED_MODELS as MODELS_TO_TEST
            print(f"Using all verified models (fallback) ({len(MODELS_TO_TEST)} models)")
        except ImportError:
            print("Warning: Could not import verified models, using quick test set")
            MODELS_TO_TEST = [
                "openai/gpt-4o-mini",
                "anthropic/claude-3.5-sonnet",
                "google/gemini-2.0-flash-exp:free"
            ]
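# Each model list imported above is assumed to be a flat list of OpenRouter
# model slugs in "provider/model" form, matching the hard-coded sets above,
# e.g. (illustrative):
#
#   # models_verified_working_v2_<timestamp>.py
#   VERIFIED_MODELS = [
#       "openai/gpt-4o-mini",
#       "anthropic/claude-3.5-sonnet",
#       ...
#   ]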


def load_test_questions():
    """Load the 20 verified test questions"""
    questions_file = Path(__file__).parent / 'questions_20_verified.json'
    if not questions_file.exists():
        print(f"Error: questions_20_verified.json not found at {questions_file}")
        sys.exit(1)
    with open(questions_file, 'r') as f:
        data = json.load(f)
    return data.get('tests', [])
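# questions_20_verified.json is expected to look roughly like the sketch below.
# The prompts themselves are specific to the benchmark; the fields shown are the
# ones this script reads (id, name, category, prompt, expected_answer):
#
#   {
#     "tests": [
#       {
#         "id": 1,
#         "name": "Example exact-match test",
#         "category": "formatting",
#         "prompt": "Reply with exactly the word: hello",
#         "expected_answer": "hello"
#       }
#     ]
#   }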


def test_model(model_id: str, test_question: Dict) -> Dict[str, Any]:
    """Test a single model with a single question"""
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/LLM_Instruction_Test",
        "X-Title": "LLM Instruction Following Research"
    }
    payload = {
        "model": model_id,
        "messages": [
            {"role": "user", "content": test_question['prompt']}
        ],
        "temperature": 0.0,
        "max_tokens": 500
    }
    start_time = time.time()
    try:
        response = requests.post(OPENROUTER_BASE_URL, json=payload, headers=headers, timeout=30)
        response_time = time.time() - start_time
        if response.status_code == 200:
            data = response.json()
            content = data.get('choices', [{}])[0].get('message', {}).get('content', '')
            usage = data.get('usage', {})
            return {
                'status': 'success',
                'response': content,
                'response_time': response_time,
                'input_tokens': usage.get('prompt_tokens', 0),
                'output_tokens': usage.get('completion_tokens', 0),
                'total_tokens': usage.get('total_tokens', 0)
            }
        else:
            return {
                'status': 'error',
                'error': f"HTTP {response.status_code}: {response.text[:200]}",
                'response_time': response_time
            }
    except requests.exceptions.Timeout:
        return {
            'status': 'timeout',
            'error': 'Request timed out after 30 seconds',
            'response_time': 30.0
        }
    except Exception as e:
        return {
            'status': 'error',
            'error': str(e),
            'response_time': time.time() - start_time
        }
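# The parsing above assumes the OpenAI-compatible chat-completions shape that
# OpenRouter returns; roughly (illustrative, fields not read here omitted):
#
#   {
#     "choices": [{"message": {"role": "assistant", "content": "..."}}],
#     "usage": {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15}
#   }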


def evaluate_response(response: str, expected: str) -> bool:
    """Evaluate a response by exact match (whitespace-stripped, case-sensitive)"""
    if not response:
        return False
    # Strip surrounding whitespace from both sides, then require an exact match
    return response.strip() == expected.strip()
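# Exact match is deliberately strict: surrounding whitespace is ignored, but
# case, punctuation, and any extra wording cause a failure. For example:
#
#   evaluate_response("  42\n", "42")    -> True
#   evaluate_response("42.", "42")       -> False
#   evaluate_response("Paris", "paris")  -> False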


def run_comprehensive_test():
    """Run all 20 tests on selected models"""
    # Load test questions
    test_questions = load_test_questions()
    if not test_questions:
        print("Error: No test questions loaded!")
        sys.exit(1)
    total_tests = len(MODELS_TO_TEST) * len(test_questions)
    results = []
    test_count = 0
    print(f"\n{'='*80}")
    print(f"COMPREHENSIVE EVALUATION: {len(MODELS_TO_TEST)} MODELS × {len(test_questions)} TESTS")
    print(f"{'='*80}")
    print(f"Total tests to run: {total_tests}")
    print(f"Estimated time: {total_tests * 2 / 60:.1f} minutes")
    print(f"API Key configured: {OPENROUTER_API_KEY[:20]}...")
    print(f"{'='*80}\n")
    start_time = time.time()
    for model_idx, model_id in enumerate(MODELS_TO_TEST, 1):
        print(f"\n[{model_idx}/{len(MODELS_TO_TEST)}] Testing: {model_id}")
        model_results = []
        for test in test_questions:
            test_count += 1
            print(f" Test {test['id']}: {test['name'][:30]}...", end=' ')
            # Run the test
            result = test_model(model_id, test)
            # Evaluate the response
            passed = False
            if result['status'] == 'success':
                passed = evaluate_response(result.get('response', ''), test['expected_answer'])
            # Store result
            test_result = {
                'model': model_id,
                'test_id': test['id'],
                'test_name': test['name'],
                'category': test.get('category', ''),
                'passed': passed,
                'status': result['status'],
                'response': result.get('response', '')[:500] if result.get('response') else '',
                'expected': test['expected_answer'][:100],
                'response_time': result.get('response_time', 0),
                'input_tokens': result.get('input_tokens', 0),
                'output_tokens': result.get('output_tokens', 0),
                'error': result.get('error', '')
            }
            results.append(test_result)
            model_results.append(passed)
            # Print result
            if passed:
                print("✅ PASS")
            elif result['status'] == 'success':
                print("❌ FAIL")
            else:
                print(f"⚠️ {result['status'].upper()}")
            # Rate limiting protection
            time.sleep(0.5)
        # Model summary
        passed_count = sum(model_results)
        pass_rate = (passed_count / len(test_questions)) * 100
        print(f" Model Score: {passed_count}/{len(test_questions)} ({pass_rate:.1f}%)")
        # Progress update
        elapsed = time.time() - start_time
        tests_done = test_count
        tests_remaining = total_tests - tests_done
        if tests_done > 0:
            avg_time = elapsed / tests_done
            eta = tests_remaining * avg_time
            print(f" Progress: {tests_done}/{total_tests} | ETA: {eta/60:.1f} min")
    # Save results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    save_results(results, test_questions, timestamp)
    # Print summary
    print_summary(results)
    total_time = time.time() - start_time
    print(f"\n✅ Testing complete in {total_time/60:.1f} minutes")
    return results


def save_results(results: List[Dict], test_questions: List[Dict], timestamp: str):
    """Save results to Excel and JSON"""
    # Create DataFrame
    df = pd.DataFrame(results)
    # Create pivot table for overview
    pivot = df.pivot_table(
        index='model',
        columns='test_id',
        values='passed',
        aggfunc='first'
    )
    # Calculate summary statistics
    model_scores = df.groupby('model')['passed'].mean() * 100
    test_scores = df.groupby('test_id')['passed'].mean() * 100
    category_scores = df.groupby('category')['passed'].mean() * 100
    # Save to Excel
    excel_file = f'comprehensive_20_tests_results_{timestamp}.xlsx'
    with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
        # Overview sheet - pass/fail matrix
        pivot.to_excel(writer, sheet_name='Overview')
        # Model rankings
        model_scores.sort_values(ascending=False).to_frame('Pass Rate (%)').to_excel(
            writer, sheet_name='Model Rankings'
        )
        # Test difficulty
        test_difficulty = pd.DataFrame({
            'Test ID': test_scores.index,
            'Pass Rate (%)': test_scores.values
        })
        test_difficulty = test_difficulty.merge(
            pd.DataFrame(test_questions)[['id', 'name', 'category']],
            left_on='Test ID', right_on='id'
        )
        test_difficulty.sort_values('Pass Rate (%)').to_excel(
            writer, sheet_name='Test Difficulty', index=False
        )
        # Category performance
        category_scores.sort_values().to_frame('Pass Rate (%)').to_excel(
            writer, sheet_name='Category Performance'
        )
        # All results
        df.to_excel(writer, sheet_name='All Results', index=False)
        # Test descriptions
        test_desc_df = pd.DataFrame(test_questions)
        test_desc_df.to_excel(writer, sheet_name='Test Descriptions', index=False)
    print(f"\n📊 Excel results saved to: {excel_file}")
    # Also save JSON for programmatic access
    json_file = f'comprehensive_20_tests_results_{timestamp}.json'
    with open(json_file, 'w') as f:
        json.dump({
            'metadata': {
                'timestamp': timestamp,
                'total_models': len(MODELS_TO_TEST),
                'total_tests': len(test_questions),
                'total_results': len(results),
                'model_selection': MODEL_SELECTION
            },
            'summary': {
                'overall_pass_rate': float(df['passed'].mean() * 100),
                'best_model': str(model_scores.idxmax()),
                'best_model_score': float(model_scores.max()),
                'hardest_test': int(test_scores.idxmin()),
                'hardest_test_pass_rate': float(test_scores.min())
            },
            'results': results
        }, f, indent=2)
    print(f"📄 JSON results saved to: {json_file}")


def print_summary(results: List[Dict]):
    """Print summary statistics"""
    df = pd.DataFrame(results)
    print(f"\n{'='*80}")
    print("SUMMARY STATISTICS")
    print(f"{'='*80}")
    # Overall pass rate
    overall_pass_rate = df['passed'].mean() * 100
    print(f"\nOverall Pass Rate: {overall_pass_rate:.1f}%")
    # Success rate (tests that didn't error/timeout)
    success_rate = (df['status'] == 'success').mean() * 100
    print(f"API Success Rate: {success_rate:.1f}%")
    # Top models
    model_scores = df.groupby('model')['passed'].mean()
    top_models = model_scores.nlargest(5)
    print("\nTop 5 Models:")
    for model, score in top_models.items():
        passed = df[(df['model'] == model) & df['passed']].shape[0]
        total = df[df['model'] == model].shape[0]
        print(f" {score*100:5.1f}% ({passed}/{total}) - {model}")
    # Hardest tests
    test_scores = df.groupby('test_name')['passed'].mean()
    hardest = test_scores.nsmallest(5)
    print("\nHardest Tests (lowest pass rate):")
    for test, score in hardest.items():
        print(f" {score*100:5.1f}% - {test[:50]}...")
    # Easiest tests
    easiest = test_scores.nlargest(5)
    print("\nEasiest Tests (highest pass rate):")
    for test, score in easiest.items():
        print(f" {score*100:5.1f}% - {test[:50]}...")
    # Category performance
    if 'category' in df.columns and df['category'].notna().any():
        category_scores = df.groupby('category')['passed'].mean()
        print("\nPerformance by Category:")
        for category, score in category_scores.sort_values().items():
            if category:  # Skip empty categories
                tests_in_cat = df[df['category'] == category]['test_id'].nunique()
                print(f" {score*100:5.1f}% - {category} ({tests_in_cat} tests)")


if __name__ == "__main__":
    try:
        # Verify API key is configured
        if not OPENROUTER_API_KEY or len(OPENROUTER_API_KEY) < 20:
            print("Error: OPENROUTER_API_KEY not properly configured")
            print("Please set the API key in config.py or as an environment variable")
            sys.exit(1)
        # Run the comprehensive test
        results = run_comprehensive_test()
    except KeyboardInterrupt:
        print("\n\n⚠️ Test interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)