forked from Mercidaiha/IRT-Router
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnaive_routing_analysis.py
More file actions
104 lines (81 loc) · 3.67 KB
/
Copy pathnaive_routing_analysis.py
File metadata and controls
104 lines (81 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python3
"""
Naive (Random) Routing Analysis
Calculates the expected accuracy of a router that randomly selects
a model for each prompt using probability theory.
For each prompt (original_id):
- If there are N models and K have performance=1.0
- Probability of selecting a correct model = K/N
- Expected accuracy = sum of all probabilities / total prompts
"""
import pandas as pd
import sys
import csv
# Raise the stdlib csv module's per-field size limit to sys.maxsize.
# NOTE(review): presumably meant to allow very long text fields in the input;
# the data itself is loaded with pd.read_csv below, so confirm this setting is
# actually needed for the parser engine in use.
csv.field_size_limit(sys.maxsize)
def analyze_naive_routing(csv_path: str) -> None:
    """Print the expected accuracy of a router that picks a model at random.

    The CSV is expected to hold one row per (prompt, model) result with the
    columns ``original_id`` (prompt identifier), ``llm`` (model name) and
    ``performance`` (a row counts as correct when it equals 1.0).

    For every prompt, the chance that a uniformly random choice among its
    rows is correct is ``correct rows / total rows``.  The expected accuracy
    is the mean of those per-prompt probabilities.  The report also shows
    summary statistics of the probabilities, their distribution by number of
    correct models, and a comparison with an oracle router (one that is
    right whenever at least one model is right).

    Rows whose ``original_id`` is missing cannot be attributed to a prompt;
    they are reported and left out of the per-prompt statistics rather than
    being allowed to turn the result into NaN.

    Args:
        csv_path: Path to the CSV file to analyze.

    Returns:
        None. All results are written to standard output.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    print(f"Loading data from: {csv_path}")
    df = pd.read_csv(csv_path)
    print(f"Total rows: {len(df)}")

    # Fail early with a message that names what the file is lacking.
    required = ("original_id", "llm", "performance")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"CSV is missing required column(s): {', '.join(missing)}")

    orphan_rows = int(df["original_id"].isna().sum())
    if orphan_rows:
        print(f"Ignoring {orphan_rows} row(s) with missing original_id")

    # nunique() skips missing ids, matching the groups formed further down.
    total_prompts = int(df["original_id"].nunique())
    print(f"Unique prompts (original_id): {total_prompts}")

    num_models = len(df["llm"].unique())
    print(f"Unique models: {num_models}")

    if total_prompts == 0:
        # Nothing to average over; avoid dividing by zero below.
        print("No prompts found; nothing to analyze.")
        return

    # One pass over the data: per prompt, count its rows and its correct rows.
    # Casting to int keeps the sums integral; the flag column has no missing
    # values, so "count" is the number of rows recorded for the prompt.
    is_correct = (df["performance"] == 1.0).astype(int)
    prob_df = (
        is_correct.groupby(df["original_id"], sort=False)
        .agg(total_models="count", correct_models="sum")
        .reset_index()
    )
    # P(correct) for a prompt = (# correct models) / (# models for that prompt)
    prob_df["p_correct"] = prob_df["correct_models"] / prob_df["total_models"]

    # Expected total correct = sum of P(correct) over all prompts.
    expected_correct = float(prob_df["p_correct"].sum())
    expected_accuracy = (expected_correct / total_prompts) * 100

    print("\n" + "="*50)
    print("NAIVE (RANDOM) ROUTING ANALYSIS RESULTS")
    print("="*50)
    print(f"Total unique prompts: {total_prompts}")
    print(f"Expected number of correct answers: {expected_correct:.2f}")
    print(f"\nNaive Router Expected Accuracy: {expected_accuracy:.2f}%")
    print("="*50)

    # Additional statistics
    print("\nProbability Distribution Statistics:")
    print(f" Mean P(correct) per prompt: {prob_df['p_correct'].mean():.4f}")
    print(f" Std Dev P(correct): {prob_df['p_correct'].std():.4f}")
    print(f" Min P(correct): {prob_df['p_correct'].min():.4f}")
    print(f" Max P(correct): {prob_df['p_correct'].max():.4f}")

    # Distribution of P(correct) values; groupby yields keys in sorted order.
    print("\nDistribution of P(correct) by correct model count:")
    for k, subset in prob_df.groupby("correct_models"):
        count = len(subset)
        avg_p = subset["p_correct"].mean()
        print(f" {int(k):2d} correct model(s): {count:3d} prompts, P(correct)={avg_p:.4f}")

    # Comparison summary: the oracle succeeds on any prompt with >= 1 correct model.
    print("\n" + "="*50)
    print("COMPARISON WITH ORACLE ROUTING")
    print("="*50)
    oracle_correct = int((prob_df["correct_models"] > 0).sum())
    oracle_accuracy = (oracle_correct / total_prompts) * 100
    print(f"Oracle Router Accuracy: {oracle_accuracy:.2f}%")
    print(f"Naive Router Expected Accuracy: {expected_accuracy:.2f}%")
    print(f"Gap (Oracle - Naive): {oracle_accuracy - expected_accuracy:.2f}%")
    print("="*50)
if __name__ == "__main__":
    # The dataset path is the first command-line argument when one is given;
    # otherwise fall back to the default dataset location.
    cli_args = sys.argv[1:]
    if cli_args:
        csv_path = cli_args[0]
    else:
        csv_path = "new_data/routerarena_irtrouter_format.csv"
    analyze_naive_routing(csv_path)