forked from SearchScale/vectorsearch-benchmarks
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate-combinations.py
More file actions
199 lines (169 loc) · 10.5 KB
/
Copy pathgenerate-combinations.py
File metadata and controls
199 lines (169 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import itertools
import argparse
import sys
import json
import hashlib
import os
import shutil
def _parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read
            ``sys.argv[1:]``, which is what happens when run as a script.

    Returns:
        The ``argparse.Namespace`` with ``data_dir``, ``datasets``,
        ``sweeps`` and ``configs_dir`` attributes.
    """
    parser = argparse.ArgumentParser(description='Generate combinations script')
    parser.add_argument('--data-dir', required=True, help='Data directory path')
    parser.add_argument('--datasets', required=True, help='Datasets JSON file path')
    parser.add_argument('--sweeps', required=True, help='Sweeps JSON file path')
    parser.add_argument('--configs-dir', required=True, help='Configs output directory path')
    return parser.parse_args(argv)


def _load_json(path):
    """Return the parsed JSON document stored at *path*.

    The file is opened in a ``with`` block so the handle is closed even when
    parsing fails.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _split_params(params, invariants, variants, ignored=()):
    """Sort *params* into fixed values and swept values (both updated in place).

    A list value is a set of alternatives to sweep over and goes to
    *variants*; any other value is fixed and goes to *invariants*. Keys listed
    in *ignored* are skipped entirely.
    """
    for param, value in params.items():
        if param in ignored:
            continue
        if isinstance(value, list):
            variants[param] = value
        else:
            invariants[param] = value


def _short_hash(variants):
    """Return the first 8 hex digits of the MD5 of *variants* as sorted-key JSON."""
    return hashlib.md5(json.dumps(variants, sort_keys=True).encode()).hexdigest()[:8]


def _is_invalid_combination(variants):
    """Report (and log) whether a variant combination must be skipped.

    Only parameters present in *variants* are compared. A combination is
    rejected when ``cagraIntermediateDegree < cagraGraphDegree`` or when
    ``hnswMaxConn > hnswBeamWidth``.
    """
    if 'cagraIntermediateDegree' in variants and 'cagraGraphDegree' in variants:
        if variants['cagraIntermediateDegree'] < variants['cagraGraphDegree']:
            print(f"\t\tSkipping combination: cagraIntermediateDegree ({variants['cagraIntermediateDegree']}) < cagraGraphDegree ({variants['cagraGraphDegree']})")
            return True
    if 'hnswMaxConn' in variants and 'hnswBeamWidth' in variants:
        if variants['hnswMaxConn'] > variants['hnswBeamWidth']:
            print(f"\t\tSkipping combination: hnswMaxConn ({variants['hnswMaxConn']}) > hnswBeamWidth ({variants['hnswBeamWidth']})")
            return True
    return False


def _iter_configs(algo_invariants, algo_variants):
    """Yield ``(hash_id, config)`` for every valid combination of the variants.

    Args:
        algo_invariants: Parameters copied unchanged into every config.
        algo_variants: Mapping of parameter name to the list of values to
            sweep over.

    Yields:
        Tuples of the identifier used in the file name and the config dict.
    """
    # NOTE(review): an algorithm without any list-valued parameter yields no
    # config at all. This matches the previous behaviour; confirm it is wanted.
    if not algo_variants:
        return

    ef_values = algo_variants.get('efSearch')
    if not ef_values:
        # efSearch is absent (or an empty list, which makes the product empty):
        # plain cartesian product, hash taken over the full combination.
        keys = list(algo_variants)
        for combination in itertools.product(*algo_variants.values()):
            current_variants = dict(zip(keys, combination))
            if _is_invalid_combination(current_variants):
                continue
            config = algo_invariants.copy()
            config.update(current_variants)
            yield _short_hash(current_variants), config
        return

    # efSearch is iterated innermost so that all efSearch values of one
    # combination of the remaining parameters are emitted together and share
    # a single base hash (and therefore a single index directory name).
    other = {key: values for key, values in algo_variants.items() if key != 'efSearch'}
    last_index = len(ef_values) - 1
    # With no other variants, product() yields one empty tuple, so the hash is
    # taken over an empty dict.
    for combination in itertools.product(*other.values()):
        other_variants = dict(zip(other, combination))
        # The constraints do not involve efSearch: check once per combination.
        if _is_invalid_combination(other_variants):
            continue
        base_hash = _short_hash(other_variants)
        for ef_index, ef_value in enumerate(ef_values):
            config = algo_invariants.copy()
            config.update(other_variants)
            config['efSearch'] = ef_value
            # Every efSearch value after the first skips indexing.
            if ef_index > 0:
                config['skipIndexing'] = True
            # Only the last of several efSearch values requests cleanup.
            # NOTE(review): with a single efSearch value this stays False,
            # as before; confirm that is intended.
            config['cleanIndexDirectory'] = ef_index > 0 and ef_index == last_index
            # Use base_hash for index directory paths
            if 'hnswIndexDirPath' in config:
                config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
            if 'cuvsIndexDirPath' in config:
                config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"
            yield f"{base_hash}-ef{ef_value}", config


def _write_config(configs_dir, sweep, algo, hash_id, config):
    """Write *config* to ``<configs_dir>/<sweep>/<algo>-<hash_id>.json``."""
    sweep_dir = f"{configs_dir}/{sweep}"
    filepath = f"{sweep_dir}/{algo}-{hash_id}.json"
    os.makedirs(sweep_dir, exist_ok=True)
    with open(filepath, 'w', encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    print(f"\tGenerated config file: {filepath}")


def main(argv=None):
    """Generate one JSON config file per parameter combination of every sweep.

    Reads the sweeps and datasets JSON files, deletes any existing configs
    directory, then for each sweep and each of its algorithms writes the
    configs produced by combining the list-valued parameters.

    Args:
        argv: Optional argument list; ``None`` means ``sys.argv[1:]``.

    Raises:
        KeyError: If a sweep lacks ``dataset`` or names a dataset (or dataset
            field) missing from the datasets file.
    """
    args = _parse_args(argv)

    print("Arguments captured:")
    print(f"data-dir: {args.data_dir}")
    print(f"datasets: {args.datasets}")
    print(f"sweeps: {args.sweeps}")
    print(f"configs-dir: {args.configs_dir}")
    print("----------------------")

    sweeps = _load_json(args.sweeps)
    datasets = _load_json(args.datasets)

    # Clean configs directory
    if os.path.exists(args.configs_dir):
        shutil.rmtree(args.configs_dir)

    for sweep, sweep_spec in sweeps.items():
        # Get dataset information for this sweep
        dataset_name = sweep_spec["dataset"]
        dataset_info = datasets["datasets"][dataset_name]
        dataset_dir = f"{args.data_dir}/{dataset_name}"

        # Dataset-specific parameters are the same for every config of the sweep.
        invariants = {
            "datasetFile": f"{dataset_dir}/{dataset_info['base_file']}",
            "queryFile": f"{dataset_dir}/{dataset_info['query_file']}",
            "groundTruthFile": f"{dataset_dir}/{dataset_info['ground_truth_file']}",
            "vectorDimension": dataset_info["vector_dimension"],
        }
        variants = {}

        print("sweep: " + sweep)
        _split_params(sweep_spec.get("common-params", {}), invariants, variants)

        for algo, algo_params in sweep_spec.get("algorithms", {}).items():
            algo_variants = variants.copy()
            algo_invariants = invariants.copy()
            algo_invariants["algoToRun"] = algo
            # The "params" entry of an algorithm is not copied into configs.
            _split_params(algo_params, algo_invariants, algo_variants, ignored=("params",))

            for hash_id, config in _iter_configs(algo_invariants, algo_variants):
                _write_config(args.configs_dir, sweep, algo, hash_id, config)
        print("----------------------")


if __name__ == "__main__":
    main()