
Commit 7e6079b

[BENCHMARK WORKFLOW] (#53)
* new workflow to generate summary of the predictions
* changed filename
* added benchmarker helper script
* return self in operator.py
* added unscaled arg to return predictions without scaling
* added nanogpt model
* fixed record_breakdown
* format
* changed var names
* changed order of csv fields
* modified process_results

---------

Co-authored-by: John Calderon <[email protected]>
1 parent cc2f54d commit 7e6079b

10 files changed: +884 additions, -79 deletions
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
name: whl-build-all

on:
  workflow_dispatch:

env:
  DEVICE_PAIRS: L4,A100;L4,T4;T4,A100;L4,V100;V100,A100;T4,V100

jobs:
  create-summary:
    needs: [experiments-t4, experiments-l4, experiments-a100, experiments-v100]
    runs-on: ubuntu-latest
    steps:
      - name: Fetch repository
        uses: actions/checkout@v4

      - name: create directory to gather all csv files
        run: |
          mkdir all_results

      - name: Download All Artifacts
        uses: actions/download-artifact@v4
        with:
          path: all_results
          merge-multiple: true

      - name: check files
        run: |
          ls -R all_results

      - name: generate end-to-end and per operation results
        run: |
          sudo apt install python3-pip -y
          pip3 install pandas
          python3 ./experiments/process_results.py --in-dir all_results --out-e2e out_e2e --out-ops out_ops

      - name: upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-combined
          path: |
            ./experiments/out_e2e/
            ./experiments/out_ops/

  experiments-t4:
    runs-on: [self-hosted, dev, t4]
    steps:
      - name: Fetch repository
        uses: actions/checkout@v4

      - name: run experiments
        run: |
          ./experiments/benchmarker_helper_script.sh python3.10
        env:
          LOCAL_DEVICE: T4

      - name: upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-t4
          path: ./experiments/results/

  experiments-v100:
    runs-on: [self-hosted, dev, v100]
    steps:
      - name: Fetch repository
        uses: actions/checkout@v4

      - name: run experiments
        run: |
          ./experiments/benchmarker_helper_script.sh python3.10
        env:
          LOCAL_DEVICE: V100

      - name: upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-v100
          path: ./experiments/results/

  experiments-l4:
    runs-on: [self-hosted, dev, l4]
    steps:
      - name: Fetch repository
        uses: actions/checkout@v4

      - name: run experiments
        run: |
          ./experiments/benchmarker_helper_script.sh python3.10
        env:
          LOCAL_DEVICE: L4

      - name: upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-l4
          path: ./experiments/results/

  experiments-a100:
    runs-on: [self-hosted, dev, a100]
    steps:
      - name: Fetch repository
        uses: actions/checkout@v4

      - name: run experiments
        run: |
          ./experiments/benchmarker_helper_script.sh python3.10
        env:
          LOCAL_DEVICE: A100

      - name: upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: artifacts-a100
          path: ./experiments/results/
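
The create-summary job fans in the artifacts-* uploads from the four GPU runners and hands them to experiments/process_results.py, which is modified elsewhere in this commit but not shown in this diff. As a rough, hypothetical sketch of the kind of CSV aggregation such a step performs (the helper name combine_results and the column layout are assumptions, not the script's real interface):

from pathlib import Path

import pandas as pd


def combine_results(in_dir: str, out_file: str) -> None:
    # Concatenate every per-device CSV gathered under `in_dir` into one
    # summary CSV. Assumes all devices write the same column layout.
    frames = [pd.read_csv(path) for path in Path(in_dir).rglob("*.csv")]
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(out_file, index=False)


if __name__ == "__main__":
    combine_results("all_results", "summary.csv")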

analyzer/habitat/analysis/arguments.py

Lines changed: 18 additions & 2 deletions
@@ -4,10 +4,12 @@
 class Arguments:
     """
     Stores representations of an operation's arguments.
+    debug_args is used for benchmarking and reporting.
     """
-    def __init__(self, args, kwargs):
+    def __init__(self, args, kwargs, debug_args):
         self.args = args
         self.kwargs = kwargs
+        self.debug_args = debug_args
         self.special = {}

     @classmethod
@@ -17,7 +19,8 @@ def from_raw_arguments(cls, args, kwargs):
             arg_name: _process_argument(arg_value)
             for arg_name, arg_value in kwargs.items()
         }
-        return cls(processed_args, processed_kwargs)
+        debug_args = tuple(map(_debug_process_argument, args))
+        return cls(processed_args, processed_kwargs, debug_args)


 def _process_argument(argument):
@@ -34,3 +37,16 @@ def _process_argument(argument):
         return argument.size()
     else:
         return argument
+
+def _debug_process_argument(argument):
+    """Similar to _process_argument, but used for reporting and debugging purposes."""
+    if isinstance(argument, tuple):
+        return tuple(map(_process_argument, argument))
+
+    if isinstance(argument, list):
+        return list(map(_process_argument, argument))
+
+    if isinstance(argument, torch.Tensor):
+        return argument.size(), argument.dtype
+    else:
+        return argument
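
For a tensor argument, _process_argument keeps only its size, while the new _debug_process_argument also records the dtype; the result is stored in Arguments.debug_args for benchmarking reports. A minimal usage sketch, assuming PyTorch is installed and the module is importable as habitat.analysis.arguments (the import path is an assumption):

import torch

from habitat.analysis.arguments import Arguments  # import path assumed

x = torch.zeros(8, 3, 224, 224, dtype=torch.float16)
args = Arguments.from_raw_arguments((x, 1), {})

print(args.args[0])        # expected: torch.Size([8, 3, 224, 224])
print(args.debug_args[0])  # expected: (torch.Size([8, 3, 224, 224]), torch.float16)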

analyzer/habitat/analysis/mlp/mlp.py

Lines changed: 0 additions & 4 deletions
@@ -148,10 +148,6 @@ def __init__(self, model_name, layers, layer_size, model_path=None):
         self.mu = None
         self.sigma = None

-        # create directory to save model
-        curr_dir = os.getcwd()
-        pathlib.Path(f"{curr_dir}/saved_models/{model_name}").mkdir(exist_ok=True)
-
         if model_path is not None:
             self.load_state(model_path)

analyzer/habitat/analysis/operation.py

Lines changed: 5 additions & 5 deletions
@@ -1,4 +1,4 @@
-
+import warnings

 class Operation:
     """
@@ -79,10 +79,10 @@ def backward(self):
     def device(self):
         return self._device

-    def to_device(self, dest_device, predictor):
+    def to_device(self, dest_device, predictor, unscaled=False):
         if dest_device.name == self._device.name:
-            return self
-        return predictor.predict_operation(self, dest_device)
+            warnings.warn("Predicting to the same device")
+        return predictor.predict_operation(self, dest_device, unscaled)


 class PredictedOperation(Operation):
@@ -121,4 +121,4 @@ def device(self):
     def to_device(self, dest_device, predictor):
         raise RuntimeError(
             'Cannot make a prediction using a predicted operation.',
-        )
+        )

analyzer/habitat/analysis/predictor.py

Lines changed: 29 additions & 14 deletions
@@ -89,7 +89,7 @@ def __init__(
         )


-    def predict_operation(self, operation, dest_device):
+    def predict_operation(self, operation, dest_device, unscaled=False):
         if operation.name not in SPECIAL_OPERATIONS:
             return PredictedOperation(
                 operation,
@@ -100,15 +100,15 @@ def predict_operation(self, operation, dest_device):
         )

         if operation.name == 'conv2d':
-            return self._special_scale(operation, dest_device, self._conv2d_scale)
+            return self._special_scale(operation, dest_device, self._conv2d_scale, unscaled)
         elif operation.name == 'lstm':
-            return self._special_scale(operation, dest_device, self._lstm_scale)
+            return self._special_scale(operation, dest_device, self._lstm_scale, unscaled)
         elif operation.name in ['linear','__matmul__']:
-            return self._special_scale(operation, dest_device, self._linear_scale)
+            return self._special_scale(operation, dest_device, self._linear_scale, unscaled)
         elif operation.name == 'bmm':
-            return self._special_scale(operation, dest_device, self._bmm_scale)
+            return self._special_scale(operation, dest_device, self._bmm_scale, unscaled)
         elif operation.name == 'conv_transpose2d':
-            return self._special_scale(operation, dest_device, self._conv_transpose2d_scale)
+            return self._special_scale(operation, dest_device, self._conv_transpose2d_scale, unscaled)

         logger.warn('Unhandled special operation: %s', operation.name)
         return PredictedOperation(
@@ -139,8 +139,8 @@ def _wave_scale(self, run_time, dest_device):
             device=dest_device,
         )

-    def _special_scale(self, operation, dest_device, scaler):
-        predicted_ms = scaler(operation, dest_device)
+    def _special_scale(self, operation, dest_device, scaler, unscaled=False):
+        predicted_ms = scaler(operation, dest_device, unscaled)

         if predicted_ms < 0:
             logger.warn(
@@ -154,10 +154,10 @@ def _special_scale(self, operation, dest_device, scaler):
             operation,
             RunTimePurePrediction(predicted_ms, dest_device),
             None,
-            dest_device,
+            dest_device
         )

-    def _conv2d_scale(self, operation, dest_device):
+    def _conv2d_scale(self, operation, dest_device, unscaled=False):
         # 1. Merge arguments (give them all names)
         merged = name_all_arguments(
             CONV2D_PARAMS,
@@ -189,9 +189,12 @@ def _conv2d_scale(self, operation, dest_device):
         pred_dest = self.conv2d_pred.predict(arguments, dest_device.name)
         pred_orig = self.conv2d_pred.predict(arguments, operation.device.name)

+        if unscaled:
+            return pred_dest
+
         return operation.run_time_ms * pred_dest / pred_orig

-    def _conv_transpose2d_scale(self, operation, dest_device):
+    def _conv_transpose2d_scale(self, operation, dest_device, unscaled=False):
         # 1. Merge arguments (give them all names)
         merged = name_all_arguments(
             CONVTRANSPOSE2D_PARAMS,
@@ -223,9 +226,12 @@ def _conv_transpose2d_scale(self, operation, dest_device):
         pred_dest = self.conv_transpose2d_pred.predict(arguments, dest_device.name)
         pred_orig = self.conv_transpose2d_pred.predict(arguments, operation.device.name)

+        if unscaled:
+            return pred_dest
+
         return operation.run_time_ms * pred_dest / pred_orig

-    def _linear_scale(self, operation, dest_device):
+    def _linear_scale(self, operation, dest_device, unscaled=False):
         merged = name_all_arguments(
             LINEAR_PARAMS,
             operation.arguments.args,
@@ -259,9 +265,12 @@ def _linear_scale(self, operation, dest_device):
         pred_dest = self.linear_pred.predict(arguments, dest_device.name)
         pred_orig = self.linear_pred.predict(arguments, operation.device.name)

+        if unscaled:
+            return pred_dest
+
         return operation.run_time_ms * pred_dest / pred_orig

-    def _bmm_scale(self, operation, dest_device):
+    def _bmm_scale(self, operation, dest_device, unscaled=False):
         merged = name_all_arguments(
             BMM_PARAMS,
             operation.arguments.args,
@@ -279,9 +288,12 @@ def _bmm_scale(self, operation, dest_device):
         pred_dest = self.bmm_pred.predict(arguments, dest_device.name)
         pred_orig = self.bmm_pred.predict(arguments, operation.device.name)

+        if unscaled:
+            return pred_dest
+
         return operation.run_time_ms * pred_dest / pred_orig

-    def _lstm_scale(self, operation, dest_device):
+    def _lstm_scale(self, operation, dest_device, unscaled=False):
         # This is hacky, but unfortunately the only way to differentiate these
         # overloaded LSTM calls.
         has_batch_sizes = isinstance(operation.arguments.args[4], bool)
@@ -324,4 +336,7 @@ def _lstm_scale(self, operation, dest_device):
         pred_dest = self.lstm_pred.predict(arguments, dest_device.name)
         pred_orig = self.lstm_pred.predict(arguments, operation.device.name)

+        if unscaled:
+            return pred_dest
+
         return operation.run_time_ms * pred_dest / pred_orig
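
All of the *_scale helpers follow the same pattern: the per-operation MLP predicts a run time for both the destination device and the device the operation was actually measured on, and the measured run time is scaled by that ratio; with the new unscaled=True flag the raw destination-device prediction is returned instead. A minimal sketch of that arithmetic, with made-up numbers (illustrative only, not the library's API):

def scale_run_time(measured_ms, pred_orig_ms, pred_dest_ms, unscaled=False):
    # Mirrors the logic used by the *_scale helpers above.
    if unscaled:
        # Raw MLP prediction for the destination device, no calibration
        # against the measured run time.
        return pred_dest_ms
    # Calibrate the MLP's cross-device ratio with the measured run time.
    return measured_ms * pred_dest_ms / pred_orig_ms


# Hypothetical numbers: an op measured at 2.0 ms on the source GPU, where the
# MLP predicts 1.6 ms on the source and 0.8 ms on the destination.
print(scale_run_time(2.0, 1.6, 0.8))                 # 1.0  (scaled prediction)
print(scale_run_time(2.0, 1.6, 0.8, unscaled=True))  # 0.8  (raw MLP output)
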
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#!/bin/bash

PYTHON_VERSION=$1
VENV_NAME=venv_${PYTHON_VERSION}

python3 -m virtualenv ${VENV_NAME} -p ${PYTHON_VERSION}
ln -s /usr/bin/${PYTHON_VERSION}-config ${VENV_NAME}/bin/python3-config

source ${VENV_NAME}/bin/activate

rm -r cpp/build analyzer/habitat/*.so
git submodule update --init --recursive
git lfs pull

pushd analyzer
./install-dev.sh
popd

pushd experiments

device_pairs_list=()
IFS=';' read -ra input_devices <<< "${DEVICE_PAIRS}"

for i in "${input_devices[@]}"; do
    IFS=',' read -ra pair <<< "${i}"
    orig=${pair[0]}
    dest=${pair[1]}
    device_pairs_list+=("${orig},${dest}" "${dest},${orig}")
done

for j in "${device_pairs_list[@]}"; do
    IFS=',' read -ra pair <<< "${j}"
    orig=${pair[0]}
    dest=${pair[1]}
    if [ "${orig}" == "${LOCAL_DEVICE}" ]; then
        python model_eval_per_device.py ${orig} ${dest}
    fi
done

popd

pushd analyzer/habitat/data
find -iname "model.pth" | xargs sha256sum

popd

deactivate
rm -r ${VENV_NAME}
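
Each self-hosted runner exports LOCAL_DEVICE for its own GPU, so the script expands DEVICE_PAIRS in both directions and then evaluates only the pairs that originate on that GPU. The same selection logic restated in Python for clarity (illustrative, not part of the commit):

DEVICE_PAIRS = "L4,A100;L4,T4;T4,A100;L4,V100;V100,A100;T4,V100"
LOCAL_DEVICE = "T4"

# Expand every pair in both directions, then keep only the pairs whose
# origin matches the GPU this runner actually has.
expanded = []
for entry in DEVICE_PAIRS.split(";"):
    orig, dest = entry.split(",")
    expanded += [(orig, dest), (dest, orig)]

to_run = [(o, d) for o, d in expanded if o == LOCAL_DEVICE]
print(to_run)  # [('T4', 'L4'), ('T4', 'A100'), ('T4', 'V100')]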
