Skip to content

Commit 0dfbe3c

Browse files
authored
Merge branch 'release/0.11' into yutji/releasenote-0.11
2 parents 7080dec + e39489c commit 0dfbe3c

File tree

7 files changed

+549
-5
lines changed

7 files changed

+549
-5
lines changed

.github/workflows/build-image.yml

+2
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ jobs:
6868
else
6969
echo "No Docker images found with the specified references."
7070
fi
71+
# Stop running containers whose name contains "build" before pruning. `docker ps -q` prints only hex container IDs, so piping it through `grep build` can never match; filter by name in docker itself. `xargs -r` skips `docker stop` when nothing matches.
sudo docker ps -q --filter "name=build" | xargs -r sudo docker stop
72+
echo y | sudo docker system prune -a --volumes
7173
df -h
7274
- name: Prepare metadata
7375
id: metadata

dockerfile/rocm6.0.x.dockerfile

+5
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,11 @@ RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASL
173173
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
174174
git apply ../megatron_deepspeed_rocm6.patch
175175

176+
# Install AMD SMI Python Library
177+
RUN apt install amd-smi-lib -y && \
178+
cd /opt/rocm/share/amd_smi && \
179+
python3 -m pip install .
180+
176181
ADD . .
177182
ENV USE_HIP_DATATYPE=1
178183
ENV USE_HIPBLAS_COMPUTETYPE=1

docs/user-tutorial/data-diagnosis.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ superbench:
8383
criteria: lambda x:x>0.05
8484
categories: KernelLaunch
8585
metrics:
86-
- kernel-launch/event_overhead:\d+
87-
- kernel-launch/wall_overhead:\d+
86+
- kernel-launch/event_time:\d+
87+
- kernel-launch/wall_time:\d+
8888
rule1:
8989
# Rule 1: If the H2D_Mem_BW or D2H_Mem_BW test result degrades by more than 5%, label it as defective
9090
function: variance

docs/user-tutorial/result-summary.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ superbench:
7070
aggregate: True
7171
categories: KernelLaunch
7272
metrics:
73-
- kernel-launch/event_overhead
74-
- kernel-launch/wall_overhead
73+
- kernel-launch/event_time
74+
- kernel-launch/wall_time
7575
nccl:
7676
statistics: mean
7777
categories: NCCL

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def run(self):
219219
'onnxruntime-gpu; python_version>="3.10"',
220220
],
221221
'nvidia': ['py3nvml>=0.2.6'],
222-
'amd': ['pyrsmi>=1.0.1'],
222+
'amd': ['amdsmi'],
223223
}
224224
),
225225
include_package_data=True,

superbench/config/amd_mi300.yaml

+232
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
# SuperBench Config
2+
version: v0.11
3+
superbench:
4+
enable: null
5+
var:
6+
default_local_mode: &default_local_mode
7+
enable: true
8+
modes:
9+
- name: local
10+
proc_num: 8
11+
prefix: HIP_VISIBLE_DEVICES={proc_rank}
12+
parallel: yes
13+
default_pytorch_mode: &default_pytorch_mode
14+
enable: true
15+
modes:
16+
- name: torch.distributed
17+
proc_num: 8
18+
node_num: 1
19+
frameworks:
20+
- pytorch
21+
common_model_config: &common_model_config
22+
model_ddp_parameter: &model_ddp_param
23+
duration: 0
24+
num_warmup: 128
25+
num_steps: 512
26+
sample_count: 8192
27+
batch_size: 128
28+
precision: [float32, float16]
29+
model_action: [train]
30+
pin_memory: yes
31+
num_workers: 0
32+
benchmarks:
33+
kernel-launch:
34+
<<: *default_local_mode
35+
gemm-flops:
36+
<<: *default_local_mode
37+
parameters:
38+
m: 7680
39+
n: 8192
40+
k: 8192
41+
hipblaslt-gemm:
42+
enable: true
43+
modes:
44+
- name: local
45+
proc_num: 8
46+
prefix: HIP_VISIBLE_DEVICES={proc_rank}
47+
parallel: yes
48+
parameters:
49+
in_types: ["fp32", "fp16", "bf16", 'fp8']
50+
tolerant_fail: yes
51+
num_warmup: 100
52+
num_steps: 1000
53+
shapes:
54+
- 4096,4096,4096
55+
- 8192,8192,8192
56+
- 16384,16384,16384
57+
rccl-bw:
58+
enable: true
59+
modes:
60+
- name: mpi
61+
proc_num: 8
62+
node_num: 1
63+
mca:
64+
pml: ob1
65+
btl: ^openib
66+
btl_tcp_if_exclude: lo,docker0
67+
coll_hcoll_enable: 0
68+
parameters:
69+
maxbytes: 16G
70+
ngpus: 1
71+
operation: allreduce
72+
cpu-memory-bw-latency:
73+
enable: false
74+
modes:
75+
- name: local
76+
proc_num: 1
77+
parallel: no
78+
parameters:
79+
tests:
80+
- bandwidth_matrix
81+
- latency_matrix
82+
- max_bandwidth
83+
mem-bw:
84+
enable: true
85+
modes:
86+
- name: local
87+
proc_num: 8
88+
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
89+
parallel: no
90+
ib-loopback:
91+
enable: true
92+
modes:
93+
- name: local
94+
proc_num: 16
95+
prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
96+
parallel: no
97+
parameters:
98+
msg_size: 8388608
99+
disk-benchmark:
100+
enable: false
101+
modes:
102+
- name: local
103+
proc_num: 1
104+
parallel: no
105+
parameters:
106+
block_devices: []
107+
gpu-copy-bw:correctness:
108+
enable: true
109+
modes:
110+
- name: local
111+
parallel: no
112+
parameters:
113+
mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
114+
copy_type: [sm, dma]
115+
size: 4096
116+
num_warm_up: 0
117+
num_loops: 1
118+
check_data: true
119+
gpu-copy-bw:perf:
120+
enable: true
121+
modes:
122+
- name: local
123+
parallel: no
124+
parameters:
125+
mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
126+
copy_type: [sm, dma]
127+
ib-traffic:
128+
enable: false
129+
modes:
130+
- name: mpi
131+
proc_num: 1
132+
mca:
133+
btl: tcp,self
134+
pml: ob1
135+
btl_tcp_if_include: ens17f0
136+
gpcnet-network-test:
137+
enable: false
138+
modes:
139+
- name: mpi
140+
proc_num: 1
141+
mca:
142+
pml: ucx
143+
btl: ^uct
144+
btl_tcp_if_include: ens17f0
145+
tcp-connectivity:
146+
enable: false
147+
modes:
148+
- name: local
149+
parallel: no
150+
parameters:
151+
port: 22
152+
dist-inference:
153+
modes:
154+
- name: mpi
155+
proc_num: 8
156+
node_num: 1
157+
mca:
158+
pml: ob1
159+
btl: ^openib
160+
btl_tcp_if_exclude: lo,docker0
161+
coll_hcoll_enable: 0
162+
frameworks:
163+
- pytorch
164+
parameters:
165+
num_layers: 50
166+
num_warmup: 20
167+
num_steps: 100
168+
use_cuda_graph: true
169+
precision: float16
170+
hidden_size: 128
171+
input_size: 128
172+
batch_size: 1024
173+
model-benchmarks:gpt:
174+
enable: true
175+
<<: *default_pytorch_mode
176+
models:
177+
- gpt2-small
178+
- gpt2-large
179+
parameters:
180+
<<: *model_ddp_param
181+
precision: [float32, float16, fp8_hybrid]
182+
batch_size: 32
183+
seq_len: 224
184+
model-benchmarks:bert:
185+
enable: true
186+
<<: *default_pytorch_mode
187+
models:
188+
- bert-base
189+
- bert-large
190+
parameters:
191+
<<: *model_ddp_param
192+
precision: [float32, float16, fp8_hybrid]
193+
seq_len: 224
194+
model-benchmarks:lstm:
195+
enable: true
196+
<<: *default_pytorch_mode
197+
models:
198+
- lstm
199+
parameters:
200+
<<: *model_ddp_param
201+
batch_size: 1024
202+
input_size: 224
203+
hidden_size: 1000
204+
seq_len: 32
205+
model-benchmarks:resnet:
206+
enable: true
207+
<<: *default_pytorch_mode
208+
models:
209+
- resnet50
210+
- resnet101
211+
- resnet152
212+
parameters:
213+
<<: *model_ddp_param
214+
batch_size: 384
215+
model-benchmarks:densenet:
216+
enable: true
217+
<<: *default_pytorch_mode
218+
models:
219+
- densenet169
220+
- densenet201
221+
parameters:
222+
<<: *model_ddp_param
223+
model-benchmarks:vgg:
224+
enable: true
225+
<<: *default_pytorch_mode
226+
models:
227+
- vgg11
228+
- vgg13
229+
- vgg16
230+
- vgg19
231+
parameters:
232+
<<: *model_ddp_param

0 commit comments

Comments
 (0)