-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: remove old patches around reactant bug (#1135)
- Loading branch information
Showing
2 changed files
with
1 addition
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "Lux" | ||
uuid = "b2108857-7c20-44ae-9111-449ecde12c47" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
-version = "1.4.1" | ||
+version = "1.4.2" | ||
|
||
[deps] | ||
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1 @@ | ||
# For some reason xlogx and xlogy with boolean inputs leads to incorrect results sometimes | ||
# XXX: Once https://github.com/EnzymeAD/Reactant.jl/pull/278 is merged and tagged | ||
LuxOps.xlogx(x::TracedRNumber{Bool}) = zero(x) | ||
|
||
function LuxOps.xlogy(x::TracedRNumber, y::TracedRNumber) | ||
return invoke(LuxOps.xlogy, Tuple{Number, Number}, float(x), float(y)) | ||
end |
d962073
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
d962073
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/121434
Tip: Release Notes
Did you know you can add release notes too? Just add markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. For example:
    @JuliaRegistrator register

    Release notes:

    ## Breaking changes

    - blah
To add them here, just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:
    git tag -a v1.4.2 -m "<description of version>" d962073
    git push origin v1.4.2
d962073
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
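Lux Benchmarks
(Each entry below spans four lines: benchmark name, current time in ns, previous time in ns, and the ratio current/previous; the "ns" unit of each time is fused onto the start of the following line.)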
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3833
ns4041
ns0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4250
ns5209
ns0.82
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4666
ns5333
ns0.87
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4041.5
ns3937.5
ns1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10459
ns10250
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10417
ns11083
ns0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10083
ns11375
ns0.89
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10625
ns10542
ns1.01
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1125
ns1167
ns0.96
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1375
ns1292
ns1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1375
ns1416
ns0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1208
ns1167
ns1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
3958
ns4020.5
ns0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4125
ns4250
ns0.97
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4208
ns4000
ns1.05
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3958
ns4166
ns0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57917
ns70208
ns0.82
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46459
ns58667
ns0.79
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46750
ns64125
ns0.73
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82708
ns79750
ns1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2047958
ns2033104
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2090000
ns2103708
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2093917
ns2094916
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1976812.5
ns2002834
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
146708
ns184125
ns0.80
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
182667
ns189792
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
145833
ns186063
ns0.78
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
143583
ns185125
ns0.78
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1151625.5
ns1118896
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1117646
ns1163979
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1124084
ns1120500
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1165146
ns1129854
ns1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3500
ns3375
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4083
ns3917
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4042
ns5041
ns0.80
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3916
ns3333.5
ns1.17
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9083
ns9166
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9166
ns9125
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9125
ns9042
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8854.5
ns8625
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17334
ns19084
ns0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18542
ns15375
ns1.21
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
17834
ns18375
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16333
ns14625
ns1.12
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
214916.5
ns225917
ns0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
214541
ns214542
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213500
ns215125
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
220667
ns213000
ns1.04
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
542
ns500
ns1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
625
ns791
ns0.79
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
583
ns750
ns0.78
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
625
ns541
ns1.16
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1458
ns1417
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1750
ns1542
ns1.13
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1458
ns1833
ns0.80
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1625
ns1667
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
6208
ns8917
ns0.70
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5958
ns6417
ns0.93
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6000
ns8042
ns0.75
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10208
ns10334
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221042
ns233625
ns0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
228959
ns230375
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229375
ns230166
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
223854.5
ns225083
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3875
ns3875
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
3958
ns4000
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3917
ns3958
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3917
ns3875
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16708
ns17333
ns0.96
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
17083
ns17125
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16875
ns18416
ns0.92
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16584
ns16542
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
570250
ns602459
ns0.95
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
577041
ns612791
ns0.94
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
576958
ns611250
ns0.94
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
573916
ns609583
ns0.94
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1424354
ns1422458
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1421125
ns1432875
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1417666
ns1432708.5
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
1422417
ns1421250
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1082874.5
ns1073292
ns1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
969583.5
ns969125
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1345833
ns1355229
ns0.99
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1275270.5
ns1303542
ns0.98
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
5772500
ns5773875
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4552375
ns4524834
ns1.01
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4981312.5
ns4956520.5
ns1.01
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5767584
ns5616459
ns1.03
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
541
ns542
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
541
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2125
ns2208
ns0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2208
ns2167
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2250
ns2167
ns1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2125
ns2084
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4167
ns4250
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4375
ns4125
ns1.06
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
4875
ns4708
ns1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
4500
ns3833
ns1.17
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11291
ns11375
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11292
ns11750
ns0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12000
ns11875
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
11375
ns11292
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6458
ns6292
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6833
ns6750
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8000
ns7166
ns1.12
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6875
ns6333
ns1.09
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
17083
ns18312.5
ns0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
19250
ns18083
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
17791.5
ns19833
ns0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17875
ns18125
ns0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
625
ns584
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns625
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns541
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8792
ns8959
ns0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
8875
ns8917
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
8916.5
ns9083
ns0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8209
ns8542
ns0.96
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
64500
ns96500
ns0.67
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
64583
ns96458
ns0.67
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
64250
ns96666.5
ns0.66
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
64750
ns96458
ns0.67
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
285625
ns282542
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
283375
ns294792
ns0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
276208.5
ns278250
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
297500
ns274042
ns1.09
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
3402333
ns3410792
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
3060583
ns2893584
ns1.06
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
3019687.5
ns3043771
ns0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
4056229
ns3950938
ns1.03
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
7721750
ns7640458
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
7459709
ns7363916.5
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
7439375.5
ns7444583
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
8277625
ns8213291
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
17593999.5
ns17504417
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
17466354
ns17685667
ns0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
17549604.5
ns17570042
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
9302166.5
ns14113396
ns0.66
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23554916.5
ns23914500
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
33592458
ns43551541
ns0.77
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37227500
ns37461209
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
35248104
ns34611021
ns1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
188482416
ns313175916
ns0.60
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
164033541
ns178521083
ns0.92
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
153090042
ns195096687.5
ns0.78
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
443063541
ns279780167
ns1.58
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
290580729
ns273572625
ns1.06
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
257093729.5
ns278931729
ns0.92
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
296199833.5
ns256343958
ns1.16
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
482390645.5
ns474930271
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
22750
ns21875
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24645.5
ns22459
ns1.10
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
23792
ns23250
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
21958
ns21334
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
103459
ns111375
ns0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
104709
ns104624.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
103916.5
ns104666
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
103729.5
ns103604.5
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5834
ns5833.5
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6083
ns6041
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6625
ns6667
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6209
ns5875
ns1.06
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14667
ns14834
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15020.5
ns15792
ns0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
16020.5
ns16375
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15250
ns14834
ns1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3027500
ns3078645.5
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2071021
ns2149083
ns0.96
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2285333.5
ns2304458.5
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4820958
ns4677166
ns1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23646313
ns23611208
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18048395.5
ns18335958
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
16906125
ns17863458.5
ns0.95
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
35430208
ns35453375
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33437292
ns33321333
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27650521
ns27967958
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27492875
ns27533500
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
42564979.5
ns41461333
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
72854.5
ns72791
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
73458
ns73083
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
74021
ns81187.5
ns0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
75000
ns73875
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
303958
ns316333.5
ns0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
219312.5
ns318437.5
ns0.69
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
219042
ns323125
ns0.68
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
319666.5
ns308937.5
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11500
ns11625
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
11959
ns12083
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
12416
ns12125
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12208
ns11959
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26083.5
ns26834
ns0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26104.5
ns26959
ns0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27209
ns27958
ns0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26750
ns26791.5
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
12166.5
ns12625
ns0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12645.5
ns12604.5
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
13500
ns13958
ns0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12875
ns12208
ns1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
25750
ns25958
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26459
ns26333
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
26375
ns26583
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
25833
ns26541
ns0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
182125
ns179625
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
180500
ns179458
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
183000
ns183083
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
180375
ns188958
ns0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
581750
ns595770.5
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
590708.5
ns595666
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
609500
ns584792
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
594250
ns582042
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5625
ns5959
ns0.94
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5958
ns6375
ns0.93
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6500
ns7125
ns0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6250
ns6042
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
13917
ns14166
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13916
ns14917
ns0.93
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14583
ns15625
ns0.93
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14291
ns14458
ns0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
1196250
ns1239500
ns0.97
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
1251708
ns1321583
ns0.95
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
1274542
ns1360666.5
ns0.94
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
1013000
ns1089687
ns0.93
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
4142875
ns4119041
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
4864958
ns4588250
ns1.06
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
4545520.5
ns4571375
ns0.99
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
3911541.5
ns3710875
ns1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1792
ns1833
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1834
ns1875
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1875
ns1833
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1875
ns1834
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4834
ns4959
ns0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4917
ns4916
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
5000
ns4875
ns1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4916
ns4917
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5250
ns5792
ns0.91
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5917
ns6167
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6333
ns7042
ns0.90
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6042
ns5875
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11000
ns11792
ns0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
11458
ns11125
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11292
ns11250
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
11000
ns10584
ns1.04
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
292
ns333
ns0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns334
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
334
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
292
ns333
ns0.88
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2708
ns3000
ns0.90
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
3041
ns3083
ns0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2792
ns3041
ns0.92
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2709
ns2625
ns1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
11167
ns11875
ns0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11667
ns11833
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
12375
ns13042
ns0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
12083
ns11292
ns1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25083
ns24959
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
25416
ns24979.5
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25167
ns27250
ns0.92
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24583
ns24458
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4208
ns4250
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4250
ns4291
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4250
ns4250
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4250
ns4166
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16375
ns16500
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16417
ns16166
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16250
ns16541
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16042
ns16250
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5750
ns5791
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5791
ns5750
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5875
ns5834
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5833
ns5791
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
20375
ns20792
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
20479.5
ns20959
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21208
ns21459
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
20854.5
ns20542
ns1.02
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
427021
ns412875
ns1.03
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
388041
ns375208
ns1.03
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
475333
ns487209
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
107750
ns146584
ns0.74
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
885834
ns916708.5
ns0.97
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
960667
ns989792
ns0.97
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
1182208
ns1196125
ns0.99
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
375875
ns476875
ns0.79
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
80125
ns135084
ns0.59
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
80750
ns81542
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
82167
ns141833
ns0.58
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80791
ns135750
ns0.60
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1942937
ns1911291.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1918166.5
ns1946333
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1916333
ns1928333
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1923604
ns1910834
ns1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
333
ns333
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
333
ns292
ns1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
333
ns292
ns1.14
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1833
ns1875
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1875
ns1833
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1833
ns1792
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6167
ns6625
ns0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6792
ns6792
ns1
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7333
ns7792
ns0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6667
ns6666
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8791.5
ns9667
ns0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9416
ns9291
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9292
ns9333
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9167
ns9417
ns0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
119015458
ns111820937.5
ns1.06
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
173560375
ns181915979
ns0.95
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
148104416
ns143480208
ns1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
104510604
ns92143250
ns1.13
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
611899646
ns614702333
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
555362500
ns582318312.5
ns0.95
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
453017291
ns456793479.5
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
632276917
ns623509562.5
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
666765667
ns796858958
ns0.84
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
666371104
ns687543333
ns0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
582119812.5
ns619636833
ns0.94
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
866159459
ns745741417
ns1.16
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57541
ns62834
ns0.92
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47708
ns47791
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46875
ns53250
ns0.88
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84375
ns83083
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1944250
ns1923354
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1980416
ns1992584
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1976042
ns1986708.5
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1906083
ns1895062.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
267917
ns266916.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
268292
ns267354.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
267937.5
ns268666
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
267625
ns264979
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
703792
ns664125
ns1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
681124.5
ns694604.5
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
595667
ns650292
ns0.92
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
697208
ns699958
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2209437.5
ns2256583
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2173708
ns2246021
ns0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2200062
ns2238750
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2113875
ns2261771
ns0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5503083
ns5510583
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5488667
ns5590125
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5509792
ns5513333
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5568042
ns5481479.5
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
638000
ns669750
ns0.95
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
645667
ns680333
ns0.95
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
647187.5
ns678166
ns0.95
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
644709
ns674417
ns0.96
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1827583
ns1816770.5
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1720833
ns1665417
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1720291
ns1717645.5
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
2097125
ns2082542
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
59166
ns70125
ns0.84
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47625
ns59875
ns0.80
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
45833
ns52958
ns0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84209
ns82666
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2051584
ns2037917
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2075395.5
ns2108146
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2040667
ns2092292
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2021583
ns2001334
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
13373292
ns13460541.5
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
12436750
ns12543854
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
12559270.5
ns12654167
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
14986208.5
ns15261812.5
ns0.98
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
47390625
ns47280959
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
41705020.5
ns42008521
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
40992438
ns40839333.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
58725208
ns58419750
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
73938270.5
ns97048750
ns0.76
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
90830563
ns91157167
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
90514083
ns90856333.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
76122334
ns76444354
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
59916
ns72334
ns0.83
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47541
ns47292
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47458
ns65375
ns0.73
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83500
ns82584
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1948584
ns1929937
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1954250
ns1984583.5
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1965437.5
ns1983584
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1888625
ns1888750
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
333
ns417
ns0.80
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
5979.5
ns6541
ns0.91
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6584
ns6458
ns1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6500
ns6583
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6187.5
ns5958
ns1.04
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns291
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2666
ns2917
ns0.91
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
2875
ns2834
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
2792
ns2875
ns0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2666
ns2625
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
286733687.5
ns279890375
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
339568833
ns347812250
ns0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
314522187.5
ns310658166.5
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
270045166
ns261239625
ns1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1015582292
ns994066791
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
953582875
ns960267958
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
840575375
ns837209229.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1282644084
ns1129871667
ns1.14
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1419694479.5
ns1752205958
ns0.81
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1672572375
ns1693119292
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1620047667
ns1650193041
ns0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1358918958.5
ns1306363020.5
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1454458
ns1458375
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1408583
ns1463959
ns0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1410041.5
ns1465625
ns0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1442292
ns1459625
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5055625
ns5012416
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5019625
ns5066791
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5009458
ns5033750
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5053667
ns5030375
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
171675979
ns158175666
ns1.09
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
126429812.5
ns166759458.5
ns0.76
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
106760875
ns90721479
ns1.18
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
165741833.5
ns151859250
ns1.09
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
622640208
ns669929250
ns0.93
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
492172500
ns560789291
ns0.88
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
462809167
ns487588708
ns0.95
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
660164833
ns651112083
ns1.01
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
8982250
ns8927708.5
ns1.01
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
8969792
ns9111000
ns0.98
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
7891125
ns7978437.5
ns0.99
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
9977959
ns10091416
ns0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
36106959
ns36693146
ns0.98
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
37109917
ns39523229
ns0.94
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
33736459
ns34135874.5
ns0.99
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
39159896
ns59280958
ns0.66
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
47375
ns47437.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47500
ns47500
ns1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47645.5
ns47542
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47500
ns47292
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50417
ns50417
ns1
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50875
ns50458
ns1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
51729
ns50708
ns1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
50333
ns50333
ns1
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6583
ns7145.5
ns0.92
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
7208
ns7292
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7646
ns8084
ns0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7333
ns6667
ns1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9292
ns10417
ns0.89
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10209
ns10166
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10333
ns10250
ns1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10167
ns9833
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5854.5
ns6333
ns0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6292
ns6375
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
6834
ns7479.5
ns0.91
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6166
ns5166
ns1.19
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12667
ns13709
ns0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13208.5
ns13000
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13459
ns13542
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12958
ns13416.5
ns0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1000
ns1125
ns0.89
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1042
ns1041
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1084
ns1083
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1042
ns1083
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7770.5
ns8333
ns0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8125
ns8084
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7834
ns8042
ns0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8250
ns7875
ns1.05
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
23417
ns23791.5
ns0.98
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23375
ns23209
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23500
ns23291
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23458
ns23041
ns1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
52292
ns52583
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
52667
ns52625
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
52667
ns52833
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
52417
ns52417
ns1
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1448145.5
ns1458625
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1457021
ns1464021
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1402542
ns1466000
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1403042
ns1454708
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5036750
ns5020749.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5020979
ns5048500
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5021708
ns5032583
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5042708.5
ns5015271
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3054459
ns3133854.5
ns0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2092750
ns2152167
ns0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2302708.5
ns2319584
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4935833
ns4994354
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24359708.5
ns24444667
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18879875
ns19072896
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17805083
ns19040875
ns0.94
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
36477083
ns36840083
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
34112104.5
ns34088208
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28352833
ns28581417
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27995625
ns28009625
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
42341709
ns41680458.5
ns1.02
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
143179166
ns141268000
ns1.01
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
147785458
ns143350625
ns1.03
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
126873458.5
ns120743271
ns1.05
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
172641167
ns188129709
ns0.92
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
1416291312.5
ns2324854792
ns0.61
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1304509479
ns841095084
ns1.55
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1238526750
ns1147318167
ns1.08
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
685736000
ns833862645.5
ns0.82
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
76042
ns84125
ns0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
79459
ns78250
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
76687
ns76312
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
75124.5
ns71667
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
189375
ns290458
ns0.65
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
278000
ns292000
ns0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
289166.5
ns305208
ns0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
193709
ns288208.5
ns0.67
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
35548875.5
ns35368791
ns1.01
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
36247291.5
ns36524083.5
ns0.99
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
32430687.5
ns31361542
ns1.03
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
40776042
ns38859354
ns1.05
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
148827666
ns148171584
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
152471625
ns157709333
ns0.97
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
135828541
ns137631188
ns0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
224259958
ns150161812.5
ns1.49
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
120283062
ns111509000
ns1.08
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
173757375
ns181918104.5
ns0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
148381833
ns143432542
ns1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
100995854
ns94189375.5
ns1.07
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
468476625
ns497837834
ns0.94
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
466581667
ns512628166
ns0.91
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
438033125
ns440382167
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
758068771
ns678623500
ns1.12
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
656498666
ns644936208
ns1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
639464917
ns676380021
ns0.95
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
572772729.5
ns603539166.5
ns0.95
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
867522166
ns727707084
ns1.19
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1241166.5
ns1357667
ns0.91
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
960584
ns795375
ns1.21
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
985604
ns995750
ns0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2040750
ns2104875
ns0.97
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
3033584
ns2829624.5
ns1.07
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2618542
ns2513417
ns1.04
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2633875
ns2616854
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
3767750
ns3785792
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
5830292
ns5815812.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
5796375
ns5906250
ns0.98
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
5804458
ns5802125
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
2978917
ns2884250
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7500
ns8084
ns0.93
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6042
ns6333
ns0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6209
ns7042
ns0.88
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10333
ns10541
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212708
ns213645.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220542
ns255312.5
ns0.86
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
223542
ns220667
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
208708
ns205667
ns1.01
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
297468334
ns293659209
ns1.01
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
215016959
ns259757583
ns0.83
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
193569000
ns158085937.5
ns1.22
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
311798792
ns293331625
ns1.06
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1238998917
ns1087845916.5
ns1.14
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
901957166.5
ns950749875
ns0.95
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
825878542
ns812442750
ns1.02
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
1319998292
ns1143172250
ns1.15
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5542
ns5542
ns1
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5834
ns6084
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6708
ns7020.5
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5375
ns4958
ns1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7083
ns7292
ns0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7333
ns7542
ns0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7875
ns7541
ns1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7042
ns7042
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
583
ns583
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
625
ns500
ns1.25
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
541
ns500
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9083
ns9167
ns0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
8666
ns9250
ns0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9292
ns9416
ns0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
8583
ns9084
ns0.94
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
351792
ns380958
ns0.92
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
351708
ns352792
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
352375
ns352625
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
354000
ns350834
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
827667
ns832062.5
ns0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
779562.5
ns827458
ns0.94
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
778208
ns775062.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
824354.5
ns823188
ns1.00
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
337833
ns335250
ns1.01
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
342521
ns327208
ns1.05
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
452875
ns451729
ns1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
11687.5
ns12458
ns0.94
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
713208.5
ns711291
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
736500
ns735541
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
1010250
ns1004041
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
27208.5
ns26666
ns1.02
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
381792
ns375354.5
ns1.02
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
354187
ns336667
ns1.05
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
441708
ns439084
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
31083
ns28875
ns1.08
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
731646
ns720625
ns1.02
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
785667
ns804333.5
ns0.98
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
1027917
ns1027667
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
91083
ns104125
ns0.87
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3542
ns3500
ns1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3458
ns3833
ns0.90
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3583
ns3750
ns0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3542
ns3334
ns1.06
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4167
ns4208
ns0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4250
ns4375
ns0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4500
ns4583
ns0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4208
ns4375
ns0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3375
ns3625
ns0.93
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
3917
ns3917
ns1
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4084
ns4667
ns0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3917
ns3729
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8375
ns8479.5
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8167
ns8750
ns0.93
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8584
ns8584
ns1
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8500
ns8500
ns1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
204791
ns206959
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
210875
ns212916
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
211541
ns214834
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
202083
ns200708
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
600417
ns649583.5
ns0.92
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
627875
ns623333
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
630312
ns622250
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
583542
ns613479
ns0.95
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
1010270.5
ns1236916
ns0.82
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
1015521
ns1300167
ns0.78
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
949979.5
ns1184250
ns0.80
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
909416
ns1155667
ns0.79
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
4557687.5
ns4569500
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
4722959
ns4789500
ns0.99
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
4470333.5
ns4471334
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
4443646.5
ns4277000
ns1.04
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3334
ns3500
ns0.95
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3500
ns3708
ns0.94
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4125
ns4500
ns0.92
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3625
ns3084
ns1.18
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7292
ns7542
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7167
ns7500
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7167
ns7458
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7458.5
ns6750
ns1.10
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1562000
ns1661083
ns0.94
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1179000
ns1212459
ns0.97
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1346417
ns1388375
ns0.97
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2481104
ns2367291.5
ns1.05
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12361833
ns12379333
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9575979
ns9634187.5
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9245041
ns9303250.5
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18149645.5
ns17994791.5
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17389625
ns17400125
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14446583
ns14391542
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14298208.5
ns14366500
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21068500
ns20976166.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
88500
ns134083
ns0.66
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
99167
ns134145.5
ns0.74
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
91917
ns140125
ns0.66
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
90708.5
ns133834
ns0.68
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2074916
ns2067833
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2029541
ns2021792
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1761250
ns2040375
ns0.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2035041.5
ns2038229.5
ns1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
2084
ns1250
ns1.67
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
2666
ns1542
ns1.73
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
3583.5
ns3500
ns1.02
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
1916
ns1041.5
ns1.84
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
2625
ns2792
ns0.94
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2875
ns2791
ns1.03
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
2917
ns2834
ns1.03
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2834
ns2687.5
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7375
ns8084
ns0.91
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6042
ns6416
ns0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6083
ns6916
ns0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10083
ns10583
ns0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212333.5
ns224958
ns0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220563
ns230000
ns0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
223084
ns220875
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
208417
ns206709
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3750
ns3708
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3750
ns3750
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3709
ns3667
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3750
ns3667
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14709
ns14625
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14625
ns14250
ns1.03
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14541
ns14625
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14292
ns14458
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
94500
ns145791
ns0.65
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
93916.5
ns141583
ns0.66
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
96125
ns142459
ns0.67
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
95625
ns141375.5
ns0.68
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1950959
ns1928792
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1918895.5
ns1919959
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1651334
ns1933062.5
ns0.85
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1942375
ns1928146
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
881833
ns870875
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
830792
ns819625
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1225417
ns1235083
ns0.99
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
944312.5
ns966104.5
ns0.98
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2742708
ns2825084
ns0.97
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2522750
ns2525875
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3329959
ns3358499.5
ns0.99
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3361458
ns3396917
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
15166.5
ns14750
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17000
ns15375
ns1.11
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
16583
ns16875
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
15667
ns14958
ns1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
214666
ns261458
ns0.82
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
224541.5
ns259875
ns0.86
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
216208
ns216042
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
217645.5
ns220500
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
219500
ns219729.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
220000
ns220479
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
221167
ns223250
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
220834
ns221354.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
495958
ns510541.5
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
507958
ns506917
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
498625
ns498667
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
506541
ns512312.5
ns0.99
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
4166.5
ns3667
ns1.14
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
4312.5
ns4833
ns0.89
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
4583
ns5167
ns0.89
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
4625
ns3979.5
ns1.16
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
7187.5
ns7625
ns0.94
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
7292
ns7375
ns0.99
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
7229.5
ns7333
ns0.99
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
7625
ns7250
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17417
ns18792
ns0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
19292
ns17542
ns1.10
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18625
ns19791
ns0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18500
ns18125
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
219083.5
ns252708
ns0.87
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
211959
ns213500
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213521
ns214541
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213208
ns214000
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4250
ns4229.5
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4334
ns4916
ns0.88
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
4750
ns5417
ns0.88
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
4375
ns4291.5
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10417
ns10750
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10750
ns11042
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10500
ns10875
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10500
ns10042
ns1.05
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
2958
ns3292
ns0.90
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3417
ns3625
ns0.94
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
3959
ns4167
ns0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3542
ns3459
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7291
ns7750
ns0.94
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7458
ns7791
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7583
ns7666
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7625
ns7292
ns1.05
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23616833
ns23600875
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34076542
ns43903313
ns0.78
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37648750
ns37710791.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
35355896
ns34490521
ns1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
185118750
ns191551625
ns0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
161569416
ns186643917
ns0.87
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
146021041.5
ns145792667
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
274915208
ns271888584
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
273527291
ns292672562
ns0.93
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
244066854
ns266647854
ns0.92
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
231262500
ns299377291.5
ns0.77
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
325681645.5
ns325821396
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
183916.5
ns184041
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
184479.5
ns182292
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
183709
ns184917
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
185125
ns183667
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
635250
ns632125
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
590375
ns596250
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
586375
ns589146
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
586875.5
ns634646
ns0.92
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
3912854
ns3923584
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
3922688
ns4065250
ns0.96
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
3534875
ns3605250
ns0.98
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
4683208
ns4910271
ns0.95
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
17461333
ns16427166.5
ns1.06
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
17877604
ns17546270.5
ns1.02
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
16535333
ns15424750
ns1.07
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
20876542
ns41363334
ns0.50
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
500
ns583
ns0.86
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns583
ns1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
583
ns542
ns1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
541
ns625
ns0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8875
ns9500
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9458
ns9500
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9167
ns9792
ns0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9084
ns9541
ns0.95
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
653952292
ns513820542
ns1.27
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
393857103.5
ns535432083
ns0.74
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
328714250
ns355647999.5
ns0.92
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
759532875
ns672007125
ns1.13
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
1886540417
ns1968156417
ns0.96
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1638767625
ns1778975000
ns0.92
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1505416479
ns1508167229
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
2232982666.5
ns2144133562.5
ns1.04
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1645500
ns1659562.5
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1196083
ns1222625
ns0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1372166
ns1402292
ns0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2490500
ns2420750
ns1.03
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12742021.5
ns12714958
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9937333.5
ns10033625
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9670291
ns9669250
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18551458
ns18444395.5
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17729729
ns17720021
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14747250
ns14836625
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14539958
ns14593959
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21491875
ns21470916.5
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26250
ns26208
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26292
ns26291
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26250
ns26208
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26250
ns26250
ns1
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
67416
ns67292
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
67167
ns67166
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
68042
ns67437.5
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66917
ns67125
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
203916
ns206334
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209500
ns212084
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
208375
ns211708
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
199583
ns200042
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
615979
ns652229
ns0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
622458.5
ns673167
ns0.92
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
625042
ns623750.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
628771
ns594625
ns1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
654750
ns689375
ns0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
648792
ns686646
ns0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
639250
ns603125.5
ns1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
553000
ns595854
ns0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2255292
ns2275292
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2216833.5
ns2318250
ns0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2230625
ns2234167
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2261625
ns2258041
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17479.5
ns17208
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18166
ns16708.5
ns1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18334
ns18542
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18542
ns26209
ns0.71
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
230250
ns233041
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
218666.5
ns238708
ns0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220145.5
ns220895.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
225083.5
ns247479
ns0.91
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
542
ns625
ns0.87
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
541
ns583
ns0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9625
ns9917
ns0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9500
ns9916.5
ns0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9583
ns10166
ns0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9583
ns9709
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5166
ns5375
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5667
ns5667
ns1
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6291.5
ns7208
ns0.87
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5625
ns5166
ns1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6959
ns7750
ns0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7709
ns7417
ns1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7125
ns7750
ns0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7250
ns7208
ns1.01
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
2292
ns2459
ns0.93
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
2125
ns2375
ns0.89
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2333
ns2250
ns1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2167
ns2042
ns1.06
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6354.5
ns6667
ns0.95
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6500
ns6625
ns0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6583.5
ns6667
ns0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
6459
ns6500
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
748750
ns781188
ns0.96
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
746708
ns762250
ns0.98
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
749375
ns746542
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
749125
ns746084
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
794125
ns815833
ns0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
775500
ns816958.5
ns0.95
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
775812.5
ns775937.5
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
794500.5
ns810604.5
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7458
ns8042
ns0.93
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6084
ns6417
ns0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5583
ns6958
ns0.80
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10541
ns10625
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
231542
ns265000
ns0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
231875
ns268728.5
ns0.86
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229604
ns229125
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215187.5
ns217229
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10166.5
ns10375
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10416
ns10646
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10479
ns11292
ns0.93
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10417
ns10125
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25083.5
ns24542
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
23916
ns25354.5
ns0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
24625
ns25500
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
25000
ns24333
ns1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
106424375
ns106479791.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
117279208.5
ns126041750
ns0.93
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
120424354
ns120943833
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
117916208
ns117512916.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
397131541.5
ns384219250
ns1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
366183958
ns372791166.5
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
355277020.5
ns338002625
ns1.05
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
545563875.5
ns471273750
ns1.16
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
609770291
ns803612958.5
ns0.76
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
756955334
ns771462084
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
745569813
ns812264500
ns0.92
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
607706416.5
ns607987313
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6875
ns7042
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
9229
ns7208
ns1.28
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8833
ns8166.5
ns1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7500
ns6750
ns1.11
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14375
ns14333
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13750
ns14750
ns0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14667
ns14000
ns1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13542
ns13792
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5959
ns6209
ns0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6354.5
ns6417
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7083
ns7500
ns0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6042
ns5792
ns1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12666
ns12875
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12917
ns12583
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12916
ns13125
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12292
ns12333
ns1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
5875
ns5042
ns1.17
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
5937.5
ns5625
ns1.06
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
5812.5
ns6250
ns0.93
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
6000
ns5958
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
15375
ns15666
ns0.98
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
18229.5
ns15709
ns1.16
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
15625
ns15583
ns1.00
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
15834
ns15458
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
334
ns417
ns0.80
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
417
ns375
ns1.11
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
416
ns333
ns1.25
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6291
ns6542
ns0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6541
ns6542
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6375
ns6542
ns0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6042
ns6208
ns0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5958
ns5875
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5917
ns5916
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6083
ns5833
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5833
ns5834
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
20895.5
ns22729
ns0.92
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21084
ns21625
ns0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
21334
ns21667
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
20875
ns20854.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
145167
ns192437
ns0.75
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
145333
ns194875
ns0.75
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
147791
ns190958
ns0.77
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
146250.5
ns198042
ns0.74
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1351583
ns1364250
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1324833.5
ns1373333.5
ns0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1269708
ns1330458
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1342020.5
ns1326229.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
24854
ns23125
ns1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24750
ns23000
ns1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
24083.5
ns24041
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
23041.5
ns21667
ns1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
130333
ns131208
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
131875
ns183125.5
ns0.72
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
120583
ns118667
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
127250
ns180917
ns0.70
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
333
ns375
ns0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
333
ns375
ns0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns333
ns0.88
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6375
ns6833
ns0.93
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6750
ns6667
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6167
ns6833
ns0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6166
ns6417
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4250
ns4542
ns0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4583
ns5229.5
ns0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
5000
ns5125
ns0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4666
ns4666
ns1
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9917
ns10334
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10000
ns10625
ns0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10458
ns10375
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10250
ns10375
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1584
ns1625
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1584
ns1625
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5625
ns6042
ns0.93
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6000
ns6000
ns1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
5792
ns5959
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5666
ns5625
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
6809750
ns6837750
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
6375834
ns6418708
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
6505250
ns6547416.5
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
7653125.5
ns7628667
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
24098271
ns24126020.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
21313750
ns21396208
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
21034292
ns20992000
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
29936333.5
ns29707541
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
37354916.5
ns48614958
ns0.77
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
45524125
ns45739708
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
45728625
ns45440458
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
38256604.5
ns38260167
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5708
ns5917
ns0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5916
ns6083
ns0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6542
ns7041
ns0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5958
ns5708
ns1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8792
ns8583
ns1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8375
ns8959
ns0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8792
ns8417
ns1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8042
ns8125
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1544521
ns1564625
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1274291.5
ns1276958
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1619792
ns1632792
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2113874.5
ns2147187.5
ns0.98
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7917042
ns7938667
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6631541
ns6675417
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7090646
ns7179229.5
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
10525708
ns10466792
ns1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
363667
ns375979.5
ns0.97
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
373917
ns356791.5
ns1.05
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
456000
ns453958
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
24312
ns31791.5
ns0.76
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
737791.5
ns724250
ns1.02
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
796895.5
ns820708
ns0.97
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
1063396
ns1064167
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
91145.5
ns93125
ns0.98
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397459
ns413500
ns0.96
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
287666
ns220417
ns1.31
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
287958
ns305958
ns0.94
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
751208
ns758417
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
667375
ns664291
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
532500
ns464750
ns1.15
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
533459
ns524625
ns1.02
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
974250
ns971875
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
677250
ns660125
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
646333
ns688833
ns0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
555812.5
ns599208.5
ns0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
589334
ns676041
ns0.87
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2506042
ns2465396
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2452187.5
ns2549750
ns0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2421083
ns2454750
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2509083.5
ns2436396
ns1.03
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
3042
ns2084
ns1.46
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
3500
ns2500
ns1.40
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
3709
ns4584
ns0.81
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
2834
ns2000
ns1.42
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
5458
ns5541
ns0.99
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
5625
ns5625
ns1
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
5625
ns5541
ns1.02
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
5583
ns5459
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1459917
ns1479917
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1499291
ns1515750
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1501417
ns1523083
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1439583
ns1448834
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5106812.5
ns5170937.5
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5286437.5
ns5319792
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5284041.5
ns5296208
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4996333.5
ns4989229.5
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3709
ns3667
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3708
ns3750
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3708
ns3666
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3750
ns3666
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15250
ns15458
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15417
ns15292
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15416
ns15500
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15000
ns15250
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
71500
ns96375
ns0.74
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
71333
ns104834
ns0.68
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
70542
ns94000
ns0.75
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
71250
ns92875
ns0.77
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
319958
ns319291
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
318333
ns326792
ns0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
318208
ns317083
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
321834
ns317375
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
1000
ns1083
ns0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1083
ns1042
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1084
ns1042
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
959
ns1000
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
7916
ns8458
ns0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8208
ns8167
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8125
ns8500
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7667
ns8000
ns0.96
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
514834
ns536458.5
ns0.96
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
490208
ns514770.5
ns0.95
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
567542
ns583167
ns0.97
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
218520.5
ns177291.5
ns1.23
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
1371833
ns1430708
ns0.96
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1457062.5
ns1491625
ns0.98
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1755667
ns1790583
ns0.98
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
909250
ns862187.5
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
292
ns375
ns0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
416
ns375
ns1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
333
ns375
ns0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns375
ns0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6166
ns6750
ns0.91
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6708
ns6458
ns1.04
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6125
ns6666
ns0.92
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6083
ns6584
ns0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1721334
ns1721104
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1725146
ns1775187.5
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1724500
ns1796833.5
ns0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1728229.5
ns1760583
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4358375
ns4395271
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4376792
ns4422959
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4335333
ns4375792
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4390375
ns4339937.5
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
6750
ns16708.5
ns0.40
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
6625
ns7042
ns0.94
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
6875
ns8000
ns0.86
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6542
ns7125
ns0.92
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
32500
ns52520.5
ns0.62
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
50895.5
ns74791
ns0.68
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
32875
ns33083
ns0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
49729
ns43000
ns1.16
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
17937.5
ns17333
ns1.03
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
18042
ns17875
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
18125
ns18229.5
ns0.99
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
18458
ns17708
ns1.04
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
53208
ns53541.5
ns0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
53250
ns53500
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
53250
ns53500
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
53562.5
ns53542
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
75709
ns102541.5
ns0.74
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
75291
ns109541
ns0.69
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
75208
ns99500
ns0.76
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
75250
ns97875
ns0.77
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
330270.5
ns328250
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
328625
ns333084
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
325083
ns324125
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
329042
ns324041
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1486375
ns1504750
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1526375
ns1541208
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1527375
ns1549666
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1464666
ns1472416.5
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5175375
ns5156854.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5310021
ns5311833
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4950479
ns5311062.5
ns0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5010146
ns4595917
ns1.09
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28208
ns28250
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28375
ns28250
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28292
ns28125
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28250
ns28208
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66292
ns66917
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66375
ns66542
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66459
ns67750
ns0.98
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66459
ns66500
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1396208.5
ns1505459
ns0.93
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1137042
ns959542
ns1.18
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1061959
ns1085458.5
ns0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2245417
ns2196437.5
ns1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
2966209
ns3106250
ns0.95
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2741250
ns2641667
ns1.04
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2597667
ns2753084
ns0.94
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
3844125
ns3807583
ns1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
7918709
ns7926875
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
7905417
ns8046333.5
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
7547354
ns7926812.5
ns0.95
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
4916042
ns4419125
ns1.11
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
80583
ns134333
ns0.60
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81458
ns140333
ns0.58
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
81541
ns135750
ns0.60
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80709
ns136000
ns0.59
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2026042
ns2042250
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2026125.5
ns2053604
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1719750
ns2031125
ns0.85
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2018208
ns2012625
ns1.00
This comment was automatically generated by workflow using github-action-benchmark.