-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathequal_fold_amd64.s
303 lines (283 loc) · 6.12 KB
/
equal_fold_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
// Code generated by command: go run equal_fold_asm.go -pkg ascii -out ../ascii/equal_fold_amd64.s -stubs ../ascii/equal_fold_amd64.go. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
// func EqualFoldString(a string, b string) bool
// Requires: AVX, AVX2
//
// EqualFoldString stores 1 in ret when a and b have the same length and every
// pair of bytes at the same index matches under the rules below; otherwise it
// stores 0.
//
//   - Scalar path (length < 16, or bit 8 of cpu·X86 clear): two bytes match
//     when their entries in the ascii·lowerCase table are identical.
//     (The table is defined elsewhere; presumably it maps every byte to its
//     ASCII lower-case form - confirm against package ascii.)
//   - AVX2 path (length >= 16 and bit 8 of cpu·X86 set; presumably the AVX2
//     feature flag - confirm against package cpu): two bytes match when they
//     are identical, or when they differ by exactly 0x20 and a's byte with
//     bit 0x20 set lies in 0x61..0x7a ('a'..'z').
//
// Register roles:
//   CX  = a base pointer        BX = b base pointer
//   DX  = bytes remaining       AX = current offset into both strings
//   SI  = scalar path: OR of all byte differences seen so far
//         AVX path:    inverted per-byte match mask of the last block
//   R9  = address of ascii·lowerCase (scalar path only)
//   DI, R8 = scratch
//   Y12 = 0x20 in every byte    Y13 = 0x1f in every byte
//   Y14 = 0x9a in every byte    Y15 = 0x01 in every byte
//
// The result is produced from ZF by SETEQ at done/done_avx, so every jump to
// those labels must arrive with ZF set for "equal" and clear for "different".
//
// NOTE(review): this file is generated (see the header at the top of the
// file). The VZEROUPPER / VPINSRB changes below must also be made in the
// generator program, otherwise the next regeneration will revert them.
TEXT ·EqualFoldString(SB), NOSPLIT, $0-33
	MOVQ a_base+0(FP), CX
	MOVQ a_len+8(FP), DX
	MOVQ b_base+16(FP), BX

	// Strings of different length never match. ZF is clear on this jump, so
	// done stores 0.
	CMPQ DX, b_len+24(FP)
	JNE done
	XORQ AX, AX

	// Fewer than 16 bytes: always take the scalar path. The AVX tail handling
	// re-reads the last 16 bytes of the input and therefore needs len >= 16.
	CMPQ DX, $0x10
	JB init_x86
	BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCS init_avx

init_x86:
	LEAQ github·com∕segmentio∕asm∕ascii·lowerCase+0(SB), R9
	XORL SI, SI

	// Scalar main loop: 8 bytes per iteration. For each index the two table
	// entries are XORed and the result is ORed into SI, so SI stays zero only
	// while every pair has matched.
cmp8:
	CMPQ DX, $0x08
	JB cmp7
	MOVBLZX (CX)(AX*1), DI
	MOVBLZX (BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI
	MOVBLZX 1(CX)(AX*1), DI
	MOVBLZX 1(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI
	MOVBLZX 2(CX)(AX*1), DI
	MOVBLZX 2(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI
	MOVBLZX 3(CX)(AX*1), DI
	MOVBLZX 3(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI
	MOVBLZX 4(CX)(AX*1), DI
	MOVBLZX 4(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI
	MOVBLZX 5(CX)(AX*1), DI
	MOVBLZX 5(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI
	MOVBLZX 6(CX)(AX*1), DI
	MOVBLZX 6(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI
	MOVBLZX 7(CX)(AX*1), DI
	MOVBLZX 7(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

	// ZF reflects the last ORB: non-zero SI means a mismatch, and done then
	// stores 0.
	JNE done
	ADDQ $0x08, AX
	SUBQ $0x08, DX
	JMP cmp8

	// Scalar tail: 0..7 bytes remain. Entry is at the label matching the
	// remaining count and execution falls through every lower label, so
	// offsets DX-1 down to 0 are each checked exactly once.
cmp7:
	CMPQ DX, $0x07
	JB cmp6
	MOVBLZX 6(CX)(AX*1), DI
	MOVBLZX 6(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

cmp6:
	CMPQ DX, $0x06
	JB cmp5
	MOVBLZX 5(CX)(AX*1), DI
	MOVBLZX 5(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

cmp5:
	CMPQ DX, $0x05
	JB cmp4
	MOVBLZX 4(CX)(AX*1), DI
	MOVBLZX 4(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

cmp4:
	CMPQ DX, $0x04
	JB cmp3
	MOVBLZX 3(CX)(AX*1), DI
	MOVBLZX 3(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

cmp3:
	CMPQ DX, $0x03
	JB cmp2
	MOVBLZX 2(CX)(AX*1), DI
	MOVBLZX 2(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

cmp2:
	CMPQ DX, $0x02
	JB cmp1
	MOVBLZX 1(CX)(AX*1), DI
	MOVBLZX 1(BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

cmp1:
	// DX == 0 here means nothing is left to compare and SI is still zero
	// (any earlier mismatch already left through done), so report equality.
	CMPQ DX, $0x01
	JB success
	MOVBLZX (CX)(AX*1), DI
	MOVBLZX (BX)(AX*1), R8
	MOVB (R9)(DI*1), DI
	XORB (R9)(R8*1), DI
	ORB DI, SI

	// Falls through with ZF set from the final ORB into the accumulator.
done:
	SETEQ ret+32(FP)
	RET

success:
	MOVB $0x01, ret+32(FP)
	RET

	// AVX2 path setup: broadcast the four byte constants. The VEX-encoded
	// VPINSRB is used instead of legacy PINSRB so that no legacy-SSE
	// instruction executes while upper YMM halves are dirty (which would cost
	// an SSE/AVX transition after the first broadcast). Only byte 0 of each
	// XMM register matters; VPBROADCASTB replicates it.
init_avx:
	MOVB $0x20, SI
	VPINSRB $0x00, SI, X12, X12
	VPBROADCASTB X12, Y12
	MOVB $0x1f, SI
	VPINSRB $0x00, SI, X13, X13
	VPBROADCASTB X13, Y13
	MOVB $0x9a, SI
	VPINSRB $0x00, SI, X14, X14
	VPBROADCASTB X14, Y14
	MOVB $0x01, SI
	VPINSRB $0x00, SI, X15, X15
	VPBROADCASTB X15, Y15

	// 128 bytes per iteration (4 x 32-byte vectors from each string).
cmp128:
	CMPQ DX, $0x80
	JB cmp64
	VMOVDQU (CX)(AX*1), Y0
	VMOVDQU 32(CX)(AX*1), Y1
	VMOVDQU 64(CX)(AX*1), Y2
	VMOVDQU 96(CX)(AX*1), Y3
	VMOVDQU (BX)(AX*1), Y4
	VMOVDQU 32(BX)(AX*1), Y5
	VMOVDQU 64(BX)(AX*1), Y6
	VMOVDQU 96(BX)(AX*1), Y7

	// Per-vector recipe (the same nine instructions are repeated for every
	// vector in the AVX path):
	//   diff    = a ^ b
	//   caseBit = (diff == 0x20)
	//   t       = (a | 0x20) + 0x1f        // 'a'..'z' maps to 0x80..0x99
	//   letter  = (0x9a > t) as signed bytes, true only for 0x80..0x99
	//   fold    = ((caseBit & letter) & 0x01) << 5   // 0x20 or 0x00 per byte
	//   match   = (diff == fold)
	// The 16-bit-lane shift cannot spill between bytes because every byte is
	// 0 or 1 before shifting.
	VXORPD Y0, Y4, Y4
	VPCMPEQB Y12, Y4, Y8
	VORPD Y12, Y0, Y0
	VPADDB Y13, Y0, Y0
	VPCMPGTB Y0, Y14, Y0
	VPAND Y8, Y0, Y0
	VPAND Y15, Y0, Y0
	VPSLLW $0x05, Y0, Y0
	VPCMPEQB Y4, Y0, Y0
	VXORPD Y1, Y5, Y5
	VPCMPEQB Y12, Y5, Y9
	VORPD Y12, Y1, Y1
	VPADDB Y13, Y1, Y1
	VPCMPGTB Y1, Y14, Y1
	VPAND Y9, Y1, Y1
	VPAND Y15, Y1, Y1
	VPSLLW $0x05, Y1, Y1
	VPCMPEQB Y5, Y1, Y1
	VXORPD Y2, Y6, Y6
	VPCMPEQB Y12, Y6, Y10
	VORPD Y12, Y2, Y2
	VPADDB Y13, Y2, Y2
	VPCMPGTB Y2, Y14, Y2
	VPAND Y10, Y2, Y2
	VPAND Y15, Y2, Y2
	VPSLLW $0x05, Y2, Y2
	VPCMPEQB Y6, Y2, Y2
	VXORPD Y3, Y7, Y7
	VPCMPEQB Y12, Y7, Y11
	VORPD Y12, Y3, Y3
	VPADDB Y13, Y3, Y3
	VPCMPGTB Y3, Y14, Y3
	VPAND Y11, Y3, Y3
	VPAND Y15, Y3, Y3
	VPSLLW $0x05, Y3, Y3
	VPCMPEQB Y7, Y3, Y3

	// Combine the four match masks; a byte stays 0xff only if it matched in
	// all four vectors.
	VPAND Y1, Y0, Y0
	VPAND Y3, Y2, Y2
	VPAND Y2, Y0, Y0

	// Advance before extracting the mask: ADDQ/SUBQ clobber flags, and the
	// XORL below must be the last flag-setting instruction before the branch.
	ADDQ $0x80, AX
	SUBQ $0x80, DX
	VPMOVMSKB Y0, SI

	// All 32 mask bits set means every byte matched; the XOR then yields 0.
	XORL $0xffffffff, SI
	JNE done_avx
	JMP cmp128

	// At most one 64-byte step (fewer than 128 bytes remain here).
cmp64:
	CMPQ DX, $0x40
	JB cmp32
	VMOVDQU (CX)(AX*1), Y0
	VMOVDQU 32(CX)(AX*1), Y1
	VMOVDQU (BX)(AX*1), Y2
	VMOVDQU 32(BX)(AX*1), Y3
	VXORPD Y0, Y2, Y2
	VPCMPEQB Y12, Y2, Y4
	VORPD Y12, Y0, Y0
	VPADDB Y13, Y0, Y0
	VPCMPGTB Y0, Y14, Y0
	VPAND Y4, Y0, Y0
	VPAND Y15, Y0, Y0
	VPSLLW $0x05, Y0, Y0
	VPCMPEQB Y2, Y0, Y0
	VXORPD Y1, Y3, Y3
	VPCMPEQB Y12, Y3, Y5
	VORPD Y12, Y1, Y1
	VPADDB Y13, Y1, Y1
	VPCMPGTB Y1, Y14, Y1
	VPAND Y5, Y1, Y1
	VPAND Y15, Y1, Y1
	VPSLLW $0x05, Y1, Y1
	VPCMPEQB Y3, Y1, Y1
	VPAND Y1, Y0, Y0
	ADDQ $0x40, AX
	SUBQ $0x40, DX
	VPMOVMSKB Y0, SI
	XORL $0xffffffff, SI
	JNE done_avx

	// At most one 32-byte step.
cmp32:
	CMPQ DX, $0x20
	JB cmp16
	VMOVDQU (CX)(AX*1), Y0
	VMOVDQU (BX)(AX*1), Y1
	VXORPD Y0, Y1, Y1
	VPCMPEQB Y12, Y1, Y2
	VORPD Y12, Y0, Y0
	VPADDB Y13, Y0, Y0
	VPCMPGTB Y0, Y14, Y0
	VPAND Y2, Y0, Y0
	VPAND Y15, Y0, Y0
	VPSLLW $0x05, Y0, Y0
	VPCMPEQB Y1, Y0, Y0
	ADDQ $0x20, AX
	SUBQ $0x20, DX
	VPMOVMSKB Y0, SI
	XORL $0xffffffff, SI
	JNE done_avx

	// 0..31 bytes remain. With more than 16 left, compare one 16-byte vector
	// here; the final (possibly overlapping) 16 bytes are handled by cmp_tail.
	// DX is non-negative and below 32 at this point, so the signed JLE
	// behaves like an unsigned "below or equal".
cmp16:
	CMPQ DX, $0x10
	JLE cmp_tail
	VMOVDQU (CX)(AX*1), X0
	VMOVDQU (BX)(AX*1), X1
	VXORPD X0, X1, X1
	VPCMPEQB X12, X1, X2
	VORPD X12, X0, X0
	VPADDB X13, X0, X0
	VPCMPGTB X0, X14, X0
	VPAND X2, X0, X0
	VPAND X15, X0, X0
	VPSLLW $0x05, X0, X0
	VPCMPEQB X1, X0, X0
	ADDQ $0x10, AX
	SUBQ $0x10, DX
	VPMOVMSKB X0, SI
	XORL $0x0000ffff, SI
	JNE done_avx

	// Final step: compare the last 16 bytes of both strings. DX <= 16 here, so
	// DX-16 is <= 0 and adding it to AX moves the offset back to len-16.
	// Bytes that were already checked are simply checked again. This relies on
	// len >= 16, which the dispatch at function entry guarantees.
cmp_tail:
	SUBQ $0x10, DX
	ADDQ DX, AX
	VMOVDQU (CX)(AX*1), X0
	VMOVDQU (BX)(AX*1), X1
	VXORPD X0, X1, X1
	VPCMPEQB X12, X1, X2
	VORPD X12, X0, X0
	VPADDB X13, X0, X0
	VPCMPGTB X0, X14, X0
	VPAND X2, X0, X0
	VPAND X15, X0, X0
	VPSLLW $0x05, X0, X0
	VPCMPEQB X1, X0, X0
	VPMOVMSKB X0, AX
	XORL $0x0000ffff, AX

	// Common exit for the AVX path (falls through from cmp_tail). VZEROUPPER
	// clears the upper halves of the YMM registers so that the legacy-SSE code
	// in the caller does not pay AVX/SSE transition penalties. It does not
	// modify EFLAGS, so ZF from the preceding XORL is still valid for SETEQ.
	// It must stay off the scalar path, which may run on CPUs without AVX.
done_avx:
	VZEROUPPER
	SETEQ ret+32(FP)
	RET