diff --git a/_generate/gen.go b/_generate/gen.go index 52d306b..974ec83 100644 --- a/_generate/gen.go +++ b/_generate/gen.go @@ -31,8 +31,12 @@ import ( "github.com/mmcloughlin/avo/reg" ) -// insert extra checks here and there. -const debug = false +const ( + // insert extra checks here and there. + debug = false + // matchOffsetCMOV is true if we should use CMOV to check match offsets. + matchOffsetCMOV = true +) func main() { flag.Parse() @@ -180,6 +184,20 @@ func (r regTable) LoadIdx(idx, dst reg.GPVirtual) { } } +// Pretty bad performance. +func (r regTable) XchIdx(idx, val reg.GPVirtual) { + switch r.scale { + case 1: + XCHGB(Mem{Base: r.r, Index: idx, Scale: r.scale, Disp: r.disp}, val.As8()) + case 2: + XCHGW(Mem{Base: r.r, Index: idx, Scale: r.scale, Disp: r.disp}, val.As16()) + case 4: + XCHGL(Mem{Base: r.r, Index: idx, Scale: r.scale, Disp: r.disp}, val.As32()) + default: + panic(r.scale) + } +} + func (r regTable) SaveIdx(val, idx reg.GPVirtual) { switch r.scale { case 1: @@ -383,6 +401,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m Load(Param("src").Base(), src) // Load cv + PCALIGN(16) Label("search_loop_" + name) candidate := GP32() { @@ -425,12 +444,20 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m ifok() return } - skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name) - ccCounter++ - CMPL(cand.As32(), minPos.As32()) - JLE(LabelRef(skip)) - ifok() - Label(skip) + if matchOffsetCMOV { + // Use CMOV over JLE to avoid a jump. + // Intel seems to favor this. 
+ CMPL(cand.As32(), minPos.As32()) + CMOVLLE(minPos.As32(), cand.As32()) + ifok() + } else { + skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name) + ccCounter++ + CMPL(cand.As32(), minPos.As32()) + JLE(LabelRef(skip)) + ifok() + Label(skip) + } } assert(func(ok LabelRef) { // Check if s is valid (we should have jumped above if not) @@ -600,6 +627,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m } JMP(LabelRef("search_loop_" + name)) } + PCALIGN(16) Label("no_repeat_found_" + name) { // Check candidates are ok. All must be < s and < len(src) @@ -658,12 +686,12 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m JEQ(LabelRef("candidate2_match_" + name)) }) - // table[hash2] = uint32(s + 2) - table.SaveIdx(tmp, hash2) - // cv >>= 8 (>> 16 total) SHRQ(U8(8), cv) + // table[hash2] = uint32(s + 2) + table.SaveIdx(tmp, hash2) + // if uint32(cv>>16) == load32(src, candidate) checkCandidate(candidate, func() { CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) @@ -690,6 +718,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m } } + PCALIGN(16) Label("candidate_match_" + name) // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. 
// Extend backwards @@ -829,6 +858,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m o.emitLiteral("match_emit_"+name, litLen, nil, dst, litSrc, LabelRef("match_nolits_copy_"+name), true) } + PCALIGN(16) Label("match_nolits_copy_" + name) o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) Label("match_nolit_emitcopy_end_" + name) @@ -1167,6 +1197,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk Load(Param("src").Base(), src) // Load cv + PCALIGN(16) Label("search_loop_" + name) reloadTables("tmp", &sTab, &lTab) candidate := GP32() @@ -1226,28 +1257,24 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk // move nextS to stack. MOVL(nextS.As32(), nextSTempL) - candidateS := GP32() lHasher := hashN(o, lHashBytes, lTableBits) { - sHasher := hashN(o, sHashBytes, sTableBits) - hash0, hash1 := GP64(), GP64() + hash0 := GP64() MOVQ(cv, hash0) - MOVQ(cv, hash1) lHasher.hash(hash0) - sHasher.hash(hash1) - lTab.LoadIdx(hash0, candidate) - sTab.LoadIdx(hash1, candidateS) assert(func(ok LabelRef) { CMPQ(hash0, U32(lTableSize)) JB(ok) }) - assert(func(ok LabelRef) { - CMPQ(hash1, U32(sTableSize)) - JB(ok) - }) - - lTab.SaveIdx(s, hash0) - sTab.SaveIdx(s, hash1) + if false { + MOVL(s, candidate) + lTab.XchIdx(hash0, candidate) + } else { + // Load candidate from lTab + lTab.LoadIdx(hash0, candidate) + // Store s in lTab + lTab.SaveIdx(s, hash0) + } } // Check if offset exceeds max var ccCounter int @@ -1261,12 +1288,20 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk ifok() return } - skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name) - ccCounter++ - CMPL(cand.As32(), minPos.As32()) - JLE(LabelRef(skip)) - ifok() - Label(skip) + if matchOffsetCMOV { + // Use CMOV over JLE to avoid a jump. + // Intel seems to favor this. 
+ CMPL(cand.As32(), minPos.As32()) + CMOVLLE(minPos.As32(), cand.As32()) + ifok() + } else { + skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name) + ccCounter++ + CMPL(cand.As32(), minPos.As32()) + JLE(LabelRef(skip)) + ifok() + Label(skip) + } } longVal := GP64() shortVal := GP64() @@ -1278,12 +1313,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk JEQ(LabelRef("candidate_match_" + name)) }) - // Load short early... - checkCandidate(candidateS, func() { - MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal) - CMPQ(shortVal, cv.As64()) - }) - // En/disable repeat matching. if true { // Check repeat at offset checkRep @@ -1419,7 +1448,30 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk } JMP(LabelRef("search_loop_" + name)) } + PCALIGN(16) Label("no_repeat_found_" + name) + candidateS := GP32() + { + sHasher := hashN(o, sHashBytes, sTableBits) + hash1 := GP64() + MOVQ(cv, hash1) + sHasher.hash(hash1) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JB(ok) + }) + if false { + MOVL(s.As32(), candidateS) + sTab.XchIdx(hash1, candidateS) + } else { + // Load candidateS from sTab + sTab.LoadIdx(hash1, candidateS) + // Store s in sTab + sTab.SaveIdx(s, hash1) + } + // Load short early... + MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal) + } { // Check candidates are ok. All must be < s and < len(src) assert(func(ok LabelRef) { @@ -1484,6 +1536,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk } } + PCALIGN(16) Label("candidate_match_" + name) // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. 
// Extend backwards @@ -2777,6 +2830,7 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en } if minMove <= 16 { + PCALIGN(16) Label(name + "move_8through16") if margin < 16 { MOVQ(Mem{Base: src}, AX) @@ -3555,6 +3609,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } // LOOP + if !prefetch { + // Triggers https://github.com/golang/go/issues/74648 + //PCALIGN(16) + } Label(name + "_loop") CMPQ(src, srcLimit) JAE(LabelRef(name + "_end_copy")) @@ -3562,7 +3620,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr MOVQ(tag, value) SHRQ(U8(2), value) - Label(name + "_loop_nofetch") + if prefetch { + PCALIGN(16) + Label(name + "_loop_nofetch") + } // Check destination CMPQ(dst, dstLimit) JAE(LabelRef(name + "_end_copy")) @@ -3573,6 +3634,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr JNZ(LabelRef(name + "_copy")) // TAG 00 Literals length := GP64() + PCALIGN(16) Label(name + "_lits") { MOVL(value.As32(), length.As32()) @@ -3585,6 +3647,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr JMP(LabelRef(name + "_lit_3")) // Must be 31 // 1 - > 29 literals + PCALIGN(16) Label(name + "_lit_0") { INCQ(src) @@ -3709,6 +3772,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr JMP(LabelRef(name + "_copy_3")) // TAG 1 - Copy 1 + PCALIGN(16) Label(name + "_copy_1") { if o.inputMargin < 2 { @@ -3771,6 +3835,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } // TAG 2 - Copy 2 + PCALIGN(16) Label(name + "_copy_2") { // length = int(src[s-3]) >> 2 @@ -3862,6 +3927,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } } // TAG 3 - Copy 2/3 fused + PCALIGN(16) Label(name + "_copy_3") { if o.inputMargin < 4 { @@ -4048,6 +4114,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } 
// Length always < 64 copySrc := GP64() + PCALIGN(16) Label(name + "_copy_exec_short") { CMPL(offset.As32(), dstPos.As32()) @@ -4067,6 +4134,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr o.outputMargin -= 4 // 64 offset, 64 length + PCALIGN(16) Label(name + "_copy_exec_long_long") { MOVQ(dst, copySrc) @@ -4080,8 +4148,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } o.genMemMoveLong64(name+"_copy_long_long", dst, copySrc, length, LabelRef(name+"_copy_done")) } + // length 4 -> 64, no overlap // Very hot (16 byte copy mainly) + PCALIGN(16) Label(name + "_copy_short_no_ol") { // Create source pointer with offset @@ -4097,6 +4167,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr o.genMemMoveShort(name+"_copy_short_no_ol", dst, copySrc, length, LabelRef(name+"_copy_done"), 4) } // Offset anything, length anything + PCALIGN(16) Label(name + "_copy_exec") { CMPL(offset.As32(), dstPos.As32()) @@ -4273,3 +4344,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr Label(name + "_end_copy") Label(name + "_end_done") } + +func PCALIGN(n int) { + Instruction(&ir.Instruction{ + Opcode: "PCALIGN", + Operands: []Op{Imm(uint64(n))}, + }) +} diff --git a/asm_amd64.s b/asm_amd64.s index c18e6c4..7b87b9d 100644 --- a/asm_amd64.s +++ b/asm_amd64.s @@ -14,29 +14,30 @@ TEXT ·encodeBlockAsm(SB), $24-64 PXOR X0, X0 zero_loop_encodeBlockAsm: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -17(DX), BX - LEAQ -17(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + 
MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + PCALIGN $0x10 search_loop_encodeBlockAsm: MOVL DX, SI @@ -151,6 +152,7 @@ one_byte_repeat_emit_lits_encodeBlockAsm: CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16: MOVOU (R8), X0 @@ -369,42 +371,38 @@ repeat_one_match_repeat_encodeBlockAsm: repeat_end_emit_encodeBlockAsm: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm + PCALIGN $0x10 no_repeat_found_encodeBlockAsm: - CMPL SI, R8 - JLE offset_ok_0_encodeBlockAsm - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm - -offset_ok_0_encodeBlockAsm: - SHRQ $0x08, DI - MOVL (AX)(R11*4), SI - LEAL 2(DX), R10 - CMPL R9, R8 - JLE offset_ok_1_encodeBlockAsm - CMPL (BX)(R9*1), DI - JEQ candidate2_match_encodeBlockAsm - -offset_ok_1_encodeBlockAsm: - MOVL R10, (AX)(R11*4) - SHRQ $0x08, DI - CMPL SI, R8 - JLE offset_ok_2_encodeBlockAsm - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm - -offset_ok_2_encodeBlockAsm: - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm + CMPL SI, R8 + CMOVLLE R8, SI + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm + SHRQ $0x08, DI + MOVL (AX)(R11*4), SI + LEAL 2(DX), R10 + CMPL R9, R8 + CMOVLLE R8, R9 + CMPL (BX)(R9*1), DI + JEQ candidate2_match_encodeBlockAsm + SHRQ $0x08, DI + MOVL R10, (AX)(R11*4) + CMPL SI, R8 + CMOVLLE R8, SI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm candidate3_match_encodeBlockAsm: 
ADDL $0x02, DX JMP candidate_match_encodeBlockAsm candidate2_match_encodeBlockAsm: - MOVL R10, (AX)(R11*4) - INCL DX - MOVL R9, SI + MOVL R10, (AX)(R11*4) + INCL DX + MOVL R9, SI + PCALIGN $0x10 candidate_match_encodeBlockAsm: MOVL 12(SP), DI @@ -677,6 +675,7 @@ one_byte_match_emit_encodeBlockAsm: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: MOVOU (DI), X0 @@ -777,6 +776,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm: // emitCopy @@ -1132,6 +1133,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: MOVQ (DX), SI @@ -1251,29 +1253,30 @@ TEXT ·encodeBlockAsm2MB(SB), $24-64 PXOR X0, X0 zero_loop_encodeBlockAsm2MB: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm2MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -17(DX), BX - LEAQ -17(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm2MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, 
BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + PCALIGN $0x10 search_loop_encodeBlockAsm2MB: MOVL DX, SI @@ -1387,6 +1390,7 @@ one_byte_repeat_emit_lits_encodeBlockAsm2MB: CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 @@ -1605,6 +1609,7 @@ repeat_one_match_repeat_encodeBlockAsm2MB: repeat_end_emit_encodeBlockAsm2MB: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm2MB + PCALIGN $0x10 no_repeat_found_encodeBlockAsm2MB: CMPL (BX)(SI*1), DI @@ -1614,8 +1619,8 @@ no_repeat_found_encodeBlockAsm2MB: LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm2MB - MOVL R9, (AX)(R10*4) SHRQ $0x08, DI + MOVL R9, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm2MB MOVL 20(SP), DX @@ -1626,9 +1631,10 @@ candidate3_match_encodeBlockAsm2MB: JMP candidate_match_encodeBlockAsm2MB candidate2_match_encodeBlockAsm2MB: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBlockAsm2MB: MOVL 12(SP), DI @@ -1901,6 +1907,7 @@ one_byte_match_emit_encodeBlockAsm2MB: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16: MOVOU (DI), X0 @@ -2001,6 +2008,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm2MB + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm2MB: // emitCopy @@ -2350,6 +2359,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8: MOVL 
SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16: MOVQ (DX), SI @@ -2469,29 +2479,30 @@ TEXT ·encodeBlockAsm512K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBlockAsm512K: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm512K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -17(DX), BX - LEAQ -17(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm512K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + PCALIGN $0x10 search_loop_encodeBlockAsm512K: MOVL DX, SI @@ -2605,6 +2616,7 @@ one_byte_repeat_emit_lits_encodeBlockAsm512K: CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16: MOVOU (R8), X0 @@ -2823,6 +2835,7 @@ repeat_one_match_repeat_encodeBlockAsm512K: repeat_end_emit_encodeBlockAsm512K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm512K + PCALIGN $0x10 no_repeat_found_encodeBlockAsm512K: CMPL (BX)(SI*1), DI @@ -2832,8 +2845,8 @@ no_repeat_found_encodeBlockAsm512K: LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm512K - MOVL R9, 
(AX)(R10*4) SHRQ $0x08, DI + MOVL R9, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm512K MOVL 20(SP), DX @@ -2844,9 +2857,10 @@ candidate3_match_encodeBlockAsm512K: JMP candidate_match_encodeBlockAsm512K candidate2_match_encodeBlockAsm512K: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBlockAsm512K: MOVL 12(SP), DI @@ -3119,6 +3133,7 @@ one_byte_match_emit_encodeBlockAsm512K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16: MOVOU (DI), X0 @@ -3219,6 +3234,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm512K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm512K: // emitCopy @@ -3568,6 +3585,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16: MOVQ (DX), SI @@ -3687,29 +3705,30 @@ TEXT ·encodeBlockAsm64K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBlockAsm64K: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -17(DX), BX - LEAQ -17(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU 
X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + PCALIGN $0x10 search_loop_encodeBlockAsm64K: MOVL DX, SI @@ -3822,6 +3841,7 @@ one_byte_repeat_emit_lits_encodeBlockAsm64K: CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16: MOVOU (R8), X0 @@ -4040,6 +4060,7 @@ repeat_one_match_repeat_encodeBlockAsm64K: repeat_end_emit_encodeBlockAsm64K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm64K + PCALIGN $0x10 no_repeat_found_encodeBlockAsm64K: CMPL (BX)(SI*1), DI @@ -4049,8 +4070,8 @@ no_repeat_found_encodeBlockAsm64K: LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm64K - MOVW R9, (AX)(R10*2) SHRQ $0x08, DI + MOVW R9, (AX)(R10*2) CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm64K MOVL 20(SP), DX @@ -4061,9 +4082,10 @@ candidate3_match_encodeBlockAsm64K: JMP candidate_match_encodeBlockAsm64K candidate2_match_encodeBlockAsm64K: - MOVW R9, (AX)(R10*2) - INCL DX - MOVL R8, SI + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBlockAsm64K: MOVL 12(SP), DI @@ -4289,6 +4311,7 @@ one_byte_match_emit_encodeBlockAsm64K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16: MOVOU (DI), X0 @@ -4389,6 +4412,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, 
-16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm64K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm64K: // emitCopy @@ -4694,6 +4719,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16: MOVQ (DX), SI @@ -4813,29 +4839,30 @@ TEXT ·encodeBlockAsm16K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBlockAsm16K: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm16K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -17(DX), BX - LEAQ -17(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm16K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + PCALIGN $0x10 search_loop_encodeBlockAsm16K: MOVL DX, SI @@ -4940,6 +4967,7 @@ one_byte_repeat_emit_lits_encodeBlockAsm16K: CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16: MOVOU (R8), X0 @@ -5158,6 +5186,7 @@ repeat_one_match_repeat_encodeBlockAsm16K: repeat_end_emit_encodeBlockAsm16K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm16K 
+ PCALIGN $0x10 no_repeat_found_encodeBlockAsm16K: CMPL (BX)(SI*1), DI @@ -5167,8 +5196,8 @@ no_repeat_found_encodeBlockAsm16K: LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm16K - MOVW R9, (AX)(R10*2) SHRQ $0x08, DI + MOVW R9, (AX)(R10*2) CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm16K MOVL 20(SP), DX @@ -5179,9 +5208,10 @@ candidate3_match_encodeBlockAsm16K: JMP candidate_match_encodeBlockAsm16K candidate2_match_encodeBlockAsm16K: - MOVW R9, (AX)(R10*2) - INCL DX - MOVL R8, SI + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBlockAsm16K: MOVL 12(SP), DI @@ -5399,6 +5429,7 @@ one_byte_match_emit_encodeBlockAsm16K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16: MOVOU (DI), X0 @@ -5499,6 +5530,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm16K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm16K: // emitCopy @@ -5796,6 +5829,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16: MOVQ (DX), SI @@ -5915,29 +5949,30 @@ TEXT ·encodeBlockAsm4K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBlockAsm4K: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm4K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -17(DX), BX - LEAQ -17(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, 
(SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm4K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + PCALIGN $0x10 search_loop_encodeBlockAsm4K: MOVL DX, SI @@ -6039,6 +6074,7 @@ one_byte_repeat_emit_lits_encodeBlockAsm4K: CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16: MOVOU (R8), X0 @@ -6257,6 +6293,7 @@ repeat_one_match_repeat_encodeBlockAsm4K: repeat_end_emit_encodeBlockAsm4K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm4K + PCALIGN $0x10 no_repeat_found_encodeBlockAsm4K: CMPL (BX)(SI*1), DI @@ -6266,8 +6303,8 @@ no_repeat_found_encodeBlockAsm4K: LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm4K - MOVW R9, (AX)(R10*2) SHRQ $0x08, DI + MOVW R9, (AX)(R10*2) CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm4K MOVL 20(SP), DX @@ -6278,9 +6315,10 @@ candidate3_match_encodeBlockAsm4K: JMP candidate_match_encodeBlockAsm4K candidate2_match_encodeBlockAsm4K: - MOVW R9, (AX)(R10*2) - INCL DX - MOVL R8, SI + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBlockAsm4K: MOVL 12(SP), DI @@ -6498,6 +6536,7 @@ one_byte_match_emit_encodeBlockAsm4K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64 + PCALIGN $0x10 
emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16: MOVOU (DI), X0 @@ -6598,6 +6637,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm4K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm4K: // emitCopy @@ -6893,6 +6934,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16: MOVQ (DX), SI @@ -7012,29 +7054,30 @@ TEXT ·encodeBlockAsm1K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBlockAsm1K: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm1K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -17(DX), BX - LEAQ -17(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm1K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + PCALIGN $0x10 search_loop_encodeBlockAsm1K: MOVL DX, SI @@ -7136,6 +7179,7 @@ one_byte_repeat_emit_lits_encodeBlockAsm1K: CMPQ SI, $0x20 JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64 + PCALIGN $0x10 
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16: MOVOU (R8), X0 @@ -7354,6 +7398,7 @@ repeat_one_match_repeat_encodeBlockAsm1K: repeat_end_emit_encodeBlockAsm1K: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm1K + PCALIGN $0x10 no_repeat_found_encodeBlockAsm1K: CMPL (BX)(SI*1), DI @@ -7363,8 +7408,8 @@ no_repeat_found_encodeBlockAsm1K: LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm1K - MOVW R9, (AX)(R10*2) SHRQ $0x08, DI + MOVW R9, (AX)(R10*2) CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm1K MOVL 20(SP), DX @@ -7375,9 +7420,10 @@ candidate3_match_encodeBlockAsm1K: JMP candidate_match_encodeBlockAsm1K candidate2_match_encodeBlockAsm1K: - MOVW R9, (AX)(R10*2) - INCL DX - MOVL R8, SI + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBlockAsm1K: MOVL 12(SP), DI @@ -7595,6 +7641,7 @@ one_byte_match_emit_encodeBlockAsm1K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16: MOVOU (DI), X0 @@ -7695,6 +7742,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm1K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm1K: // emitCopy @@ -7990,6 +8039,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16: MOVQ (DX), SI @@ -8108,1147 +8158,1140 @@ TEXT ·encodeBetterBlockAsm(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) 
- ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -17(AX), DX - LEAQ -17(AX), DI - MOVL DI, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -17(AX), DX + LEAQ -17(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm: - MOVQ tmp+48(FP), DI - MOVL AX, R8 - SUBL 12(SP), R8 - SHRL $0x08, R8 - CMPL R8, $0x63 + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x08, SI + CMPL SI, $0x63 JBE check_maxskip_ok_encodeBetterBlockAsm - LEAL 100(AX), R8 + LEAL 100(AX), SI JMP check_maxskip_cont_encodeBetterBlockAsm check_maxskip_ok_encodeBetterBlockAsm: - LEAL 1(AX)(R8*1), R8 + LEAL 1(AX)(SI*1), SI check_maxskip_cont_encodeBetterBlockAsm: - CMPL R8, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm - MOVQ (DX)(AX*1), R9 - MOVL R8, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R11 - MOVQ $0x9e3779b1, R8 - MOVQ R9, R12 - MOVQ R9, R13 - SHLQ $0x08, R12 - IMULQ R11, R12 - SHRQ $0x2f, R12 - IMULQ R8, R13 - SHRQ $0x32, R13 - MOVL (DI)(R12*4), R8 - MOVL 524288(DI)(R13*4), R10 - MOVL AX, (DI)(R12*4) - MOVL AX, 524288(DI)(R13*4) - LEAL -2162685(AX), R12 - CMPL R8, R12 - JLE offset_ok_0_encodeBetterBlockAsm - MOVQ (DX)(R8*1), BX - CMPQ BX, R9 - JEQ candidate_match_encodeBetterBlockAsm - -offset_ok_0_encodeBetterBlockAsm: - CMPL R10, R12 - JLE offset_ok_1_encodeBetterBlockAsm - MOVQ (DX)(R10*1), SI - CMPQ SI, R9 - -offset_ok_1_encodeBetterBlockAsm: - MOVL AX, R13 - SUBL 16(SP), R13 - MOVQ (DX)(R13*1), 
R13 - MOVQ $0x000000ffffffff00, R14 - XORQ R9, R13 - TESTQ R14, R13 - JNE no_repeat_found_encodeBetterBlockAsm - LEAL 1(AX), DI - MOVL 12(SP), R8 - MOVL DI, R9 - SUBL 16(SP), R9 - JZ repeat_extend_back_end_encodeBetterBlockAsm + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ DI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + MOVL (BX)(R9*4), SI + MOVL AX, (BX)(R9*4) + LEAL -2162685(AX), R9 + CMPL SI, R9 + CMOVLLE R9, SI + MOVQ (DX)(SI*1), R10 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm + MOVL AX, R11 + SUBL 16(SP), R11 + MOVQ (DX)(R11*1), R11 + MOVQ $0x000000ffffffff00, R12 + XORQ DI, R11 + TESTQ R12, R11 + JNE no_repeat_found_encodeBetterBlockAsm + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm repeat_extend_back_loop_encodeBetterBlockAsm: - CMPL DI, R8 + CMPL BX, SI JBE repeat_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(R9*1), R10 - MOVB -1(DX)(DI*1), R11 - CMPB R10, R11 + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 JNE repeat_extend_back_end_encodeBetterBlockAsm - LEAL -1(DI), DI - DECL R9 + LEAL -1(BX), BX + DECL DI JNZ repeat_extend_back_loop_encodeBetterBlockAsm repeat_extend_back_end_encodeBetterBlockAsm: - MOVL DI, R8 - SUBL 12(SP), R8 - LEAQ 4(CX)(R8*1), R8 - CMPQ R8, (SP) + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 4(CX)(SI*1), SI + CMPQ SI, (SP) JB repeat_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBetterBlockAsm: // emitLiteralsDstP - MOVL 12(SP), R8 - CMPL R8, DI + MOVL 12(SP), SI + CMPL SI, BX JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), R10 - SUBL R8, R9 + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI // emitLiteral - LEAL -1(R9), R8 - CMPL R8, $0x1d + LEAL -1(DI), SI + CMPL SI, $0x1d JB one_byte_repeat_emit_encodeBetterBlockAsm 
- SUBL $0x1d, R8 - CMPL R8, $0x00000100 + SUBL $0x1d, SI + CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBetterBlockAsm - CMPL R8, $0x00010000 + CMPL SI, $0x00010000 JB three_bytes_repeat_emit_encodeBetterBlockAsm - MOVL R8, R11 - SHRL $0x10, R11 + MOVL SI, R9 + SHRL $0x10, R9 MOVB $0xf8, (CX) - MOVW R8, 1(CX) - MOVB R11, 3(CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) ADDQ $0x04, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm three_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xf0, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm two_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xe8, (CX) - MOVB R8, 1(CX) - ADDL $0x1d, R8 + MOVB SI, 1(CX) + ADDL $0x1d, SI ADDQ $0x02, CX - CMPL R8, $0x40 + CMPL SI, $0x40 JB memmove_midrepeat_emit_encodeBetterBlockAsm JMP memmove_long_repeat_emit_encodeBetterBlockAsm one_byte_repeat_emit_encodeBetterBlockAsm: - SHLB $0x03, R8 - MOVB R8, (CX) + SHLB $0x03, SI + MOVB SI, (CX) ADDQ $0x01, CX - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 16, min move: 1 - CMPQ R9, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVOU (R10), X0 + MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm 
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_end_copy_repeat_emit_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm memmove_midrepeat_emit_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 15, min move: 30 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm memmove_long_repeat_emit_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, 
R13 - DECQ R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(CX)(R13*1), R14 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 + DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX emit_literal_done_repeat_emit_encodeBetterBlockAsm: ADDL $0x05, AX - MOVL AX, R8 - SUBL 16(SP), R8 - MOVQ src_len+32(FP), R9 - SUBL AX, R9 - LEAQ (DX)(AX*1), R10 - LEAQ (DX)(R8*1), R8 + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R12, R12 + XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm: - MOVQ (R10)(R12*1), R11 - MOVQ 8(R10)(R12*1), R13 - XORQ (R8)(R12*1), R11 + MOVQ (R8)(R10*1), R9 
+ MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm - XORQ 8(R8)(R12*1), R13 + XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm - LEAL -16(R9), R9 - LEAL 16(R12), R12 + LEAL -16(DI), DI + LEAL 16(R10), R10 matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x10 + CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm matchlen_bsf_16repeat_extend_encodeBetterBlockAsm: - TZCNTQ R13, R13 - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 + TZCNTQ R11, R11 + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match8_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x08 + CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeBetterBlockAsm - MOVQ (R10)(R12*1), R11 - XORQ (R8)(R12*1), R11 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm - LEAL -8(R9), R9 - LEAL 8(R12), R12 + LEAL -8(DI), DI + LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm: - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 + TZCNTQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match4_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x04 + CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeBetterBlockAsm - MOVL (R10)(R12*1), R11 - CMPL (R8)(R12*1), R11 + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm - LEAL -4(R9), R9 - LEAL 4(R12), R12 + LEAL -4(DI), DI + LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x01 + CMPL DI, $0x01 JE matchlen_match1_repeat_extend_encodeBetterBlockAsm JB repeat_extend_forward_end_encodeBetterBlockAsm - MOVW (R10)(R12*1), R11 - CMPW (R8)(R12*1), R11 + MOVW (R8)(R10*1), R9 + CMPW 
(SI)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm - LEAL 2(R12), R12 - SUBL $0x02, R9 + LEAL 2(R10), R10 + SUBL $0x02, DI JZ repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match1_repeat_extend_encodeBetterBlockAsm: - MOVB (R10)(R12*1), R11 - CMPB (R8)(R12*1), R11 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm - LEAL 1(R12), R12 + LEAL 1(R10), R10 repeat_extend_forward_end_encodeBetterBlockAsm: - ADDL R12, AX - MOVL AX, R8 - SUBL DI, R8 - MOVL 16(SP), DI + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX // emitRepeat - LEAL -1(R8), DI - CMPL R8, $0x1d + LEAL -1(SI), BX + CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBetterBlockAsm - LEAL -30(R8), DI - CMPL R8, $0x0000011e + LEAL -30(SI), BX + CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBetterBlockAsm - CMPL R8, $0x0001001e + CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL DI, 1(CX) + MOVL BX, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_three_match_repeat_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW DI, 1(CX) + MOVW BX, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_two_match_repeat_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB DI, 1(CX) + MOVB BL, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_one_match_repeat_encodeBetterBlockAsm: - XORL DI, DI - LEAL -4(DI)(R8*8), DI - MOVB DI, (CX) + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBetterBlockAsm: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm: - CMPL R8, R12 - JLE offset_ok_2_encodeBetterBlockAsm - CMPL BX, R9 - JEQ candidate_match_encodeBetterBlockAsm - -offset_ok_2_encodeBetterBlockAsm: - CMPL R10, R12 - JLE offset_ok_3_encodeBetterBlockAsm - CMPL SI, R9 - JEQ candidateS_match_encodeBetterBlockAsm - -offset_ok_3_encodeBetterBlockAsm: - MOVL 
20(SP), AX - JMP search_loop_encodeBetterBlockAsm + MOVQ $0x9e3779b1, R12 + MOVQ DI, R11 + IMULQ R12, R11 + SHRQ $0x32, R11 + MOVL 524288(BX)(R11*4), R12 + MOVL AX, 524288(BX)(R11*4) + MOVQ (DX)(R12*1), R11 + CMPL SI, R9 + CMOVLLE R9, SI + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL R12, R9 + CMOVLLE R9, R12 + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, R9 - MOVQ R9, R13 - SHLQ $0x08, R13 - IMULQ R11, R13 - SHRQ $0x2f, R13 - MOVL (DI)(R13*4), R8 - INCL AX - MOVL AX, (DI)(R13*4) - CMPL R8, R12 - JLE offset_ok_4_encodeBetterBlockAsm - CMPL (DX)(R8*1), R9 - JEQ candidate_match_encodeBetterBlockAsm - -offset_ok_4_encodeBetterBlockAsm: - DECL AX - MOVL R10, R8 + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R8, R10 + SHRQ $0x2f, R10 + MOVL (BX)(R10*4), SI + INCL AX + MOVL AX, (BX)(R10*4) + CMPL SI, R9 + CMOVLLE R9, SI + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm + DECL AX + MOVL R12, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm: - MOVL 12(SP), DI - TESTL R8, R8 + MOVL 12(SP), BX + TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm match_extend_back_loop_encodeBetterBlockAsm: - CMPL AX, DI + CMPL AX, BX JBE match_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(R8*1), R9 - MOVB -1(DX)(AX*1), R10 - CMPB R9, R10 + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm LEAL -1(AX), AX - DECL R8 + DECL SI JZ match_extend_back_end_encodeBetterBlockAsm JMP match_extend_back_loop_encodeBetterBlockAsm match_extend_back_end_encodeBetterBlockAsm: - MOVL AX, DI - SUBL 12(SP), DI - LEAQ 4(CX)(DI*1), DI - CMPQ DI, (SP) + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 4(CX)(BX*1), BX + CMPQ BX, (SP) JB match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm: - MOVL AX, DI + MOVL AX, BX ADDL 
$0x04, AX - ADDL $0x04, R8 - MOVQ src_len+32(FP), R9 - SUBL AX, R9 - LEAQ (DX)(AX*1), R10 - LEAQ (DX)(R8*1), R11 + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 // matchLen - XORL R13, R13 + XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: - MOVQ (R10)(R13*1), R12 - MOVQ 8(R10)(R13*1), R14 - XORQ (R11)(R13*1), R12 + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - XORQ 8(R11)(R13*1), R14 + XORQ 8(R9)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm - LEAL -16(R9), R9 - LEAL 16(R13), R13 + LEAL -16(DI), DI + LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x10 + CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm JMP matchlen_match8_match_nolit_encodeBetterBlockAsm matchlen_bsf_16match_nolit_encodeBetterBlockAsm: - TZCNTQ R14, R14 - SARQ $0x03, R14 - LEAL 8(R13)(R14*1), R13 + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match8_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x08 + CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm - MOVQ (R10)(R13*1), R12 - XORQ (R11)(R13*1), R12 + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - LEAL -8(R9), R9 - LEAL 8(R13), R13 + LEAL -8(DI), DI + LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL (R13)(R12*1), R13 + TZCNTQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match4_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x04 + CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm - MOVL (R10)(R13*1), R12 - CMPL (R11)(R13*1), R12 + MOVL 
(R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm - LEAL -4(R9), R9 - LEAL 4(R13), R13 + LEAL -4(DI), DI + LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x01 + CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm JB match_nolit_end_encodeBetterBlockAsm - MOVW (R10)(R13*1), R12 - CMPW (R11)(R13*1), R12 + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm - LEAL 2(R13), R13 - SUBL $0x02, R9 + LEAL 2(R11), R11 + SUBL $0x02, DI JZ match_nolit_end_encodeBetterBlockAsm matchlen_match1_match_nolit_encodeBetterBlockAsm: - MOVB (R10)(R13*1), R12 - CMPB (R11)(R13*1), R12 + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R13), R13 + LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm: - MOVL AX, R9 - SUBL R8, R9 - CMPL R13, $0x01 + MOVL AX, DI + SUBL SI, DI + CMPL R11, $0x01 JA match_length_ok_encodeBetterBlockAsm - CMPL R9, $0x0001003f + CMPL DI, $0x0001003f JBE match_length_ok_encodeBetterBlockAsm MOVL 20(SP), AX INCL AX JMP search_loop_encodeBetterBlockAsm match_length_ok_encodeBetterBlockAsm: - MOVL R9, 16(SP) + MOVL DI, 16(SP) // Check if we can combine lit+copy - MOVLQZX 12(SP), R10 - MOVL DI, R8 - SUBL R10, R8 + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI JZ match_emit_nolits_encodeBetterBlockAsm - CMPL R9, $0x00000040 + CMPL DI, $0x00000040 JL match_emit_lits_encodeBetterBlockAsm - CMPL R9, $0x0001003f + CMPL DI, $0x0001003f JA match_emit_copy3_encodeBetterBlockAsm - CMPL R8, $0x04 + CMPL SI, $0x04 JA match_emit_lits_encodeBetterBlockAsm - MOVL (DX)(R10*1), R10 - ADDL R13, AX - ADDL $0x04, R13 + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy2WithLits - XORQ R11, R11 - SUBL $0x40, R9 - LEAL -11(R13), R12 - LEAL -4(R13), R13 - MOVW R9, 1(CX) - CMPL R13, $0x07 - CMOVLGE R12, R11 - MOVQ $0x00000007, R9 - CMOVLLT R13, R9 - LEAL -1(R8)(R9*4), R9 - MOVL 
$0x00000003, R12 - LEAL (R12)(R9*8), R9 - MOVB R9, (CX) + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) ADDQ $0x03, CX - MOVL R10, (CX) - ADDQ R8, CX - TESTL R11, R11 + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm // emitRepeat - LEAL -1(R11), R8 - CMPL R11, $0x1d + LEAL -1(R9), SI + CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm - LEAL -30(R11), R8 - CMPL R11, $0x0000011e + LEAL -30(R9), SI + CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm - CMPL R11, $0x0001001e + CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL R8, 1(CX) + MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB R8, 1(CX) + MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm: - XORL R8, R8 - LEAL -4(R8)(R11*8), R8 - MOVB R8, (CX) + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_emit_copy3_encodeBetterBlockAsm: - CMPL R8, $0x03 + CMPL SI, $0x03 JA match_emit_lits_encodeBetterBlockAsm - MOVLQZX 12(SP), R10 - MOVL (DX)(R10*1), R10 - ADDL R13, AX - ADDL $0x04, R13 + MOVLQZX 12(SP), R8 + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy3 - LEAL -4(R13), R13 - LEAL -65536(R9), R9 - SHLL $0x0b, R9 - LEAL 7(R9)(R8*8), R9 - CMPL R13, $0x3c + LEAL -4(R11), R11 + LEAL -65536(DI), DI 
+ SHLL $0x0b, DI + LEAL 7(DI)(SI*8), DI + CMPL R11, $0x3c JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm - LEAL -60(R13), R11 - CMPL R13, $0x0000013c + LEAL -60(R11), R9 + CMPL R11, $0x0000013c JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm - CMPL R13, $0x0001003c + CMPL R11, $0x0001003c JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm - ADDL $0x000007e0, R9 - MOVL R9, (CX) - MOVL R11, 4(CX) + ADDL $0x000007e0, DI + MOVL DI, (CX) + MOVL R9, 4(CX) ADDQ $0x07, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_2_match_emit_lits_encodeBetterBlockAsm: - ADDL $0x000007c0, R9 - MOVL R9, (CX) - MOVW R11, 4(CX) + ADDL $0x000007c0, DI + MOVL DI, (CX) + MOVW R9, 4(CX) ADDQ $0x06, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_1_match_emit_lits_encodeBetterBlockAsm: - ADDL $0x000007a0, R9 - MOVL R9, (CX) - MOVB R11, 4(CX) + ADDL $0x000007a0, DI + MOVL DI, (CX) + MOVB R9, 4(CX) ADDQ $0x05, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_0_match_emit_lits_encodeBetterBlockAsm: - SHLL $0x05, R13 - ORL R13, R9 - MOVL R9, (CX) + SHLL $0x05, R11 + ORL R11, DI + MOVL DI, (CX) ADDQ $0x04, CX match_emit_copy_litsencodeBetterBlockAsm: - MOVL R10, (CX) - ADDQ R8, CX + MOVL R8, (CX) + ADDQ SI, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_emit_lits_encodeBetterBlockAsm: - LEAQ (DX)(R10*1), R10 + LEAQ (DX)(R8*1), R8 // emitLiteral - LEAL -1(R8), R11 - CMPL R11, $0x1d + LEAL -1(SI), R9 + CMPL R9, $0x1d JB one_byte_match_emit_encodeBetterBlockAsm - SUBL $0x1d, R11 - CMPL R11, $0x00000100 + SUBL $0x1d, R9 + CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm - CMPL R11, $0x00010000 + CMPL R9, $0x00010000 JB three_bytes_match_emit_encodeBetterBlockAsm - MOVL R11, R12 - SHRL $0x10, R12 + MOVL R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (CX) - MOVW R11, 1(CX) - MOVB R12, 3(CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) ADDQ $0x04, CX - ADDL $0x1d, R11 + ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm 
three_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf0, (CX) - MOVW R11, 1(CX) + MOVW R9, 1(CX) ADDQ $0x03, CX - ADDL $0x1d, R11 + ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm two_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xe8, (CX) - MOVB R11, 1(CX) - ADDL $0x1d, R11 + MOVB R9, 1(CX) + ADDL $0x1d, R9 ADDQ $0x02, CX - CMPL R11, $0x40 + CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBetterBlockAsm JMP memmove_long_match_emit_encodeBetterBlockAsm one_byte_match_emit_encodeBetterBlockAsm: - SHLB $0x03, R11 - MOVB R11, (CX) + SHLB $0x03, R9 + MOVB R9, (CX) ADDQ $0x01, CX - LEAQ (CX)(R8*1), R11 + LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 - CMPQ R8, $0x10 + CMPQ SI, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVOU (R10), X0 + MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R8*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) + MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R8*1), X2 - MOVOU -16(R10)(R8*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) memmove_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ R11, CX + MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm 
memmove_midmatch_emit_encodeBetterBlockAsm: - LEAQ (CX)(R8*1), R11 + LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 - CMPQ R8, $0x20 + CMPQ SI, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R8*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) + MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R8*1), X2 - MOVOU -16(R10)(R8*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ R11, CX + MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (CX)(R8*1), R11 + LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R8*1), X2 - MOVOU -16(R10)(R8*1), X3 - MOVQ R8, R14 - SHRQ $0x05, R14 - MOVQ CX, R12 - ANDL $0x0000001f, R12 - MOVQ $0x00000040, R15 - SUBQ R12, R15 - DECQ R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R15*1), R12 - LEAQ -32(CX)(R15*1), BP + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R12), X4 - MOVOU 16(R12), X5 - MOVOA X4, (BP) 
- MOVOA X5, 16(BP) - ADDQ $0x20, BP - ADDQ $0x20, R12 - ADDQ $0x20, R15 - DECQ R14 + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R15*1), X4 - MOVOU -16(R10)(R15*1), X5 - MOVOA X4, -32(CX)(R15*1) - MOVOA X5, -16(CX)(R15*1) - ADDQ $0x20, R15 - CMPQ R8, R15 + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ R11, CX + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX match_emit_nolits_encodeBetterBlockAsm: - ADDL R13, AX - ADDL $0x04, R13 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy - CMPL R9, $0x0001003f + CMPL DI, $0x0001003f JBE two_byte_offset_match_nolit_encodeBetterBlockAsm // emitCopy3 - LEAL -4(R13), R13 - LEAL -65536(R9), R8 - SHLL $0x0b, R8 - ADDL $0x07, R8 - CMPL R13, $0x3c + LEAL -4(R11), R11 + LEAL -65536(DI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3 - LEAL -60(R13), R9 - CMPL R13, $0x0000013c + LEAL -60(R11), DI + CMPL R11, $0x0000013c JB emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3 - CMPL R13, $0x0001003c + CMPL R11, $0x0001003c JB emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3 - ADDL $0x000007e0, R8 - MOVL R8, (CX) - MOVL R9, 4(CX) + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) ADDQ $0x07, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3: - ADDL $0x000007c0, R8 - MOVL R8, (CX) - MOVW R9, 4(CX) + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) ADDQ 
$0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3: - ADDL $0x000007a0, R8 - MOVL R8, (CX) - MOVB R9, 4(CX) + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3: - SHLL $0x05, R13 - ORL R13, R8 - MOVL R8, (CX) + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x00000400 + CMPL DI, $0x00000400 JA two_byte_match_nolit_encodeBetterBlockAsm - CMPL R13, $0x00000013 + CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBetterBlockAsm - LEAL -1(R9), R8 - SHLL $0x06, R8 - LEAL -15(R8)(R13*4), R8 - MOVW R8, (CX) + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_one_longer_match_nolit_encodeBetterBlockAsm: - CMPL R13, $0x00000112 + CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm - LEAL -1(R9), R8 - SHLL $0x06, R8 - LEAL 61(R8), R8 - MOVW R8, (CX) - LEAL -18(R13), R8 - MOVB R8, 2(CX) + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy1_repeat_match_nolit_encodeBetterBlockAsm: - LEAL -1(R9), R8 - SHLL $0x06, R8 - LEAL 57(R8), R8 - MOVW R8, (CX) + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) ADDQ $0x02, CX - SUBL $0x12, R13 + SUBL $0x12, R11 // emitRepeat - LEAL -1(R13), R8 - CMPL R13, $0x1d + LEAL -1(R11), SI + CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm - LEAL -30(R13), R8 - CMPL R13, $0x0000011e + LEAL -30(R11), SI + CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm - CMPL R13, $0x0001001e + CMPL R11, $0x0001001e JB 
repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL R8, 1(CX) + MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB R8, 1(CX) + MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: - XORL R8, R8 - LEAL -4(R8)(R13*8), R8 - MOVB R8, (CX) + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_match_nolit_encodeBetterBlockAsm: // emitCopy2 - LEAL -64(R9), R9 - LEAL -4(R13), R13 - MOVW R9, 1(CX) - CMPL R13, $0x3c + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2 - LEAL -60(R13), R8 - CMPL R13, $0x0000013c + LEAL -60(R11), SI + CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2 - CMPL R13, $0x0001003c + CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2 MOVB $0xfe, (CX) - MOVL R8, 3(CX) + MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2: MOVB $0xfa, (CX) - MOVW R8, 3(CX) + MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2: MOVB $0xf6, (CX) - MOVB R8, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2: - MOVL $0x00000002, R8 - LEAL (R8)(R13*4), R8 - MOVB R8, (CX) + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm // 
emitLiteralsDstP - MOVL 12(SP), R8 - CMPL R8, DI + MOVL 12(SP), SI + CMPL SI, BX JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), R10 - SUBL R8, R9 + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI // emitLiteral - LEAL -1(R9), R8 - CMPL R8, $0x1d + LEAL -1(DI), SI + CMPL SI, $0x1d JB one_byte_match_emit_repeat_encodeBetterBlockAsm - SUBL $0x1d, R8 - CMPL R8, $0x00000100 + SUBL $0x1d, SI + CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL R8, $0x00010000 + CMPL SI, $0x00010000 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm - MOVL R8, R11 - SHRL $0x10, R11 + MOVL SI, R9 + SHRL $0x10, R9 MOVB $0xf8, (CX) - MOVW R8, 1(CX) - MOVB R11, 3(CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) ADDQ $0x04, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm three_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf0, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm two_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xe8, (CX) - MOVB R8, 1(CX) - ADDL $0x1d, R8 + MOVB SI, 1(CX) + ADDL $0x1d, SI ADDQ $0x02, CX - CMPL R8, $0x40 + CMPL SI, $0x40 JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm one_byte_match_emit_repeat_encodeBetterBlockAsm: - SHLB $0x03, R8 - MOVB R8, (CX) + SHLB $0x03, SI + MOVB SI, (CX) ADDQ $0x01, CX - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 16, min move: 1 - CMPQ R9, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 + PCALIGN $0x10 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: - MOVOU (R10), X0 + MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_midmatch_emit_repeat_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 15, min move: 30 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU 
X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_long_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R12 + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX + MOVOU X2, 
-32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: - ADDL R13, AX - ADDL $0x04, R13 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitRepeat - LEAL -1(R13), R8 - CMPL R13, $0x1d + LEAL -1(R11), SI + CMPL R11, $0x1d JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm - LEAL -30(R13), R8 - CMPL R13, $0x0000011e + LEAL -30(R11), SI + CMPL R11, $0x0000011e JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm - CMPL R13, $0x0001001e + CMPL R11, $0x0001001e JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL R8, 1(CX) + MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_repeat_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_repeat_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB R8, 1(CX) + MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_match_nolit_repeat_encodeBetterBlockAsm: - XORL R8, R8 - LEAL -4(R8)(R13*8), R8 - MOVB R8, (CX) + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) ADDQ $0x01, CX match_nolit_emitcopy_end_encodeBetterBlockAsm: @@ -9260,51 +9303,51 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm: RET match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ tmp+48(FP), R8 - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, R10 - LEAQ 1(DI), DI - LEAQ -2(AX), R11 - MOVQ (DX)(DI*1), R12 - MOVQ 1(DX)(DI*1), R13 - MOVQ (DX)(R11*1), R14 - MOVQ 1(DX)(R11*1), R15 + MOVQ tmp+48(FP), SI + MOVQ $0x00cf1bbcdcbfa563, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x08, R10 + IMULQ DI, R10 + SHRQ $0x2f, R10 + IMULQ R8, R11 + SHRQ $0x32, R11 SHLQ $0x08, R12 - IMULQ R9, R12 + IMULQ DI, R12 SHRQ $0x2f, R12 - IMULQ R10, R13 + IMULQ R8, R13 SHRQ $0x32, R13 - SHLQ 
$0x08, R14 - IMULQ R9, R14 - SHRQ $0x2f, R14 - IMULQ R10, R15 - SHRQ $0x32, R15 - LEAQ 1(DI), R10 - LEAQ 1(R11), BP - MOVL DI, (R8)(R12*4) - MOVL R11, (R8)(R14*4) - LEAQ 1(R11)(DI*1), R12 - SHRQ $0x01, R12 - ADDQ $0x01, DI - SUBQ $0x01, R11 - MOVL R10, 524288(R8)(R13*4) - MOVL BP, 524288(R8)(R15*4) + LEAQ 1(BX), R8 + LEAQ 1(R9), R14 + MOVL BX, (SI)(R10*4) + MOVL R9, (SI)(R12*4) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVL R8, 524288(SI)(R11*4) + MOVL R14, 524288(SI)(R13*4) index_loop_encodeBetterBlockAsm: - CMPQ R12, R11 + CMPQ R10, R9 JAE search_loop_encodeBetterBlockAsm - MOVQ (DX)(DI*1), R10 - MOVQ (DX)(R12*1), R13 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - SHLQ $0x08, R13 - IMULQ R9, R13 - SHRQ $0x2f, R13 - MOVL DI, (R8)(R10*4) - MOVL R11, (R8)(R13*4) - ADDQ $0x02, DI - ADDQ $0x02, R12 + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x08, R8 + IMULQ DI, R8 + SHRQ $0x2f, R8 + SHLQ $0x08, R11 + IMULQ DI, R11 + SHRQ $0x2f, R11 + MOVL BX, (SI)(R8*4) + MOVL R9, (SI)(R11*4) + ADDQ $0x02, BX + ADDQ $0x02, R10 JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: @@ -9402,6 +9445,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (AX), SI @@ -9522,29 +9566,30 @@ TEXT ·encodeBetterBlockAsm2MB(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm2MB: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm2MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -17(AX), DX - LEAQ -17(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) 
- MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm2MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -17(AX), DX + LEAQ -17(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm2MB: MOVQ tmp+48(FP), BX @@ -9564,30 +9609,22 @@ check_maxskip_cont_encodeBetterBlockAsm2MB: JAE emit_remainder_encodeBetterBlockAsm2MB MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL (BX)(R10*4), SI - MOVL 524288(BX)(R11*4), R8 - MOVL AX, (BX)(R10*4) - MOVL AX, 524288(BX)(R11*4) - MOVQ (DX)(SI*1), R10 - CMPQ R10, DI + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ DI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + MOVL (BX)(R9*4), SI + MOVL AX, (BX)(R9*4) + MOVQ (DX)(SI*1), R9 + CMPQ R9, DI JEQ candidate_match_encodeBetterBlockAsm2MB - MOVQ (DX)(R8*1), R11 - CMPQ R11, DI - MOVL AX, R12 - SUBL 16(SP), R12 - MOVQ (DX)(R12*1), R12 - MOVQ $0x000000ffffffff00, R13 - XORQ DI, R12 - TESTQ R13, R12 + MOVL AX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R10 + MOVQ $0x000000ffffffff00, R11 + XORQ DI, R10 + TESTQ R11, R10 JNE no_repeat_found_encodeBetterBlockAsm2MB LEAL 1(AX), BX MOVL 12(SP), SI @@ -9672,6 +9709,7 @@ one_byte_repeat_emit_encodeBetterBlockAsm2MB: CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 @@ -9890,28 +9928,37 @@ 
repeat_one_match_repeat_encodeBetterBlockAsm2MB: repeat_end_emit_encodeBetterBlockAsm2MB: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm2MB + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm2MB: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm2MB - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm2MB - MOVL 20(SP), AX - JMP search_loop_encodeBetterBlockAsm2MB - -candidateS_match_encodeBetterBlockAsm2MB: - SHRQ $0x08, DI + MOVQ $0x9e3779b1, R11 MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - MOVL (BX)(R10*4), SI - INCL AX - MOVL AX, (BX)(R10*4) - CMPL (DX)(SI*1), DI + IMULQ R11, R10 + SHRQ $0x32, R10 + MOVL 524288(BX)(R10*4), R11 + MOVL AX, 524288(BX)(R10*4) + MOVQ (DX)(R11*1), R10 + CMPL R9, DI JEQ candidate_match_encodeBetterBlockAsm2MB - DECL AX - MOVL R8, SI + CMPL R10, DI + JEQ candidateS_match_encodeBetterBlockAsm2MB + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm2MB + +candidateS_match_encodeBetterBlockAsm2MB: + SHRQ $0x08, DI + MOVQ DI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + MOVL (BX)(R9*4), SI + INCL AX + MOVL AX, (BX)(R9*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm2MB + DECL AX + MOVL R11, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm2MB: MOVL 12(SP), BX @@ -10200,6 +10247,7 @@ one_byte_match_emit_encodeBetterBlockAsm2MB: CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 @@ -10506,6 +10554,7 @@ one_byte_match_emit_repeat_encodeBetterBlockAsm2MB: CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 @@ -10795,6 +10844,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVQ (AX), SI @@ -10915,29 +10965,30 @@ TEXT ·encodeBetterBlockAsm512K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm512K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm512K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm512K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm512K: MOVQ tmp+48(FP), BX @@ -10957,30 +11008,22 @@ check_maxskip_cont_encodeBetterBlockAsm512K: JAE emit_remainder_encodeBetterBlockAsm512K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - IMULQ SI, R11 - SHRQ $0x33, R11 - MOVL (BX)(R10*4), SI - MOVL 262144(BX)(R11*4), R8 - MOVL AX, (BX)(R10*4) - MOVL AX, 262144(BX)(R11*4) - MOVQ (DX)(SI*1), R10 - CMPQ R10, DI + MOVQ 
$0x00cf1bbcdcbfa563, R8 + MOVQ DI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL (BX)(R9*4), SI + MOVL AX, (BX)(R9*4) + MOVQ (DX)(SI*1), R9 + CMPQ R9, DI JEQ candidate_match_encodeBetterBlockAsm512K - MOVQ (DX)(R8*1), R11 - CMPQ R11, DI - MOVL AX, R12 - SUBL 16(SP), R12 - MOVQ (DX)(R12*1), R12 - MOVQ $0x000000ffffffff00, R13 - XORQ DI, R12 - TESTQ R13, R12 + MOVL AX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R10 + MOVQ $0x000000ffffffff00, R11 + XORQ DI, R10 + TESTQ R11, R10 JNE no_repeat_found_encodeBetterBlockAsm512K LEAL 1(AX), BX MOVL 12(SP), SI @@ -11072,6 +11115,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (R8), R9 @@ -11292,28 +11336,37 @@ repeat_one_match_repeat_encodeBetterBlockAsm512K: repeat_end_emit_encodeBetterBlockAsm512K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm512K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm512K: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm512K - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm512K - MOVL 20(SP), AX - JMP search_loop_encodeBetterBlockAsm512K - -candidateS_match_encodeBetterBlockAsm512K: - SHRQ $0x08, DI + MOVQ $0x9e3779b1, R11 MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL (BX)(R10*4), SI - INCL AX - MOVL AX, (BX)(R10*4) - CMPL (DX)(SI*1), DI + IMULQ R11, R10 + SHRQ $0x33, R10 + MOVL 262144(BX)(R10*4), R11 + MOVL AX, 262144(BX)(R10*4) + MOVQ (DX)(R11*1), R10 + CMPL R9, DI JEQ candidate_match_encodeBetterBlockAsm512K - DECL AX - MOVL R8, SI + CMPL R10, DI + JEQ candidateS_match_encodeBetterBlockAsm512K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm512K + +candidateS_match_encodeBetterBlockAsm512K: + SHRQ $0x08, DI + MOVQ DI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL (BX)(R9*4), SI + INCL AX + MOVL 
AX, (BX)(R9*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm512K + DECL AX + MOVL R11, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm512K: MOVL 12(SP), BX @@ -11609,6 +11662,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (R8), R10 @@ -11924,6 +11978,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (R8), R9 @@ -12215,6 +12270,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (AX), SI @@ -12335,29 +12391,30 @@ TEXT ·encodeBetterBlockAsm64K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm64K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + 
SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm64K: MOVQ tmp+48(FP), BX @@ -12369,30 +12426,22 @@ search_loop_encodeBetterBlockAsm64K: JAE emit_remainder_encodeBetterBlockAsm64K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x31, R10 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVWLZX (BX)(R10*2), SI - MOVWLZX 65536(BX)(R11*2), R8 - MOVW AX, (BX)(R10*2) - MOVW AX, 65536(BX)(R11*2) - MOVQ (DX)(SI*1), R10 - CMPQ R10, DI + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x31, R9 + MOVWLZX (BX)(R9*2), SI + MOVW AX, (BX)(R9*2) + MOVQ (DX)(SI*1), R9 + CMPQ R9, DI JEQ candidate_match_encodeBetterBlockAsm64K - MOVQ (DX)(R8*1), R11 - CMPQ R11, DI - MOVL AX, R12 - SUBL 16(SP), R12 - MOVQ (DX)(R12*1), R12 - MOVQ $0x000000ffffffff00, R13 - XORQ DI, R12 - TESTQ R13, R12 + MOVL AX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R10 + MOVQ $0x000000ffffffff00, R11 + XORQ DI, R10 + TESTQ R11, R10 JNE no_repeat_found_encodeBetterBlockAsm64K LEAL 1(AX), BX MOVL 12(SP), SI @@ -12483,6 +12532,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R9 @@ -12703,28 +12753,37 @@ repeat_one_match_repeat_encodeBetterBlockAsm64K: repeat_end_emit_encodeBetterBlockAsm64K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm64K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm64K: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm64K - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm64K - MOVL 20(SP), AX - JMP search_loop_encodeBetterBlockAsm64K + MOVQ $0x9e3779b1, R11 + MOVQ DI, R10 + IMULQ R11, R10 + 
SHRQ $0x34, R10 + MOVWLZX 65536(BX)(R10*2), R11 + MOVW AX, 65536(BX)(R10*2) + MOVQ (DX)(R11*1), R10 + CMPL R9, DI + JEQ candidate_match_encodeBetterBlockAsm64K + CMPL R10, DI + JEQ candidateS_match_encodeBetterBlockAsm64K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm64K candidateS_match_encodeBetterBlockAsm64K: SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x31, R10 - MOVWLZX (BX)(R10*2), SI + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x31, R9 + MOVWLZX (BX)(R9*2), SI INCL AX - MOVW AX, (BX)(R10*2) + MOVW AX, (BX)(R9*2) CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm64K DECL AX - MOVL R8, SI + MOVL R11, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm64K: MOVL 12(SP), BX @@ -12956,6 +13015,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R10 @@ -13227,6 +13287,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R9 @@ -13517,6 +13578,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (AX), SI @@ -13637,29 +13699,30 @@ TEXT ·encodeBetterBlockAsm16K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm16K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm16K - MOVL 
$0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm16K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm16K: MOVQ tmp+48(FP), BX @@ -13671,30 +13734,22 @@ search_loop_encodeBetterBlockAsm16K: JAE emit_remainder_encodeBetterBlockAsm16K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - IMULQ SI, R11 - SHRQ $0x35, R11 - MOVWLZX (BX)(R10*2), SI - MOVWLZX 32768(BX)(R11*2), R8 - MOVW AX, (BX)(R10*2) - MOVW AX, 32768(BX)(R11*2) - MOVQ (DX)(SI*1), R10 - CMPQ R10, DI + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVWLZX (BX)(R9*2), SI + MOVW AX, (BX)(R9*2) + MOVQ (DX)(SI*1), R9 + CMPQ R9, DI JEQ candidate_match_encodeBetterBlockAsm16K - MOVQ (DX)(R8*1), R11 - CMPQ R11, DI - MOVL AX, R12 - SUBL 16(SP), R12 - MOVQ (DX)(R12*1), R12 - MOVQ $0x000000ffffffff00, R13 - XORQ DI, R12 - TESTQ R13, R12 + MOVL AX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R10 + MOVQ $0x000000ffffffff00, R11 + XORQ DI, R10 + TESTQ R11, R10 JNE no_repeat_found_encodeBetterBlockAsm16K LEAL 1(AX), BX MOVL 12(SP), SI @@ -13777,6 +13832,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K + PCALIGN 
$0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R9 @@ -13997,28 +14053,37 @@ repeat_one_match_repeat_encodeBetterBlockAsm16K: repeat_end_emit_encodeBetterBlockAsm16K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm16K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm16K: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm16K - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm16K - MOVL 20(SP), AX - JMP search_loop_encodeBetterBlockAsm16K + MOVQ $0x9e3779b1, R11 + MOVQ DI, R10 + IMULQ R11, R10 + SHRQ $0x35, R10 + MOVWLZX 32768(BX)(R10*2), R11 + MOVW AX, 32768(BX)(R10*2) + MOVQ (DX)(R11*1), R10 + CMPL R9, DI + JEQ candidate_match_encodeBetterBlockAsm16K + CMPL R10, DI + JEQ candidateS_match_encodeBetterBlockAsm16K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm16K candidateS_match_encodeBetterBlockAsm16K: SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVWLZX (BX)(R10*2), SI + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVWLZX (BX)(R9*2), SI INCL AX - MOVW AX, (BX)(R10*2) + MOVW AX, (BX)(R9*2) CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm16K DECL AX - MOVL R8, SI + MOVL R11, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm16K: MOVL 12(SP), BX @@ -14242,6 +14307,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R10 @@ -14505,6 +14571,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R9 @@ -14787,6 +14854,7 @@ 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (AX), SI @@ -14907,29 +14975,30 @@ TEXT ·encodeBetterBlockAsm4K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm4K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm4K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm4K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm4K: MOVQ tmp+48(FP), BX @@ -14941,30 +15010,22 @@ search_loop_encodeBetterBlockAsm4K: JAE emit_remainder_encodeBetterBlockAsm4K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVWLZX (BX)(R10*2), SI - MOVWLZX 8192(BX)(R11*2), R8 - MOVW AX, (BX)(R10*2) - MOVW AX, 8192(BX)(R11*2) - MOVQ (DX)(SI*1), R10 - CMPQ R10, DI + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVWLZX (BX)(R9*2), SI + MOVW AX, (BX)(R9*2) + MOVQ 
(DX)(SI*1), R9 + CMPQ R9, DI JEQ candidate_match_encodeBetterBlockAsm4K - MOVQ (DX)(R8*1), R11 - CMPQ R11, DI - MOVL AX, R12 - SUBL 16(SP), R12 - MOVQ (DX)(R12*1), R12 - MOVQ $0x000000ffffffff00, R13 - XORQ DI, R12 - TESTQ R13, R12 + MOVL AX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R10 + MOVQ $0x000000ffffffff00, R11 + XORQ DI, R10 + TESTQ R11, R10 JNE no_repeat_found_encodeBetterBlockAsm4K LEAL 1(AX), BX MOVL 12(SP), SI @@ -15047,6 +15108,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (R8), R9 @@ -15267,28 +15329,37 @@ repeat_one_match_repeat_encodeBetterBlockAsm4K: repeat_end_emit_encodeBetterBlockAsm4K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm4K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm4K: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm4K - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm4K - MOVL 20(SP), AX - JMP search_loop_encodeBetterBlockAsm4K + MOVQ $0x9e3779b1, R11 + MOVQ DI, R10 + IMULQ R11, R10 + SHRQ $0x36, R10 + MOVWLZX 8192(BX)(R10*2), R11 + MOVW AX, 8192(BX)(R10*2) + MOVQ (DX)(R11*1), R10 + CMPL R9, DI + JEQ candidate_match_encodeBetterBlockAsm4K + CMPL R10, DI + JEQ candidateS_match_encodeBetterBlockAsm4K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm4K candidateS_match_encodeBetterBlockAsm4K: SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVWLZX (BX)(R10*2), SI + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVWLZX (BX)(R9*2), SI INCL AX - MOVW AX, (BX)(R10*2) + MOVW AX, (BX)(R9*2) CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm4K DECL AX - MOVL R8, SI + MOVL R11, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm4K: MOVL 12(SP), BX @@ -15512,6 +15583,7 @@ 
emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (R8), R10 @@ -15775,6 +15847,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (R8), R9 @@ -16057,6 +16130,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (AX), SI @@ -16177,29 +16251,30 @@ TEXT ·encodeBetterBlockAsm1K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm1K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm1K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm1K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm1K: MOVQ tmp+48(FP), BX @@ -16211,30 +16286,22 
@@ search_loop_encodeBetterBlockAsm1K: JAE emit_remainder_encodeBetterBlockAsm1K MOVQ (DX)(AX*1), DI MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x35, R10 - IMULQ SI, R11 - SHRQ $0x38, R11 - MOVWLZX (BX)(R10*2), SI - MOVWLZX 4096(BX)(R11*2), R8 - MOVW AX, (BX)(R10*2) - MOVW AX, 4096(BX)(R11*2) - MOVQ (DX)(SI*1), R10 - CMPQ R10, DI + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x35, R9 + MOVWLZX (BX)(R9*2), SI + MOVW AX, (BX)(R9*2) + MOVQ (DX)(SI*1), R9 + CMPQ R9, DI JEQ candidate_match_encodeBetterBlockAsm1K - MOVQ (DX)(R8*1), R11 - CMPQ R11, DI - MOVL AX, R12 - SUBL 16(SP), R12 - MOVQ (DX)(R12*1), R12 - MOVQ $0x000000ffffffff00, R13 - XORQ DI, R12 - TESTQ R13, R12 + MOVL AX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R10 + MOVQ $0x000000ffffffff00, R11 + XORQ DI, R10 + TESTQ R11, R10 JNE no_repeat_found_encodeBetterBlockAsm1K LEAL 1(AX), BX MOVL 12(SP), SI @@ -16317,6 +16384,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R9 @@ -16537,28 +16605,37 @@ repeat_one_match_repeat_encodeBetterBlockAsm1K: repeat_end_emit_encodeBetterBlockAsm1K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm1K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm1K: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm1K - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm1K - MOVL 20(SP), AX - JMP search_loop_encodeBetterBlockAsm1K + MOVQ $0x9e3779b1, R11 + MOVQ DI, R10 + IMULQ R11, R10 + SHRQ $0x38, R10 + MOVWLZX 4096(BX)(R10*2), R11 + MOVW AX, 4096(BX)(R10*2) + MOVQ (DX)(R11*1), R10 + CMPL R9, DI + JEQ candidate_match_encodeBetterBlockAsm1K + CMPL R10, DI + JEQ candidateS_match_encodeBetterBlockAsm1K + MOVL 20(SP), AX + JMP 
search_loop_encodeBetterBlockAsm1K candidateS_match_encodeBetterBlockAsm1K: SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x35, R10 - MOVWLZX (BX)(R10*2), SI + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x35, R9 + MOVWLZX (BX)(R9*2), SI INCL AX - MOVW AX, (BX)(R10*2) + MOVW AX, (BX)(R9*2) CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm1K DECL AX - MOVL R8, SI + MOVL R11, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm1K: MOVL 12(SP), BX @@ -16782,6 +16859,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R10 @@ -17045,6 +17123,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R9 @@ -17327,6 +17406,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (AX), SI @@ -17524,6 +17604,7 @@ emit_lit_memmove_standalone_memmove_move_4through8: MOVL SI, (AX) MOVL CX, -4(AX)(DX*1) JMP emit_literal_end_standalone + PCALIGN $0x10 emit_lit_memmove_standalone_memmove_move_8through16: MOVQ (CX), SI @@ -18215,6 +18296,7 @@ emit_lit_memmove_lz4_mz_memmove_move_4through8: MOVL R13, (AX) MOVL R14, -4(AX)(R12*1) JMP memmove_end_copy_lz4_mz + PCALIGN $0x10 emit_lit_memmove_lz4_mz_memmove_move_8through16: MOVQ (DX), R13 @@ -18528,6 +18610,7 @@ emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8: MOVL R12, (AX) MOVL DI, -4(AX)(R10*1) JMP memmove_end_copy_lz4_mz_emitcopy + 
PCALIGN $0x10 emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16: MOVQ (DI), R12 @@ -18912,6 +18995,7 @@ emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8: MOVL BX, (AX) MOVL DI, -4(AX)(DX*1) JMP memmove_end_copy_lz4_mz_emit_final + PCALIGN $0x10 emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16: MOVQ (DI), BX @@ -19056,12 +19140,14 @@ TEXT ·decodeBlockAsm(SB), $8-56 MOVBQZX (R8), R10 MOVQ R10, R11 SHRQ $0x02, R11 + PCALIGN $0x10 decodeBlockAsm_fast_loop_nofetch: - CMPQ SI, BX - JAE decodeBlockAsm_fast_end_copy - ANDQ $0x03, R10 - JNZ decodeBlockAsm_fast_copy + CMPQ SI, BX + JAE decodeBlockAsm_fast_end_copy + ANDQ $0x03, R10 + JNZ decodeBlockAsm_fast_copy + PCALIGN $0x10 decodeBlockAsm_fast_lits: MOVL R11, R12 @@ -19072,6 +19158,7 @@ decodeBlockAsm_fast_lits: CMPL R12, $0x1e JEQ decodeBlockAsm_fast_lit_2 JMP decodeBlockAsm_fast_lit_3 + PCALIGN $0x10 decodeBlockAsm_fast_lit_0: INCQ R8 @@ -19092,6 +19179,7 @@ decodeBlockAsm_fast_lit_0: CMPQ R12, $0x20 JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32 JMP decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64 + PCALIGN $0x10 decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16: MOVOU (R8), X0 @@ -19229,6 +19317,7 @@ decodeBlockAsm_fast_copy: JB decodeBlockAsm_fast_copy_1 JEQ decodeBlockAsm_fast_copy_2 JMP decodeBlockAsm_fast_copy_3 + PCALIGN $0x10 decodeBlockAsm_fast_copy_1: MOVWQZX R13, R9 @@ -19246,6 +19335,7 @@ decodeBlockAsm_fast_copy_1: CMOVLEQ R11, R12 CMOVQEQ R10, R8 JMP decodeBlockAsm_fast_copy_exec + PCALIGN $0x10 decodeBlockAsm_fast_copy_2: MOVQ R11, R12 @@ -19287,6 +19377,7 @@ decodeBlockAsm_fast_copy_2_0_extra: LEAL 4(R12), R12 ADDQ $0x40, R9 JMP decodeBlockAsm_fast_copy_short_no_ol + PCALIGN $0x10 decodeBlockAsm_fast_copy_3: MOVL R13, R9 @@ -19356,6 +19447,7 @@ decodeBlockAsm_fast_copy_fused_long: ADDQ R10, SI ADDQ R10, DI JMP decodeBlockAsm_fast_copy_exec_long_long + PCALIGN $0x10 decodeBlockAsm_fast_copy_exec_short: CMPL R9, DI @@ -19371,6 +19463,7 @@ 
decodeBlockAsm_fast_copy_exec_short: CMPL R9, R12 JB decodeBlockAsm_fast_copy_overlap JMP decodeBlockAsm_fast_copy_short + PCALIGN $0x10 decodeBlockAsm_fast_copy_exec_long_long: MOVQ SI, R11 @@ -19407,6 +19500,7 @@ decodeBlockAsm_fast_copy_long_longlarge_big_loop_back: MOVOU X0, -32(R14)(R15*1) MOVOU X1, -16(R14)(R15*1) JMP decodeBlockAsm_fast_copy_done + PCALIGN $0x10 decodeBlockAsm_fast_copy_short_no_ol: MOVQ SI, R11 @@ -19427,6 +19521,7 @@ decodeBlockAsm_fast_copy_short_no_ol: CMPQ R12, $0x20 JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32 JMP decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64 + PCALIGN $0x10 decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16: MOVOU (R11), X0 @@ -19450,6 +19545,7 @@ decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64: MOVOU X2, -32(SI)(R12*1) MOVOU X3, -16(SI)(R12*1) JMP decodeBlockAsm_fast_copy_done + PCALIGN $0x10 decodeBlockAsm_fast_copy_exec: CMPL R9, DI @@ -19475,6 +19571,7 @@ decodeBlockAsm_fast_copy_short: CMPQ R12, $0x20 JBE decodeBlockAsm_fast_copy_short_memmove_move_17through32 JMP decodeBlockAsm_fast_copy_short_memmove_move_33through64 + PCALIGN $0x10 decodeBlockAsm_fast_copy_short_memmove_move_8through16: MOVOU (R11), X0 @@ -19642,6 +19739,7 @@ decodeBlockAsm_remain_loop: JAE decodeBlockAsm_remain_end_copy ANDQ $0x03, DX JNZ decodeBlockAsm_remain_copy + PCALIGN $0x10 decodeBlockAsm_remain_lits: MOVL BX, DX @@ -19652,6 +19750,7 @@ decodeBlockAsm_remain_lits: CMPL DX, $0x1e JEQ decodeBlockAsm_remain_lit_2 JMP decodeBlockAsm_remain_lit_3 + PCALIGN $0x10 decodeBlockAsm_remain_lit_0: INCQ R8 @@ -19698,6 +19797,7 @@ decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8: MOVL BX, (SI) MOVL R10, -4(SI)(DX*1) JMP decodeBlockAsm_remain_litcopy_done + PCALIGN $0x10 decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16: MOVQ (R8), BX @@ -19842,6 +19942,7 @@ decodeBlockAsm_remain_copy: JB decodeBlockAsm_remain_copy_1 JEQ decodeBlockAsm_remain_copy_2 JMP 
decodeBlockAsm_remain_copy_3 + PCALIGN $0x10 decodeBlockAsm_remain_copy_1: ADDQ $0x02, R8 @@ -19864,6 +19965,7 @@ decodeBlockAsm_remain_copy_1: decodeBlockAsm_remain_copy_1_short: LEAL 4(DX), DX JMP decodeBlockAsm_remain_copy_exec_short + PCALIGN $0x10 decodeBlockAsm_remain_copy_2: MOVQ BX, DX @@ -19910,6 +20012,7 @@ decodeBlockAsm_remain_copy_2_0_extra: LEAL 4(DX), DX ADDQ $0x40, R9 JMP decodeBlockAsm_remain_copy_short_no_ol + PCALIGN $0x10 decodeBlockAsm_remain_copy_3: ADDQ $0x04, R8 @@ -20071,6 +20174,7 @@ decodeBlockAsm_remain_copy3_fused_lits_done: ADDQ R10, SI ADDQ R10, DI JMP decodeBlockAsm_remain_copy_exec_long_long + PCALIGN $0x10 decodeBlockAsm_remain_copy_exec_short: CMPL R9, DI @@ -20083,6 +20187,7 @@ decodeBlockAsm_remain_copy_exec_short: CMPL R9, DX JB decodeBlockAsm_remain_copy_overlap JMP decodeBlockAsm_remain_copy_short + PCALIGN $0x10 decodeBlockAsm_remain_copy_exec_long_long: MOVQ SI, BX @@ -20116,6 +20221,7 @@ decodeBlockAsm_remain_copy_long_longlarge_big_loop_back: MOVOU X0, -32(R11)(R12*1) MOVOU X1, -16(R11)(R12*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_short_no_ol: MOVQ SI, BX @@ -20142,6 +20248,7 @@ decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8: MOVL R10, (SI) MOVL BX, -4(SI)(DX*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16: MOVQ (BX), R10 @@ -20167,6 +20274,7 @@ decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64: MOVOU X2, -32(SI)(DX*1) MOVOU X3, -16(SI)(DX*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_exec: CMPL R9, DI @@ -20215,6 +20323,7 @@ decodeBlockAsm_remain_copy_short_memmove_move_4through8: MOVL R10, (SI) MOVL BX, -4(SI)(DX*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_short_memmove_move_8through16: MOVQ (BX), R10