154 changes: 116 additions & 38 deletions _generate/gen.go
@@ -31,8 +31,12 @@ import (
"github.com/mmcloughlin/avo/reg"
)

// insert extra checks here and there.
const debug = false
const (
// insert extra checks here and there.
debug = false
// matchOffsetCMOV is true if we should use CMOV to check match offsets.
matchOffsetCMOV = true
)
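
For reference, the two strategies this constant selects between look like this in scalar Go. A minimal sketch: cand, minPos and checkMatch stand in for the registers and the emitted ifok() body; none of these names appear in the PR.

    // Branchless (matchOffsetCMOV == true): clamp the candidate with
    // CMPL+CMOVLLE, then run the match probe unconditionally.
    func offsetCheckCMOV(cand, minPos int32, checkMatch func(int32)) {
        if cand <= minPos {
            cand = minPos // emitted as CMOVLLE; no jump to mispredict
        }
        checkMatch(cand)
    }

    // Branchy (matchOffsetCMOV == false): CMPL+JLE skips the probe
    // entirely when the candidate offset is out of range.
    func offsetCheckJump(cand, minPos int32, checkMatch func(int32)) {
        if cand > minPos {
            checkMatch(cand)
        }
    }

The branchless form still runs the probe when the offset is out of range, just clamped to minPos; the win is replacing a hard-to-predict jump with a data dependency.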

func main() {
flag.Parse()
@@ -180,6 +184,20 @@ func (r regTable) LoadIdx(idx, dst reg.GPVirtual) {
}
}

// Pretty bad performance: XCHG with a memory operand is implicitly LOCKed on x86, so every swap is a full atomic read-modify-write.
func (r regTable) XchIdx(idx, val reg.GPVirtual) {
switch r.scale {
case 1:
XCHGB(Mem{Base: r.r, Index: idx, Scale: r.scale, Disp: r.disp}, val.As8())
case 2:
XCHGW(Mem{Base: r.r, Index: idx, Scale: r.scale, Disp: r.disp}, val.As16())
case 4:
XCHGL(Mem{Base: r.r, Index: idx, Scale: r.scale, Disp: r.disp}, val.As32())
default:
panic(r.scale)
}
}
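
Because of that implicit LOCK, the unlocked load-then-store pair used elsewhere in this PR is the cheaper way to swap a table slot when no concurrent writers exist; a sketch using the same receiver:

    // Non-atomic equivalent of XchIdx: read the old entry into a scratch
    // register, then overwrite the slot. Two cheap ops, no bus lock.
    old := GP32()
    r.LoadIdx(idx, old) // old = table[idx]
    r.SaveIdx(val, idx) // table[idx] = val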

func (r regTable) SaveIdx(val, idx reg.GPVirtual) {
switch r.scale {
case 1:
@@ -383,6 +401,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
Load(Param("src").Base(), src)

// Load cv
PCALIGN(16)
Label("search_loop_" + name)
candidate := GP32()
{
@@ -425,12 +444,20 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
ifok()
return
}
skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name)
ccCounter++
CMPL(cand.As32(), minPos.As32())
JLE(LabelRef(skip))
ifok()
Label(skip)
if matchOffsetCMOV {
// Use CMOV over JLE to avoid a jump.
// Intel seems to favor this.
CMPL(cand.As32(), minPos.As32())
CMOVLLE(minPos.As32(), cand.As32())
ifok()
} else {
skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name)
ccCounter++
CMPL(cand.As32(), minPos.As32())
JLE(LabelRef(skip))
ifok()
Label(skip)
}
}
assert(func(ok LabelRef) {
// Check if s is valid (we should have jumped above if not)
@@ -600,6 +627,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
}
JMP(LabelRef("search_loop_" + name))
}
PCALIGN(16)
Label("no_repeat_found_" + name)
{
// Check candidates are ok. All must be < s and < len(src)
@@ -658,12 +686,12 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
JEQ(LabelRef("candidate2_match_" + name))
})

// table[hash2] = uint32(s + 2)
table.SaveIdx(tmp, hash2)

// cv >>= 8 (>> 16 total)
SHRQ(U8(8), cv)

// table[hash2] = uint32(s + 2)
table.SaveIdx(tmp, hash2)

// if uint32(cv>>16) == load32(src, candidate)
checkCandidate(candidate, func() {
CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
@@ -690,6 +718,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
}
}

PCALIGN(16)
Label("candidate_match_" + name)
// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
// Extend backwards
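
In plain Go, the backwards extension performed here is the usual byte-at-a-time walk. A sketch: s, candidate and src match the generated code's roles; nextEmit (the end of the pending literals) is an illustrative name, not taken from this PR.

    // Widen the match while the preceding bytes agree and we stay clear of
    // already-consumed input.
    for s > nextEmit && candidate > 0 && src[s-1] == src[candidate-1] {
        s--
        candidate--
    }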
@@ -829,6 +858,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
o.emitLiteral("match_emit_"+name, litLen, nil, dst, litSrc, LabelRef("match_nolits_copy_"+name), true)
}

PCALIGN(16)
Label("match_nolits_copy_" + name)
o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
Label("match_nolit_emitcopy_end_" + name)
@@ -1167,6 +1197,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
Load(Param("src").Base(), src)

// Load cv
PCALIGN(16)
Label("search_loop_" + name)
reloadTables("tmp", &sTab, &lTab)
candidate := GP32()
@@ -1226,28 +1257,24 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
// move nextS to stack.
MOVL(nextS.As32(), nextSTempL)

candidateS := GP32()
lHasher := hashN(o, lHashBytes, lTableBits)
{
sHasher := hashN(o, sHashBytes, sTableBits)
hash0, hash1 := GP64(), GP64()
hash0 := GP64()
MOVQ(cv, hash0)
MOVQ(cv, hash1)
lHasher.hash(hash0)
sHasher.hash(hash1)
lTab.LoadIdx(hash0, candidate)
sTab.LoadIdx(hash1, candidateS)
assert(func(ok LabelRef) {
CMPQ(hash0, U32(lTableSize))
JB(ok)
})
assert(func(ok LabelRef) {
CMPQ(hash1, U32(sTableSize))
JB(ok)
})

lTab.SaveIdx(s, hash0)
sTab.SaveIdx(s, hash1)
if false {
MOVL(s, candidate)
lTab.XchIdx(hash0, candidate)
} else {
// Load candidate from lTab
lTab.LoadIdx(hash0, candidate)
// Store s in lTab
lTab.SaveIdx(s, hash0)
}
}
// Check if offset exceeds max
var ccCounter int
@@ -1261,12 +1288,20 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
ifok()
return
}
skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name)
ccCounter++
CMPL(cand.As32(), minPos.As32())
JLE(LabelRef(skip))
ifok()
Label(skip)
if matchOffsetCMOV {
// Use CMOV over JLE to avoid a jump.
// Intel seems to favor this.
CMPL(cand.As32(), minPos.As32())
CMOVLLE(minPos.As32(), cand.As32())
ifok()
} else {
skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name)
ccCounter++
CMPL(cand.As32(), minPos.As32())
JL(LabelRef(skip))
ifok()
Label(skip)
}
}
longVal := GP64()
shortVal := GP64()
@@ -1278,12 +1313,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
JEQ(LabelRef("candidate_match_" + name))
})

// Load short early...
checkCandidate(candidateS, func() {
MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal)
CMPQ(shortVal, cv.As64())
})

// En/disable repeat matching.
if true {
// Check repeat at offset checkRep
@@ -1419,7 +1448,30 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
}
JMP(LabelRef("search_loop_" + name))
}
PCALIGN(16)
Label("no_repeat_found_" + name)
candidateS := GP32()
{
sHasher := hashN(o, sHashBytes, sTableBits)
hash1 := GP64()
MOVQ(cv, hash1)
sHasher.hash(hash1)
assert(func(ok LabelRef) {
CMPQ(hash1, U32(sTableSize))
JB(ok)
})
if false {
MOVL(s.As32(), candidateS)
sTab.XchIdx(hash1, candidateS)
} else {
// Load candidateS from sTab
sTab.LoadIdx(hash1, candidateS)
// Store s in sTab
sTab.SaveIdx(s, hash1)
}
// Load short early...
MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal)
}
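
Net effect of the restructuring: the short-table probe and update now run only when the repeat check fails, instead of unconditionally at the top of the search loop. In outline (Go pseudocode; hashL, hashS and load64 are stand-ins for the emitted hashers and load):

    candidate = lTable[hashL(cv)] // long table stays in the hot path
    lTable[hashL(cv)] = s
    if noRepeatFound {
        candidateS = sTable[hashS(cv)] // short table deferred past the repeat check
        sTable[hashS(cv)] = s
        shortVal = load64(src, candidateS) // "Load short early..."
    }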
{
// Check candidates are ok. All must be < s and < len(src)
assert(func(ok LabelRef) {
@@ -1484,6 +1536,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
}
}

PCALIGN(16)
Label("candidate_match_" + name)
// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
// Extend backwards
@@ -2777,6 +2830,7 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en
}

if minMove <= 16 {
PCALIGN(16)
Label(name + "move_8through16")
if margin < 16 {
MOVQ(Mem{Base: src}, AX)
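
The 8-through-16 path is the classic two-overlapping-qwords copy; assuming that is what the elided remainder of this hunk completes (only the first load is visible above), the scalar Go equivalent is:

    import "encoding/binary"

    // copy8to16 copies n bytes, 8 <= n <= 16, as two possibly overlapping
    // 8-byte moves: the first and the last qword of the range.
    func copy8to16(dst, src []byte, n int) {
        lo := binary.LittleEndian.Uint64(src)
        hi := binary.LittleEndian.Uint64(src[n-8:])
        binary.LittleEndian.PutUint64(dst, lo)
        binary.LittleEndian.PutUint64(dst[n-8:], hi)
    }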
@@ -3555,14 +3609,21 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
}

// LOOP
if !prefetch {
// Triggers https://github.com/golang/go/issues/74648
//PCALIGN(16)
}
Label(name + "_loop")
CMPQ(src, srcLimit)
JAE(LabelRef(name + "_end_copy"))
MOVBQZX(Mem{Base: src}, tag)
MOVQ(tag, value)
SHRQ(U8(2), value)

Label(name + "_loop_nofetch")
if prefetch {
PCALIGN(16)
Label(name + "_loop_nofetch")
}
// Check destination
CMPQ(dst, dstLimit)
JAE(LabelRef(name + "_end_copy"))
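
For orientation, the dispatch in this loop follows the two-low-tag-bits scheme implied by the labels below (_lits, _copy_1, _copy_2, _copy_3); in Go shape, with names invented for the sketch:

    tag := src[s]      // MOVBQZX
    value := tag >> 2  // SHRQ $2
    switch tag & 3 {
    case 0: // "_lits":   literal run, length derived from value
    case 1: // "_copy_1": short copy
    case 2: // "_copy_2": medium copy
    case 3: // "_copy_3": copy 2/3 fused
    }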
@@ -3573,6 +3634,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
JNZ(LabelRef(name + "_copy"))
// TAG 00 Literals
length := GP64()
PCALIGN(16)
Label(name + "_lits")
{
MOVL(value.As32(), length.As32())
@@ -3585,6 +3647,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
JMP(LabelRef(name + "_lit_3")) // Must be 31

// 1 - > 29 literals
PCALIGN(16)
Label(name + "_lit_0")
{
INCQ(src)
@@ -3709,6 +3772,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
JMP(LabelRef(name + "_copy_3"))

// TAG 1 - Copy 1
PCALIGN(16)
Label(name + "_copy_1")
{
if o.inputMargin < 2 {
@@ -3771,6 +3835,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
}

// TAG 2 - Copy 2
PCALIGN(16)
Label(name + "_copy_2")
{
// length = int(src[s-3]) >> 2
@@ -3862,6 +3927,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
}
}
// TAG 3 - Copy 2/3 fused
PCALIGN(16)
Label(name + "_copy_3")
{
if o.inputMargin < 4 {
@@ -4048,6 +4114,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
}
// Length always < 64
copySrc := GP64()
PCALIGN(16)
Label(name + "_copy_exec_short")
{
CMPL(offset.As32(), dstPos.As32())
Expand All @@ -4067,6 +4134,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
o.outputMargin -= 4

// 64 offset, 64 length
PCALIGN(16)
Label(name + "_copy_exec_long_long")
{
MOVQ(dst, copySrc)
@@ -4080,8 +4148,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
}
o.genMemMoveLong64(name+"_copy_long_long", dst, copySrc, length, LabelRef(name+"_copy_done"))
}

// length 4 -> 64, no overlap
// Very hot (16 byte copy mainly)
PCALIGN(16)
Label(name + "_copy_short_no_ol")
{
// Create source pointer with offset
@@ -4097,6 +4167,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
o.genMemMoveShort(name+"_copy_short_no_ol", dst, copySrc, length, LabelRef(name+"_copy_done"), 4)
}
// Offset anything, length anything
PCALIGN(16)
Label(name + "_copy_exec")
{
CMPL(offset.As32(), dstPos.As32())
@@ -4273,3 +4344,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr
Label(name + "_end_copy")
Label(name + "_end_done")
}

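// PCALIGN aligns the next instruction to an n-byte boundary by emitting the
// Go assembler's PCALIGN directive through avo's raw instruction interface.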
func PCALIGN(n int) {
Instruction(&ir.Instruction{
Opcode: "PCALIGN",
Operands: []Op{Imm(uint64(n))},
})
}