Skip to content

Small tuning for write-barrier on arm64 #106191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions src/coreclr/nativeaot/Runtime/arm64/WriteBarriers.S
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,10 @@
2:
// We can skip the card table write if the reference is to
// an object not on the epehemeral segment.
PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_low, x12
cmp \refReg, x12
blo 0f

PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_high, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_low, x12
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw you have used ldp at one point. Is it not profitable here?

Copy link
Member Author

@EgorBo EgorBo Aug 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw no visible benefits from it, while it's possible to use ldp for NativeAOT easily, it's a bit tricky on CoreCLR with its:

    ldr  x12, LOCAL_LABEL(wbs_ephemeral_low)
    ldr  x17, LOCAL_LABEL(wbs_ephemeral_high)

I wasn't sure how to properly use ldp with LOCAL_LABEL 🤔
I tried:

    ldp  x12, x17, LOCAL_LABEL(wbs_ephemeral_low) -- not compiling
    ldp  x12, x17, [sp, LOCAL_LABEL(wbs_ephemeral_low)] -- not compiling

I guess ldp expects some real sp delta rather than label imm. I guess I can calculate & hard-code it, but I simply didn't see any improvements in the benchmarks from that so I gave up

From my understanding, the difference between CoreCLR and NativeAOT is that NativeAOT accesses GC variables directly, while on CoreCLR we have a mechanism that dumps them all to a "patchable pool" next to the Write-barrier function for better cache access so it can access them by label in code

PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_high, x17
cmp \refReg, x12
ccmp \refReg, x17, #0x2, hs
bhs 0f

// Set this objects card, if it has not already been set.
Expand Down Expand Up @@ -222,11 +220,10 @@ LEAF_END RhpByRefAssignRefArm64, _TEXT
LEAF_ENTRY RhpCheckedAssignRefArm64, _TEXT

// is destReg within the heap?
PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_highest_address, x17
cmp x14, x12

PREPARE_EXTERNAL_VAR_INDIRECT g_highest_address, x12
ccmp x14, x12, #0x2, hs
ccmp x14, x17, #0x2, hs
bhs LOCAL_LABEL(NotInHeap)

b C_FUNC(RhpAssignRefArm64)
Expand Down
25 changes: 9 additions & 16 deletions src/coreclr/nativeaot/Runtime/arm64/WriteBarriers.asm
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,10 @@ INVALIDGCVALUE EQU 0xCCCCCCCD
2
;; We can skip the card table write if the reference is to
;; an object not on the epehemeral segment.
PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_low, x12
cmp $refReg, x12
blo %ft0

PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_high, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_low, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_ephemeral_high, x17
cmp $refReg, x12
ccmp $refReg, x17, #0x2, hs
bhs %ft0

;; Set this object's card, if it hasn't already been set.
Expand Down Expand Up @@ -179,14 +177,10 @@ INVALIDGCVALUE EQU 0xCCCCCCCD

;; The "check" of this checked write barrier - is $destReg
;; within the heap? if no, early out.
PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_highest_address, x17
cmp $destReg, x12

PREPARE_EXTERNAL_VAR_INDIRECT g_highest_address, x12

;; If $destReg >= g_lowest_address, compare $destReg to g_highest_address.
;; Otherwise, set the C flag (0x2) to take the next branch.
ccmp $destReg, x12, #0x2, hs
ccmp $destReg, x17, #0x2, hs
bhs %ft0

INSERT_UNCHECKED_WRITE_BARRIER_CORE $destReg, $refReg
Expand Down Expand Up @@ -237,11 +231,10 @@ INVALIDGCVALUE EQU 0xCCCCCCCD
LEAF_ENTRY RhpCheckedAssignRefArm64

;; is destReg within the heap?
PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_lowest_address, x12
PREPARE_EXTERNAL_VAR_INDIRECT g_highest_address, x17
cmp x14, x12

PREPARE_EXTERNAL_VAR_INDIRECT g_highest_address, x12
ccmp x14, x12, #0x2, hs
ccmp x14, x17, #0x2, hs
blo RhpAssignRefArm64

NotInHeap
Expand Down
75 changes: 32 additions & 43 deletions src/coreclr/vm/arm64/patchedcode.S
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ LEAF_ENTRY JIT_PatchedCodeStart, _TEXT
ret lr
LEAF_END JIT_PatchedCodeStart, _TEXT

//-----------------------------------------------------------------------------
// void JIT_ByRefWriteBarrier
// On entry:
// x13 : the source address (points to object reference to write)
Expand All @@ -40,7 +41,7 @@ LEAF_END JIT_PatchedCodeStart, _TEXT
// x13 : incremented by 8
// x14 : incremented by 8
// x15 : trashed
// x17 : trashed (ip1) if FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
// x17 : trashed (ip1)
//
// NOTE: Keep in sync with RBM_CALLEE_TRASH_WRITEBARRIER_BYREF and RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF
// if you add more trashed registers.
Expand All @@ -63,20 +64,16 @@ WRITE_BARRIER_END JIT_ByRefWriteBarrier
// x12 : trashed
// x14 : trashed (incremented by 8 to implement JIT_ByRefWriteBarrier contract)
// x15 : trashed
// x17 : trashed (ip1) if FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
// x17 : trashed (ip1)
//
// NOTE: Keep in sync with RBM_CALLEE_TRASH_WRITEBARRIER_BYREF and RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF
// if you add more trashed registers.
//
WRITE_BARRIER_ENTRY JIT_CheckedWriteBarrier
ldr x12, LOCAL_LABEL(wbs_lowest_address)
ldr x17, LOCAL_LABEL(wbs_highest_address)
cmp x14, x12

ldr x12, LOCAL_LABEL(wbs_highest_address)

// Compare against the upper bound if the previous comparison indicated
// that the destination address is greater than or equal to the lower
// bound. Otherwise, set the C flag (specified by the 0x2) so that the
// branch below is not taken.
ccmp x14, x12, #0x2, hs

ccmp x14, x17, #0x2, hs
bhs LOCAL_LABEL(NotInHeap)

b C_FUNC(JIT_WriteBarrier)
Expand All @@ -86,6 +83,7 @@ LOCAL_LABEL(NotInHeap):
ret lr
WRITE_BARRIER_END JIT_CheckedWriteBarrier

//-----------------------------------------------------------------------------
// void JIT_WriteBarrier(Object** dst, Object* src)
// On entry:
// x14 : the destination address (LHS of the assignment)
Expand All @@ -95,7 +93,10 @@ WRITE_BARRIER_END JIT_CheckedWriteBarrier
// x12 : trashed
// x14 : trashed (incremented by 8 to implement JIT_ByRefWriteBarrier contract)
// x15 : trashed
// x17 : trashed (ip1) if FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
// x17 : trashed (ip1)
//
// NOTE: Keep in sync with RBM_CALLEE_TRASH_WRITEBARRIER_BYREF and RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF
// if you add more trashed registers.
//
WRITE_BARRIER_ENTRY JIT_WriteBarrier
stlr x15, [x14]
Expand All @@ -105,20 +106,17 @@ WRITE_BARRIER_ENTRY JIT_WriteBarrier

// Do not perform the work if g_GCShadow is 0
ldr x12, LOCAL_LABEL(wbs_GCShadow)
cbz x12, LOCAL_LABEL(ShadowUpdateDisabled)

// need temporary register. Save before using.
str x13, [sp, #-16]!
cbz x12, LOCAL_LABEL(ShadowUpdateEnd)

// Compute address of shadow heap location:
// pShadow = g_GCShadow + (x14 - g_lowest_address)
ldr x13, LOCAL_LABEL(wbs_lowest_address)
sub x13, x14, x13
add x12, x13, x12
ldr x17, LOCAL_LABEL(wbs_lowest_address)
sub x17, x14, x17
add x12, x17, x12

// if (pShadow >= g_GCShadowEnd) goto end
ldr x13, LOCAL_LABEL(wbs_GCShadowEnd)
cmp x12, x13
ldr x17, LOCAL_LABEL(wbs_GCShadowEnd)
cmp x12, x17
bhs LOCAL_LABEL(ShadowUpdateEnd)

// *pShadow = x15
Expand All @@ -129,18 +127,16 @@ WRITE_BARRIER_ENTRY JIT_WriteBarrier
dmb ish

// if ([x14] == x15) goto end
ldr x13, [x14]
cmp x13, x15
ldr x17, [x14]
cmp x17, x15
beq LOCAL_LABEL(ShadowUpdateEnd)

// *pShadow = INVALIDGCVALUE (0xcccccccd)
movz x13, #0xcccd
movk x13, #0xcccc, LSL #16
str x13, [x12]
movz x17, #0xcccd
movk x17, #0xcccc, LSL #16
str x17, [x12]

LOCAL_LABEL(ShadowUpdateEnd):
ldr x13, [sp], #16
LOCAL_LABEL(ShadowUpdateDisabled):
#endif

#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
Expand All @@ -156,30 +152,20 @@ LOCAL_LABEL(ShadowUpdateDisabled):

LOCAL_LABEL(CheckCardTable):
// Branch to Exit if the reference is not in the Gen0 heap
//
ldr x12, LOCAL_LABEL(wbs_ephemeral_low)
cbz x12, LOCAL_LABEL(SkipEphemeralCheck)
cmp x15, x12

ldr x12, LOCAL_LABEL(wbs_ephemeral_high)

// Compare against the upper bound if the previous comparison indicated
// that the destination address is greater than or equal to the lower
// bound. Otherwise, set the C flag (specified by the 0x2) so that the
// branch to exit is taken.
ccmp x15, x12, #0x2, hs

ldr x17, LOCAL_LABEL(wbs_ephemeral_high)
cmp x15, x12
ccmp x15, x17, #0x2, hs
bhs LOCAL_LABEL(Exit)

LOCAL_LABEL(SkipEphemeralCheck):
// Check if we need to update the card table
ldr x12, LOCAL_LABEL(wbs_card_table)
add x15, x12, x14, lsr #11
ldrb w12, [x15]
cmp x12, 0xFF
beq LOCAL_LABEL(Exit)

LOCAL_LABEL(UpdateCardTable):
// Update the card table
mov x12, 0xFF
strb w12, [x15]

Expand All @@ -191,12 +177,15 @@ LOCAL_LABEL(UpdateCardTable):
cmp x12, 0xFF
beq LOCAL_LABEL(Exit)

LOCAL_LABEL(UpdateCardBundle):
// Update the card bundle
mov x12, 0xFF
strb w12, [x15]
#endif

LOCAL_LABEL(Exit):
// Increment by 8 to implement JIT_ByRefWriteBarrier contract.
// TODO: Consider duplicating the logic to get rid of this redundant 'add'
// for JIT_WriteBarrier/JIT_CheckedWriteBarrier
add x14, x14, 8
ret lr
WRITE_BARRIER_END JIT_WriteBarrier
Expand Down
Loading
Loading