Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,35 @@ void init_vector_mem_state(amdgpu::Wavefront &wf, amdgpu::VectorMemState &d) {
d.cu_path = wf.cu().full_path();
}

// Checks whether `addr`, with its lane bit-field removed, lands inside the
// 4 GiB window that starts at the wavefront's scratch base (also with the lane
// bit-field removed), and optionally rewrites it into a per-lane scratch
// address.
//
// The lane bit-field is `(wf_size - 1)` shifted left by 51 for 64-wide
// wavefronts and by 52 for 32-wide wavefronts.
//
// @param wf          Wavefront supplying scratch_lane_size(), wf_size() and
//                    scratch_base().
// @param addr        Candidate address to decode.
// @param translated  Optional output. When non-null and the decode succeeds it
//                    receives scratch_base + lane * scratch_lane_size + offset,
//                    where `lane` is the value read from the lane bit-field of
//                    `addr` and `offset` is the distance of the lane-stripped
//                    address from the lane-stripped scratch base.
// @return false when scratch_lane_size() is zero, when the lane-stripped
//         address lies below the lane-stripped scratch base, or when it lies
//         more than 0xFFFF'FFFF bytes above it; true otherwise.
bool decode_flat_private_address(amdgpu::Wavefront &wf, uint64_t addr, uint64_t *translated) {
  const uint32_t bytes_per_lane = wf.scratch_lane_size();
  if (bytes_per_lane == 0) {
    return false;
  }

  const uint32_t lane_count = wf.wf_size();
  assert(lane_count == 32 || lane_count == 64);
  const uint32_t field_shift = (lane_count == 64) ? 51 : 52;
  const uint64_t field_mask = static_cast<uint64_t>(lane_count - 1) << field_shift;

  // Compare address and base with the lane bit-field cleared on both sides.
  const uint64_t base = wf.scratch_base();
  const uint64_t window_start = base & ~field_mask;
  const uint64_t stripped_addr = addr & ~field_mask;

  const bool below_window = stripped_addr < window_start;
  if (below_window || (stripped_addr - window_start) > 0xFFFF'FFFFULL) {
    return false;
  }

  // Callers that only want the yes/no answer pass a null output pointer.
  if (translated == nullptr) {
    return true;
  }

  const uint64_t offset = stripped_addr - window_start;
  const uint32_t lane = static_cast<uint32_t>((addr & field_mask) >> field_shift);
  *translated = base + static_cast<uint64_t>(lane) * bytes_per_lane + offset;
  return true;
}

template <typename Inst>
void flat_global_calculate_addresses(const Inst &inst, amdgpu::Wavefront &wf,
amdgpu::VectorMemState &d) {
amdgpu::VectorMemState &d, bool decode_flat_private) {
auto &cu = wf.cu();
init_vector_mem_state(wf, d);
uint64_t exec = d.exec_mask;
Expand All @@ -106,10 +132,13 @@ void flat_global_calculate_addresses(const Inst &inst, amdgpu::Wavefront &wf,
(static_cast<uint64_t>(cu.read_vgpr(vbase + 1, lane)) << 32) | cu.read_vgpr(vbase, lane);
}
uint64_t addr = saddr_val + vaddr + offset;
uint32_t priv_hi = static_cast<uint32_t>(wf.private_aperture_base() >> 32);
if (priv_hi != 0 && static_cast<uint32_t>(addr >> 32) == priv_hi) {
uint64_t lane_base = wf.scratch_base() + static_cast<uint64_t>(lane) * wf.scratch_lane_size();
addr = lane_base + (addr & 0xFFFFFFFFULL);
if (decode_flat_private) {
uint64_t translated = 0;
if (decode_flat_private_address(wf, addr, &translated))
addr = translated;
} else {
assert(!decode_flat_private_address(wf, addr, nullptr) &&
"gfx1250 global memory address must not use flat private scratch encoding");
}
d.per_lane_addr[lane] = addr;
}
Expand All @@ -136,12 +165,12 @@ uint64_t smem_calculate_address(const SmemMachineInst &inst, amdgpu::Wavefront &

void flat_calculate_addresses(const VflatMachineInst &inst, amdgpu::Wavefront &wf,
amdgpu::VectorMemState &d) {
flat_global_calculate_addresses(inst, wf, d);
flat_global_calculate_addresses(inst, wf, d, true);
}

void flat_calculate_addresses(const VglobalMachineInst &inst, amdgpu::Wavefront &wf,
amdgpu::VectorMemState &d) {
flat_global_calculate_addresses(inst, wf, d);
flat_global_calculate_addresses(inst, wf, d, false);
}

void flat_calculate_addresses(const VscratchMachineInst &inst, amdgpu::Wavefront &wf,
Expand Down
57 changes: 57 additions & 0 deletions emulation/rocjitsu/tests/shared_infra_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "rocjitsu/isa/arch/amdgpu/gfx1250/addr_calc.h"
#include "rocjitsu/isa/arch/amdgpu/gfx1250/isa.h"
#include "rocjitsu/isa/arch/amdgpu/gfx1250/machine_insts.h"
#include "rocjitsu/isa/arch/amdgpu/gfx1250/operand.h"
#include "rocjitsu/isa/arch/amdgpu/gfx1250/operand_types.h"
#include "rocjitsu/isa/arch/amdgpu/rdna2/isa.h"
#include "rocjitsu/isa/arch/amdgpu/rdna3/addr_calc.h"
Expand Down Expand Up @@ -1244,6 +1245,62 @@ TEST(RdnaAddrCalcTest, Rdna4Saddr7cCoversGlobalFlatAndScratch) {
EXPECT_EQ(d.per_lane_addr[1], 0x6'0000'7110ULL);
}

// Verifies that gfx1250 flat address calculation rewrites addresses carrying a
// lane index at bit 52 into `scratch base + lane * lane size + offset`.
TEST(Gfx1250AddrCalcTest, FlatPrivateScratchDecodesLaneBits) {
  // Bring up a single-slot gfx1250 compute unit backed by fresh memory and L2.
  amdgpu::GpuMemory gpu_mem("gfx1250_flat_private_mem");
  amdgpu::L2Cache l2_cache("gfx1250_flat_private_l2");
  amdgpu::ComputeUnitCore::Config config{};
  config.arch = ROCJITSU_CODE_ARCH_GFX1250;
  config.num_wf_slots = 1;
  config.sgprs_per_wf = 128;
  config.vgprs_per_wf = 32;
  config.lds_size_kb = 64;
  auto compute_unit =
      amdgpu::ComputeUnitCore::create("gfx1250_flat_private_cu", config, &gpu_mem, &l2_cache);
  ASSERT_NE(compute_unit, nullptr);

  auto *wave = compute_unit->dispatch_wf(0, 0, 128, 32);
  ASSERT_NE(wave, nullptr);
  ASSERT_EQ(wave->wf_size(), 32u);
  wave->set_exec(0x7ULL);

  // Scratch configuration shared by the address setup and the expectations.
  constexpr uint64_t kScratchBase = 0x0002'0000'0000'0000ULL;
  constexpr uint64_t kPrivateBase = 0x0007'0000'0000'0000ULL;
  constexpr uint32_t kPrivateSegmentSize = 0x80;
  constexpr uint32_t kLaneFieldShift = 52;
  constexpr uint32_t kLanesUnderTest = 3;
  wave->set_scratch_base(kScratchBase);
  wave->set_scratch_lane_size(kPrivateSegmentSize);
  wave->set_apertures(0x0001'0000'0000'0000ULL, 0x0001'0000'ffff'ffffULL, kPrivateBase,
                      kPrivateBase + 0xffff'ffffULL);

  // The flat-scratch-base source operand must read back the configured base.
  gfx1250::Operand scratch_base_operand(
      64, gfx1250::OperandType::OPR_SRC,
      static_cast<int>(gfx1250::OpSelSrc::OPR_SRC_SRC_FLAT_SCRATCH_BASE_LO));
  ASSERT_EQ(scratch_base_operand.read_scalar64(*wave), kScratchBase);

  // One offset per lane; the third one exceeds kPrivateSegmentSize.
  const uint64_t private_offsets[] = {0x10, 0x14, kPrivateSegmentSize + 0x20};
  uint32_t vgpr_base = wave->vgpr_alloc().base;
  for (uint32_t lane = 0; lane < kLanesUnderTest; ++lane) {
    // Encode the lane index at bit 52 on top of the scratch base and offset,
    // then split the 64-bit address across two consecutive VGPRs.
    uint64_t lane_field = static_cast<uint64_t>(lane) << kLaneFieldShift;
    uint64_t encoded_addr = kScratchBase + lane_field + private_offsets[lane];
    uint32_t addr_lo = static_cast<uint32_t>(encoded_addr);
    uint32_t addr_hi = static_cast<uint32_t>(encoded_addr >> 32);
    compute_unit->write_vgpr(vgpr_base, lane, addr_lo);
    compute_unit->write_vgpr(vgpr_base + 1, lane, addr_hi);
  }

  gfx1250::VflatMachineInst inst{};
  inst.saddr = gfx1250::OPR_SREG_NULL;
  inst.vaddr = 0;
  inst.ioffset = 4;

  amdgpu::VectorMemState d(amdgpu::GLOBAL_MEM);
  gfx1250::flat_calculate_addresses(inst, *wave, d);

  for (uint32_t lane = 0; lane < kLanesUnderTest; ++lane) {
    // The instruction's immediate offset is part of the per-lane offset.
    uint64_t offset_with_imm = private_offsets[lane] + inst.ioffset;
    uint64_t lane_start = kScratchBase + static_cast<uint64_t>(lane) * kPrivateSegmentSize;
    uint64_t expected = lane_start + offset_with_imm;
    EXPECT_EQ(d.per_lane_addr[lane], expected) << "lane " << lane;
  }
}

TEST(RdnaAddrCalcTest, Rdna4SmemSoffsetHandlesNullM0AndSgprSelectors) {
amdgpu::GpuMemory mem("rdna4_smem_addr_mem");
amdgpu::L2Cache l2("rdna4_smem_addr_l2");
Expand Down
Loading