10 changes: 9 additions & 1 deletion rpcs3/Emu/Cell/Modules/cellGem.cpp
@@ -2284,6 +2284,8 @@ error_code cellGemClearStatusFlags(u32 gem_num, u64 mask)

error_code cellGemConvertVideoFinish(ppu_thread& ppu)
{
ppu.state += cpu_flag::wait;

cellGem.warning("cellGemConvertVideoFinish()");

auto& gem = g_fxo->get<gem_config>();
@@ -2306,8 +2308,10 @@ error_code cellGemConvertVideoFinish(ppu_thread& ppu)
return CELL_OK;
}

error_code cellGemConvertVideoStart(vm::cptr<void> video_frame)
error_code cellGemConvertVideoStart(ppu_thread& ppu, vm::cptr<void> video_frame)
{
ppu.state += cpu_flag::wait;

cellGem.warning("cellGemConvertVideoStart(video_frame=*0x%x)", video_frame);

auto& gem = g_fxo->get<gem_config>();
@@ -2459,6 +2463,8 @@ error_code cellGemEnableMagnetometer2(u32 gem_num, u32 enable)

error_code cellGemEnd(ppu_thread& ppu)
{
ppu.state += cpu_flag::wait;

cellGem.warning("cellGemEnd()");

auto& gem = g_fxo->get<gem_config>();
@@ -3603,6 +3609,8 @@ error_code cellGemTrackHues(vm::cptr<u32> req_hues, vm::ptr<u32> res_hues)

error_code cellGemUpdateFinish(ppu_thread& ppu)
{
ppu.state += cpu_flag::wait;

cellGem.warning("cellGemUpdateFinish()");

auto& gem = g_fxo->get<gem_config>();
262 changes: 256 additions & 6 deletions rpcs3/Emu/Cell/SPUThread.cpp
@@ -490,6 +490,11 @@ namespace vm

namespace spu
{
struct raw_spu_atomic_info_t
{
std::array<atomic_t<spu_atomic_op_info_for_group>, 8> raw_atomic_ops;
};

namespace scheduler
{
std::array<atomic_t<u8>, 65536> atomic_instruction_table = {};
@@ -4699,6 +4704,159 @@ u32 evaluate_spin_optimization(std::span<u8> stats, u64 evaluate_time, const cfg
return busy_waiting_switch;
}

inline u8 spu_to_index(const spu_thread* spu) noexcept
{
return spu->group ? (spu->lv2_id >> 24) : spu->lv2_id;
}

inline std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& get_spu_atomic_op_info(const spu_thread* spu) noexcept
{
return spu->group ? spu->group->atomic_ops : g_fxo->get<spu::raw_spu_atomic_info_t>().raw_atomic_ops;
}

// To be used by GETLLAR
// Returns non-zero if the caller needs to wait
int test_and_update_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc)
{
auto info = spu_info[index].load();

if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
{
if (info.addr % 128)
{
info.addr &= -128;
spu_info[index].release(info);
return 0;
}

// Repeated GETLLAR: disable entry
}

info = {};

spu_info[index].release(info);

for (usz i = 0; i < spu_info.size(); i++)
Contributor:

I think we can abuse vector ops for this sequence and gain implicit atomicity. Have the SPU info as a struct of arrays instead of an array of structs; then you can load all of them at once and (ab)use vector ops on the result to figure out how much overlap there is.

On x86 at least, vector ops are atomic as long as they are naturally aligned, so we basically get that for free.
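A minimal sketch of what that struct-of-arrays layout and vector compare could look like (not part of the diff; the type and function names are hypothetical, it assumes each entry holds a u32 addr and a u32 getllar as the surrounding code implies, and the aligned 256-bit loads are only atomic in practice on current x86 parts, not by ISA guarantee):

#include <bit>
#include <cstdint>
#include <immintrin.h>

// Hypothetical struct-of-arrays replacement for std::array<atomic_t<spu_atomic_op_info_for_group>, 8>
struct alignas(64) spu_atomic_op_info_soa
{
	std::uint32_t addr[8];    // reservation address per SPU slot (low 7 bits reused as flags)
	std::uint32_t getllar[8]; // PC of the GETLLAR that created each entry
};

// Counts how many slots target the same 128-byte line as raddr with the same GETLLAR PC,
// using one aligned 256-bit load per field and lane-wise compares (requires AVX2)
inline int count_overlaps(const spu_atomic_op_info_soa& info, std::uint32_t raddr, std::uint32_t getllar_pc)
{
	const __m256i addrs = _mm256_load_si256(reinterpret_cast<const __m256i*>(info.addr));
	const __m256i pcs   = _mm256_load_si256(reinterpret_cast<const __m256i*>(info.getllar));

	// Compare 128-byte line numbers, ignoring the low 7 flag bits
	const __m256i same_line = _mm256_cmpeq_epi32(_mm256_and_si256(addrs, _mm256_set1_epi32(~127)), _mm256_set1_epi32(static_cast<int>(raddr & ~127u)));
	const __m256i same_pc   = _mm256_cmpeq_epi32(pcs, _mm256_set1_epi32(static_cast<int>(getllar_pc)));

	const int hit_mask = _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_and_si256(same_line, same_pc)));
	return std::popcount(static_cast<unsigned>(hit_mask));
}

With such a layout, test_and_update_atomic_op_info could bail out immediately when the mask is zero, which would also give the fast path asked for in the comment further down.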

{
info = spu_info[i].load();

if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
{
int wait = 0;

spu_info[i].fetch_op([&](spu_atomic_op_info_for_group& value)
{
wait = 0;

if (value.addr / 128 == raddr / 128 && value.getllar == getllar_pc)
{
if (value.addr % 128 == 0)
{
wait = 2;
return false;
}

if (value.addr & (1u << index))
{
value.addr &= ~(1u << index);
wait = 1;
return true;
}
}

return false;
});

if (wait)
{
return wait;
}
}
}

return 0;
}

// To be used when PUTLLC finishes to create a temporary barrier until the SPURS loop restarts
void downgrade_to_temporary_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc)
{
auto info = spu_info[index].load();

if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
{
info.addr |= 127;
spu_info[index].release(info);
return;
}

info = {};
spu_info[index].release(info);
}

void release_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index)
{
spu_info[index].release(spu_atomic_op_info_for_group{});
}

// To be used when a PUTLLC is initiated
// Returns non-zero if the caller needs to wait
int init_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc)
{
// Initially store a locked entry with a temporary lock
spu_atomic_op_info_for_group info{};
info.addr = raddr | 127;
info.getllar = getllar_pc;

spu_info[index].release(info);

for (usz i = 0; i < spu_info.size(); i++)
{
if (i == index)
{
continue;
}

info = spu_info[i].load();

if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
{
int wait = 0;

spu_info[i].fetch_op([&](spu_atomic_op_info_for_group& value)
{
wait = 0;

if (value.addr / 128 == raddr / 128 && value.getllar == getllar_pc)
{
if (value.addr % 128 == 0)
{
wait = 2;
return false;
}

if (value.addr & (1u << index))
{
value.addr &= ~(1u << index);
wait = 1;
return true;
}
}

return false;
});

return wait;
}
}

// If exclusive, upgrade to full lock
info.addr = raddr;
info.getllar = getllar_pc;
spu_info[index].store(info);

return 0;
}

bool spu_thread::process_mfc_cmd()
{
// Stall infinitely if MFC queue is full
@@ -4772,7 +4930,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr != addr)
{
// Last check for event before we replace the reservation with a new one
if (reservation_check(raddr, rdata))
if (~ch_events.load().events & SPU_EVENT_LR && reservation_check(raddr, rdata, addr))
{
set_events(SPU_EVENT_LR);
}
@@ -5015,11 +5173,50 @@ bool spu_thread::process_mfc_cmd()
last_getllar = pc;
last_gtsc = perf0.get();
}
else
{
last_getllar = pc;
}

last_getllar_addr = addr;
getllar_spin_count = 0;
getllar_busy_waiting_switch = umax;

if (ch_mfc_cmd.eal == spurs_addr)
{
u64 timeout = 0;

while (true)
{
const int wait = test_and_update_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), ch_mfc_cmd.eal, last_getllar);

if (!wait)
{
break;
}

const u64 current = get_system_time();
Contributor:

I've pointed it out before, but get_system_time is unreasonably heavy. Prefer the TSC unless real-world precise values are required.

A general note: the spu_info logic (test_and_update_atomic_op_info) is quite heavy-handed with all the atomic ops and may eat into performance. The biggest issue I see is that there is no fast path through this calling sequence (or the corresponding one below). Yes, SPURS itself is almost always going to be running task groups, but we also observe that in most games the parallel misses themselves aren't too bad on modern processors, though I agree we need something more sophisticated than the quick hack that was the preferred-threads option. This is all theory, of course; we'll just have to see whether it ends up being worth the overhead with the big hitters like RDR, TLOU, or the Killzone titles.
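A rough illustration of the TSC suggestion (not part of the diff): the lazy deadline arming used in the loop here could run on raw rdtsc values instead of get_system_time(). The names spin_deadline_passed and tsc_ticks_per_us are hypothetical, and calibrating the TSC frequency is assumed to happen once elsewhere:

#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

// Returns true once the spin budget has elapsed. The deadline is armed lazily on the first
// call, mirroring the "if (!timeout)" pattern in the diff, but measured in TSC ticks so the
// hot path never queries the system clock.
inline bool spin_deadline_passed(std::uint64_t& deadline_tsc, std::uint64_t budget_us, std::uint64_t tsc_ticks_per_us)
{
	const std::uint64_t now = __rdtsc();

	if (!deadline_tsc)
	{
		deadline_tsc = now + budget_us * tsc_ticks_per_us;
		return false;
	}

	return now >= deadline_tsc;
}

The caller would pass the same g_cfg.core.spu_delay_penalty * 1000 microsecond budget that the diff currently adds on top of get_system_time().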


if (!timeout)
{
timeout = current + g_cfg.core.spu_delay_penalty * 1000;
}
else if (current >= timeout)
{
break;
}

if (wait == 2)
{
std::this_thread::yield();
}
else
{
busy_wait(50000);
}
}
}

u64 ntime = 0;
rsx::reservation_lock rsx_lock(addr, 128);

@@ -5232,6 +5429,41 @@ bool spu_thread::process_mfc_cmd()
}
}

if (ch_mfc_cmd.eal == spurs_addr)
{
u64 timeout = 0;

while (true)
{
const int wait = init_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), raddr, last_getllar);

if (!wait)
{
break;
}

const u64 current = get_system_time();

if (!timeout)
{
timeout = current + g_cfg.core.spu_delay_penalty * 1000;
}
else if (current >= timeout)
{
break;
}

if (wait == 2)
{
std::this_thread::yield();
}
else
{
busy_wait(50000);
}
}
}

if (do_putllc(ch_mfc_cmd))
{
ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS);
@@ -5299,6 +5531,7 @@ bool spu_thread::process_mfc_cmd()
std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.lsa & 0x3ff80), 128);
}

downgrade_to_temporary_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), raddr, last_getllar);
static_cast<void>(test_stopped());
return true;
}
@@ -5482,7 +5715,7 @@ bool spu_thread::process_mfc_cmd()
ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
}

bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data, u32 current_eal) const
{
if (!addr)
{
@@ -5501,9 +5734,24 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
return !cmp_rdata(data, *vm::get_super_ptr<decltype(rdata)>(addr));
}

if ((addr >> 20) == (current_eal >> 20))
{
if (vm::check_addr(addr, vm::page_1m_size))
{
// Same random-access-memory page as the current MFC command, assume allocated
return !cmp_rdata(data, vm::_ref<decltype(rdata)>(addr));
}

if ((addr >> 16) == (current_eal >> 16) && vm::check_addr(addr, vm::page_64k_size))
{
// Same random-access-memory page as the current MFC command, assume allocated
return !cmp_rdata(data, vm::_ref<decltype(rdata)>(addr));
}
}

// Ensure data is allocated (HACK: would raise LR event if not)
// Set range_lock first optimistically
range_lock->store(u64{128} << 32 | addr);
range_lock->store(u64{128} << 32 | addr | vm::range_readable);

u64 lock_val = *std::prev(std::end(vm::g_range_lock_set));
u64 old_lock = 0;
@@ -5579,12 +5827,12 @@ bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_
if ((addr >> 28) < 2 || (addr >> 28) == 0xd)
{
// Always-allocated memory does not need strict checking (vm::main or vm::stack)
return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) == hash;
return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) != hash;
}

// Ensure data is allocated (HACK: would raise LR event if not)
// Set range_lock first optimistically
range_lock->store(u64{128} << 32 | addr);
range_lock->store(u64{128} << 32 | addr | vm::range_readable);

u64 lock_val = *std::prev(std::end(vm::g_range_lock_set));
u64 old_lock = 0;
@@ -6165,7 +6413,9 @@ s64 spu_thread::get_ch_value(u32 ch)

eventstat_busy_waiting_switch = value ? 1 : 0;
}


release_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this));

for (bool is_first = true; !events.count; events = get_events(mask1 & ~SPU_EVENT_LR, true, true), is_first = false)
{
const auto old = +state;