SPU: SPURS oriented thread waiting #17646
|
|
@@ -490,6 +490,11 @@ namespace vm | |
|
|
||
| namespace spu | ||
| { | ||
| struct raw_spu_atomic_info_t | ||
| { | ||
| std::array<atomic_t<spu_atomic_op_info_for_group>, 8> raw_atomic_ops; | ||
| }; | ||
|
|
||
| namespace scheduler | ||
| { | ||
| std::array<atomic_t<u8>, 65536> atomic_instruction_table = {}; | ||
|
|
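The declaration of spu_atomic_op_info_for_group itself is outside this hunk. Judging from how it is used further down (addr compared at 128-byte granularity, its low bits set and cleared per SPU, and getllar compared against the GETLLAR PC), it presumably packs two 32-bit fields so the whole entry fits in a lock-free atomic_t. The following is an inference from that usage, not the PR's actual declaration:

```cpp
// Inferred sketch only: one entry per SPU in a group.
struct spu_atomic_op_info_for_group
{
	u32 addr;    // reservation address; the low bits (addr % 128) are reused as per-SPU "temporary lock" flags
	u32 getllar; // PC of the GETLLAR instruction that set up the reservation
};
```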
@@ -4699,6 +4704,159 @@ u32 evaluate_spin_optimization(std::span<u8> stats, u64 evaluate_time, const cfg | |
| return busy_waiting_switch; | ||
| } | ||
|
|
||
| inline u8 spu_to_index(const spu_thread* spu) noexcept | ||
| { | ||
| return spu->group ? (spu->lv2_id >> 24) : spu->lv2_id; | ||
| } | ||
|
|
||
| inline std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& get_spu_atomic_op_info(const spu_thread* spu) noexcept | ||
| { | ||
| return spu->group ? spu->group->atomic_ops : g_fxo->get<spu::raw_spu_atomic_info_t>().raw_atomic_ops; | ||
| } | ||
|
|
||
| // To be used by GETLLAR | ||
| // Returns non-zero if the caller needs to wait | ||
| int test_and_update_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc) | ||
| { | ||
| auto info = spu_info[index].load(); | ||
|
|
||
| if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc) | ||
| { | ||
| if (info.addr % 128) | ||
| { | ||
| info.addr &= -128; | ||
| spu_info[index].release(info); | ||
| return 0; | ||
| } | ||
|
|
||
| // Repeated GETLLAR: disable entry | ||
| } | ||
|
|
||
| info = {}; | ||
|
|
||
| spu_info[index].release(info); | ||
|
|
||
| for (usz i = 0; i < spu_info.size(); i++) | ||
| { | ||
| info = spu_info[i].load(); | ||
|
|
||
| if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc) | ||
| { | ||
| int wait = 0; | ||
|
|
||
| spu_info[i].fetch_op([&](spu_atomic_op_info_for_group& value) | ||
| { | ||
| wait = 0; | ||
|
|
||
| if (value.addr / 128 == raddr / 128 && value.getllar == getllar_pc) | ||
| { | ||
| if (value.addr % 128 == 0) | ||
| { | ||
| wait = 2; | ||
| return false; | ||
| } | ||
|
|
||
| if (value.addr & (1u << index)) | ||
| { | ||
| value.addr &= ~(1u << index); | ||
| wait = 1; | ||
| return true; | ||
| } | ||
| } | ||
|
|
||
| return false; | ||
| }); | ||
|
|
||
| if (wait) | ||
| { | ||
| return wait; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
| // To be used when PUTLLC finishes to create a temporary barrier until the SPURS loop restarts | ||
| void downgrade_to_temporary_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc) | ||
| { | ||
| auto info = spu_info[index].load(); | ||
|
|
||
| if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc) | ||
| { | ||
| info.addr |= 127; | ||
| spu_info[index].release(info); | ||
| return; | ||
| } | ||
|
|
||
| info = {}; | ||
| spu_info[index].release(info); | ||
| } | ||
|
|
||
| void release_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index) | ||
| { | ||
| spu_info[index].release(spu_atomic_op_info_for_group{}); | ||
| } | ||
|
|
||
| // To be used when a PUTLLC is initiated | ||
| // Returns non-zero if the caller needs to wait | ||
| int init_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc) | ||
| { | ||
| // Initially store a locked entry with a temporary lock | ||
| spu_atomic_op_info_for_group info{}; | ||
| info.addr = raddr | 127; | ||
| info.getllar = getllar_pc; | ||
|
|
||
| spu_info[index].release(info); | ||
|
|
||
| for (usz i = 0; i < spu_info.size(); i++) | ||
| { | ||
| if (i == index) | ||
| { | ||
| continue; | ||
| } | ||
|
|
||
| info = spu_info[i].load(); | ||
|
|
||
| if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc) | ||
| { | ||
| int wait = 0; | ||
|
|
||
| spu_info[i].fetch_op([&](spu_atomic_op_info_for_group& value) | ||
| { | ||
| wait = 0; | ||
|
|
||
| if (value.addr / 128 == raddr / 128 && value.getllar == getllar_pc) | ||
| { | ||
| if (value.addr % 128 == 0) | ||
| { | ||
| wait = 2; | ||
| return false; | ||
| } | ||
|
|
||
| if (value.addr & (1u << index)) | ||
| { | ||
| value.addr &= ~(1u << index); | ||
| wait = 1; | ||
| return true; | ||
| } | ||
| } | ||
|
|
||
| return false; | ||
| }); | ||
|
|
||
| return wait; | ||
| } | ||
| } | ||
|
|
||
| // If exclusive, upgrade to full lock | ||
| info.addr = raddr; | ||
| info.getllar = getllar_pc; | ||
| spu_info[index].store(info); | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
| bool spu_thread::process_mfc_cmd() | ||
| { | ||
| // Stall infinitely if MFC queue is full | ||
|
|
@@ -4772,7 +4930,7 @@ bool spu_thread::process_mfc_cmd() | |
| if (raddr != addr) | ||
| { | ||
| // Last check for event before we replace the reservation with a new one | ||
| if (reservation_check(raddr, rdata)) | ||
| if (~ch_events.load().events & SPU_EVENT_LR && reservation_check(raddr, rdata, addr)) | ||
| { | ||
| set_events(SPU_EVENT_LR); | ||
| } | ||
|
|
@@ -5015,11 +5173,50 @@ bool spu_thread::process_mfc_cmd() | |
| last_getllar = pc; | ||
| last_gtsc = perf0.get(); | ||
| } | ||
| else | ||
| { | ||
| last_getllar = pc; | ||
| } | ||
|
|
||
| last_getllar_addr = addr; | ||
| getllar_spin_count = 0; | ||
| getllar_busy_waiting_switch = umax; | ||
|
|
||
| if (ch_mfc_cmd.eal == spurs_addr) | ||
| { | ||
| u64 timeout = 0; | ||
|
|
||
| while (true) | ||
| { | ||
| const int wait = test_and_update_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), ch_mfc_cmd.eal, last_getllar); | ||
|
|
||
| if (!wait) | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| const u64 current = get_system_time(); | ||
|
I've pointed it out before, but get_system_time is unreasonably heavy. Prefer the TSC unless real-world precise values are required.

A general note: the spu_info logic (test_and_update_atomic_op_info) is quite heavy-handed with all the atomic ops and may eat into performance. The biggest issue I see is that there is no fast path through this calling sequence (and the corresponding one below). Yes, SPURS itself is going to be almost always running task groups, but we also observe that in most games the parallel misses themselves aren't too bad on modern processors, though I agree we need something more sophisticated than the quick hack that was the preferred-threads option.
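A minimal sketch of the TSC-based deadline the reviewer suggests, assuming the x86 __rdtsc intrinsic and a hypothetical g_tsc_freq calibrated at startup (neither is part of this PR; in practice RPCS3's own helpers would be used):

```cpp
#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

// Hypothetical, calibrated once at startup: TSC ticks per second.
extern std::uint64_t g_tsc_freq;

// Turn the configured delay penalty (milliseconds) into a TSC deadline so the
// wait loop never has to call into the OS clock while spinning.
inline std::uint64_t make_tsc_deadline(std::uint64_t penalty_ms)
{
	return __rdtsc() + penalty_ms * g_tsc_freq / 1000;
}

inline bool tsc_deadline_passed(std::uint64_t deadline)
{
	return __rdtsc() >= deadline;
}
```

The wait loops here and in the PUTLLC path below would then compute the deadline once and test it with tsc_deadline_passed, instead of comparing get_system_time() results on every iteration.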
||
|
|
||
| if (!timeout) | ||
| { | ||
| timeout = current + g_cfg.core.spu_delay_penalty * 1000; | ||
| } | ||
| else if (current >= timeout) | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| if (wait == 2) | ||
| { | ||
| std::this_thread::yield(); | ||
| } | ||
| else | ||
| { | ||
| busy_wait(50000); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| u64 ntime = 0; | ||
| rsx::reservation_lock rsx_lock(addr, 128); | ||
|
|
||
|
|
@@ -5232,6 +5429,41 @@ bool spu_thread::process_mfc_cmd() | |
| } | ||
| } | ||
|
|
||
| if (ch_mfc_cmd.eal == spurs_addr) | ||
| { | ||
| u64 timeout = 0; | ||
|
|
||
| while (true) | ||
| { | ||
| const int wait = init_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), raddr, last_getllar); | ||
|
|
||
| if (!wait) | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| const u64 current = get_system_time(); | ||
|
|
||
| if (!timeout) | ||
| { | ||
| timeout = current + g_cfg.core.spu_delay_penalty * 1000; | ||
| } | ||
| else if (current >= timeout) | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| if (wait == 2) | ||
| { | ||
| std::this_thread::yield(); | ||
| } | ||
| else | ||
| { | ||
| busy_wait(50000); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (do_putllc(ch_mfc_cmd)) | ||
| { | ||
| ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS); | ||
|
|
@@ -5299,6 +5531,7 @@ bool spu_thread::process_mfc_cmd() | |
| std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.lsa & 0x3ff80), 128); | ||
| } | ||
|
|
||
| downgrade_to_temporary_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), raddr, last_getllar); | ||
| static_cast<void>(test_stopped()); | ||
| return true; | ||
| } | ||
|
|
@@ -5482,7 +5715,7 @@ bool spu_thread::process_mfc_cmd() | |
| ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size); | ||
| } | ||
|
|
||
| bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const | ||
| bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data, u32 current_eal) const | ||
| { | ||
| if (!addr) | ||
| { | ||
|
|
@@ -5501,9 +5734,24 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const | |
| return !cmp_rdata(data, *vm::get_super_ptr<decltype(rdata)>(addr)); | ||
| } | ||
|
|
||
| if ((addr >> 20) == (current_eal >> 20)) | ||
| { | ||
| if (vm::check_addr(addr, vm::page_1m_size)) | ||
| { | ||
| // Same 1MB page as the current MFC command's effective address, assume allocated | ||
| return !cmp_rdata(data, vm::_ref<decltype(rdata)>(addr)); | ||
| } | ||
|
|
||
| if ((addr >> 16) == (current_eal >> 16) && vm::check_addr(addr, vm::page_64k_size)) | ||
| { | ||
| // Same 64KB page as the current MFC command's effective address, assume allocated | ||
| return !cmp_rdata(data, vm::_ref<decltype(rdata)>(addr)); | ||
| } | ||
| } | ||
|
|
||
| // Ensure data is allocated (HACK: would raise LR event if not) | ||
| // Set range_lock first optimistically | ||
| range_lock->store(u64{128} << 32 | addr); | ||
| range_lock->store(u64{128} << 32 | addr | vm::range_readable); | ||
|
|
||
| u64 lock_val = *std::prev(std::end(vm::g_range_lock_set)); | ||
| u64 old_lock = 0; | ||
|
|
@@ -5579,12 +5827,12 @@ bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_ | |
| if ((addr >> 28) < 2 || (addr >> 28) == 0xd) | ||
| { | ||
| // Always-allocated memory does not need strict checking (vm::main or vm::stack) | ||
| return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) == hash; | ||
| return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) != hash; | ||
| } | ||
|
|
||
| // Ensure data is allocated (HACK: would raise LR event if not) | ||
| // Set range_lock first optimistically | ||
| range_lock->store(u64{128} << 32 | addr); | ||
| range_lock->store(u64{128} << 32 | addr | vm::range_readable); | ||
|
|
||
| u64 lock_val = *std::prev(std::end(vm::g_range_lock_set)); | ||
| u64 old_lock = 0; | ||
|
|
@@ -6165,7 +6413,9 @@ s64 spu_thread::get_ch_value(u32 ch) | |
|
|
||
| eventstat_busy_waiting_switch = value ? 1 : 0; | ||
| } | ||
|
|
||
|
|
||
| release_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this)); | ||
|
|
||
| for (bool is_first = true; !events.count; events = get_events(mask1 & ~SPU_EVENT_LR, true, true), is_first = false) | ||
| { | ||
| const auto old = +state; | ||
|
|
||
I think we can abuse vector ops for this sequence and gain implicit atomicity. Store the SPU info as a structure of arrays instead of an array of structures; then you can load all of the entries at once and (ab)use vector ops to figure out how much overlap there is. On x86 at least, naturally aligned vector loads are atomic, so we basically get that for free.
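A rough sketch of that structure-of-arrays idea, assuming a hypothetical spu_atomic_op_soa layout and SSE intrinsics (neither the type nor the helper below comes from this PR); the per-entry read-modify-write semantics of the current code would still have to be preserved on top of this:

```cpp
#include <bit>
#include <cstdint>
#include <immintrin.h>

// Hypothetical SoA layout: the 8 reservation addresses and GETLLAR PCs of a group
// live in two naturally aligned arrays, so one 16-byte load covers four entries.
struct alignas(64) spu_atomic_op_soa
{
	std::uint32_t addr[8];
	std::uint32_t getllar[8];
};

// Count how many SPUs in the group currently target the same 128-byte line as raddr.
// Each aligned 16-byte load is a single SSE instruction, which is the implicit
// atomicity the reviewer is referring to.
inline int count_line_overlaps(const spu_atomic_op_soa& soa, std::uint32_t raddr)
{
	const __m128i line = _mm_set1_epi32(static_cast<int>(raddr & ~127u));
	const __m128i mask = _mm_set1_epi32(static_cast<int>(~127u));

	int overlaps = 0;

	for (int i = 0; i < 8; i += 4)
	{
		const __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(soa.addr + i));
		const __m128i eq = _mm_cmpeq_epi32(_mm_and_si128(v, mask), line);
		overlaps += std::popcount(static_cast<unsigned>(_mm_movemask_ps(_mm_castsi128_ps(eq))));
	}

	return overlaps;
}
```

Whether two 16-byte loads (or one 32-byte load, which is not architecturally guaranteed to be atomic) are acceptable here depends on how much cross-entry consistency the waiting logic actually needs.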