@@ -471,7 +471,6 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t,
   SupportedArch syscall_arch = t->detect_syscall_arch();
   t->canonicalize_regs(syscall_arch);
   if (!process_syscall_entry(t, step_state, result, syscall_arch)) {
-    step_state->continue_type = RecordSession::DONT_CONTINUE;
     return;
   }
   *did_enter_syscall = true;
@@ -508,6 +507,8 @@ static void seccomp_trap_done(RecordTask* t) {
                (uint8_t)1);
 }
 
+extern void disarm_desched_event(RecordTask *t);
+extern void leave_syscallbuf(RecordTask *t);
 static void handle_seccomp_trap(RecordTask* t,
                                 RecordSession::StepState* step_state,
                                 uint16_t seccomp_data) {
@@ -542,27 +543,21 @@ static void handle_seccomp_trap(RecordTask* t,
     }
   }
 
-  if (t->is_in_untraced_syscall()) {
-    ASSERT(t, !t->delay_syscallbuf_reset_for_seccomp_trap);
-    // Don't reset the syscallbuf immediately after delivering the trap. We have
-    // to wait until this buffered syscall aborts completely before resetting
-    // the buffer.
-    t->delay_syscallbuf_reset_for_seccomp_trap = true;
-
-    t->push_event(Event::seccomp_trap());
-
+  bool is_untraced_syscall = t->is_in_untraced_syscall();
+  if (is_untraced_syscall) {
     // desched may be armed but we're not going to execute the syscall, let
-    // alone block. If it fires, ignore it.
-    t->write_mem(
-        REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant),
-        (uint8_t)0);
+    // alone block. Disarm the event and if it fires, ignore it.
+    disarm_desched_event(t);
+    leave_syscallbuf(t);
+    r = t->regs();
   }
 
+  t->canonicalize_regs(t->detect_syscall_arch());
   t->push_syscall_event(syscallno);
   t->ev().Syscall().failed_during_preparation = true;
   note_entering_syscall(t);
 
-  if (t->is_in_untraced_syscall() && !syscall_entry_already_recorded) {
+  if (is_untraced_syscall && !syscall_entry_already_recorded) {
     t->record_current_event();
   }
 
@@ -578,10 +573,21 @@ static void handle_seccomp_trap(RecordTask* t,
   si.native_api.si_code = SYS_SECCOMP;
   si.native_api._sifields._sigsys._arch = to_audit_arch(r.arch());
   si.native_api._sifields._sigsys._syscall = syscallno;
+
   // Documentation says that si_call_addr is the address of the syscall
   // instruction, but in tests it's immediately after the syscall
   // instruction.
-  si.native_api._sifields._sigsys._call_addr = t->ip().to_data_ptr<void>();
+  remote_code_ptr seccomp_ip = t->ip();
+
+  /* If we actually deliver this signal, we will fudge the ip value to instead
+     point into the patched-out syscall. The callee may rely on these values
+     matching, so do the same adjustment here. */
+  Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_jump_stub(seccomp_ip, true);
+  if (ps) {
+    seccomp_ip = (ps->patch_addr + (seccomp_ip - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix)).as_int();
+  }
+
+  si.native_api._sifields._sigsys._call_addr = seccomp_ip.to_data_ptr<void>();
   LOG(debug) << "Synthesizing " << si.linux_api;
   t->stash_synthetic_sig(si.linux_api, DETERMINISTIC_SIG);
 
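The ip adjustment in this hunk maps an address inside the extended jump stub back to the corresponding address in the patched-out code, so that si_call_addr agrees with the fudged ip used at delivery time. Below is a minimal sketch of just that arithmetic, assuming a simplified, hypothetical stand-in for Monkeypatcher::patched_syscall whose fields (patch_addr, stub_addr, size, safe_suffix) carry the same meanings as above; it is not rr's actual type.

    #include <cstdint>

    // Hypothetical stand-in for Monkeypatcher::patched_syscall, for illustration only.
    struct PatchInfo {
      uint64_t patch_addr;   // start of the original, patched-out instructions
      uint64_t stub_addr;    // start of the extended jump stub
      uint64_t size;         // total size of the stub
      uint64_t safe_suffix;  // trailing bytes the stub shares with the patch site
    };

    // Translate an ip inside the stub to the equivalent ip inside the patch region,
    // mirroring the seccomp_ip adjustment in the hunk above.
    uint64_t stub_ip_to_patch_ip(const PatchInfo& ps, uint64_t ip_in_stub) {
      uint64_t offset_in_stub = ip_in_stub - ps.stub_addr;
      return ps.patch_addr + offset_in_stub - (ps.size - ps.safe_suffix);
    }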
@@ -591,16 +597,31 @@ static void handle_seccomp_trap(RecordTask* t,
   t->set_regs(r);
   t->maybe_restore_original_syscall_registers();
 
-  if (t->is_in_untraced_syscall()) {
+  if (is_untraced_syscall) {
+    Registers r = t->regs();
+    // Cause kernel processing to skip the syscall
+    r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
+    t->set_regs(r);
+    uintptr_t orig_arg1 = r.arg1();
+
+    // The tracee is currently in the seccomp ptrace-stop or syscall-entry stop.
+    // Advance it to the syscall-exit stop so that when we try to deliver the SIGSYS via
+    // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop.
+    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+    if (t->status().ptrace_event() == PTRACE_EVENT_SECCOMP) {
+      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+    }
+
+    if (t->arch() == aarch64) {
+      r = t->regs();
+      r.set_arg1(orig_arg1);
+      t->set_regs(r);
+    }
+
     // For buffered syscalls, go ahead and record the exit state immediately.
     t->ev().Syscall().state = EXITING_SYSCALL;
     t->record_current_event();
     t->pop_syscall();
-
-    // The tracee is currently in the seccomp ptrace-stop. Advance it to the
-    // syscall-exit stop so that when we try to deliver the SIGSYS via
-    // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop.
-    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
   }
 
   // Don't continue yet. At the next iteration of record_step, if we
@@ -815,12 +836,6 @@ void RecordSession::task_continue(const StepState& step_state) {
   // A task in an emulated ptrace-stop must really stay stopped
   ASSERT(t, !t->emulated_stop_pending);
 
-  bool may_restart = t->at_may_restart_syscall();
-
-  if (may_restart && t->seccomp_bpf_enabled) {
-    LOG(debug) << "PTRACE_SYSCALL to possibly-restarted " << t->ev();
-  }
-
   if (!t->vm()->first_run_event()) {
     t->vm()->set_first_run_event(trace_writer().time());
   }
@@ -892,7 +907,7 @@ void RecordSession::task_continue(const StepState& step_state) {
        makes PTRACE_SYSCALL traps be delivered *before* seccomp RET_TRACE
        traps.
        Detect and handle this. */
-    if (!t->seccomp_bpf_enabled || may_restart ||
+    if (!t->seccomp_bpf_enabled ||
         syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) {
       resume = RESUME_SYSCALL;
     } else {
@@ -1232,6 +1247,17 @@ void RecordSession::syscall_state_changed(RecordTask* t,
         ASSERT(t, t->regs().original_syscallno() == -1);
       }
       rec_did_sigreturn(t);
+
+      /* The inverse of the processing we do during signal delivery - if the IP
+         points into a region that we patched out, move us to the extended jump
+         patch instead. */
+      Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_syscall_patch(t->ip());
+      if (ps) {
+        Registers r = t->regs();
+        r.set_ip((ps->stub_addr + (r.ip() - ps->patch_addr.as_int()).register_value() + (ps->size - ps->safe_suffix)).as_int());
+        t->set_regs(r);
+      }
+
       t->record_current_event();
       t->pop_syscall();
 
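The ip rewrite here is the exact inverse of the delivery-time translation: an ip inside the patch region is moved forward into the extended jump stub. A sketch of that inverse mapping, with the stub/patch geometry passed as plain integers rather than rr's patched_syscall fields (an assumption made purely for illustration):

    #include <cstdint>

    // Inverse of the delivery-time adjustment, as applied on sigreturn:
    // translate an ip in the patched-out region back into the extended jump stub.
    // The four geometry values are assumed to correspond to patch_addr, stub_addr,
    // size and safe_suffix in the hunk above.
    uint64_t patch_ip_to_stub_ip(uint64_t patch_addr, uint64_t stub_addr,
                                 uint64_t size, uint64_t safe_suffix,
                                 uint64_t ip_in_patch) {
      uint64_t offset_in_patch = ip_in_patch - patch_addr;
      return stub_addr + offset_in_patch + (size - safe_suffix);
    }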
@@ -1500,6 +1526,7 @@ static bool inject_handled_signal(RecordTask* t) {
   t->stashed_signal_processed();
 
   int sig = t->ev().Signal().siginfo.si_signo;
+
   do {
     // We are ready to inject our signal.
     // XXX we assume the kernel won't respond by notifying us of a different
@@ -1557,6 +1584,69 @@ static bool inject_handled_signal(RecordTask* t) {
   return true;
 }
 
+static ssize_t get_sigframe_size(SupportedArch arch) {
+  if (is_x86ish(arch)) {
+    // It's somewhat difficult engineering-wise to
+    // compute the sigframe size at compile time,
+    // and it can vary across kernel versions and CPU
+    // microarchitectures. So this size is an overestimate
+    // of the real size(s).
+    //
+    // If this size becomes too small in the
+    // future, and unit tests that use sighandlers
+    // are run with checksumming enabled, then
+    // they can catch errors here.
+    return 1152 /* Overestimate of kernel sigframe */ +
+           128 /* Redzone */ +
+           /* this returns 512 when XSAVE unsupported */
+           xsave_area_size();
+  } else if (arch == aarch64) {
+    return sizeof(ARM64Arch::rt_sigframe) +
+           sizeof(ARM64Arch::user_fpsimd_state);
+  } else {
+    DEBUG_ASSERT(0 && "Add sigframe size for your architecture here");
+    return 0;
+  }
+}
+
+template <typename Arch>
+static remote_ptr<typename Arch::unsigned_long> get_sigframe_ip_ptr(remote_ptr<typename Arch::rt_sigframe> frame_ptr);
+
+template <>
+remote_ptr<ARM64Arch::unsigned_long> get_sigframe_ip_ptr<ARM64Arch>(remote_ptr<ARM64Arch::rt_sigframe> frame_ptr) {
+  return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), regs), pc);
+}
+
+template <>
+remote_ptr<X86Arch::unsigned_long> get_sigframe_ip_ptr<X86Arch>(remote_ptr<X86Arch::rt_sigframe> frame_ptr) {
+  return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), ip);
+}
+
+template <>
+remote_ptr<X64Arch::unsigned_long> get_sigframe_ip_ptr<X64Arch>(remote_ptr<X64Arch::rt_sigframe> frame_ptr) {
+  return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), ip);
+}
+
+template <typename Arch>
+static remote_code_ptr get_sigframe_ip_arch(RecordTask *t, remote_ptr<typename Arch::rt_sigframe> frame_ptr)
+{
+  return t->read_mem(get_sigframe_ip_ptr<Arch>(frame_ptr));
+}
+
+static remote_code_ptr get_sigframe_ip(RecordTask *t, remote_ptr<void> frame_ptr) {
+  RR_ARCH_FUNCTION(get_sigframe_ip_arch, t->arch(), t, frame_ptr.as_int());
+}
+
+template <typename Arch>
+static void set_sigframe_ip_arch(RecordTask *t, remote_ptr<typename Arch::rt_sigframe> frame_ptr, remote_code_ptr ip)
+{
+  t->write_mem(get_sigframe_ip_ptr<Arch>(frame_ptr), (typename Arch::unsigned_long)ip.register_value());
+}
+
+static void set_sigframe_ip(RecordTask *t, remote_ptr<void> frame_ptr, remote_code_ptr ip) {
+  RR_ARCH_FUNCTION(set_sigframe_ip_arch, t->arch(), t, frame_ptr.as_int(), ip);
+}
+
 /**
  * |t| is being delivered a signal, and its state changed.
  * Must call t->stashed_signal_processed() once we're ready to unmask signals.
@@ -1601,26 +1691,37 @@ bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) {
         break;
       }
 
-      if (is_x86ish(t->arch())) {
-        // It's somewhat difficult engineering-wise to
-        // compute the sigframe size at compile time,
-        // and it can vary across kernel versions and CPU
-        // microarchitectures. So this size is an overestimate
-        // of the real size(s).
-        //
-        // If this size becomes too small in the
-        // future, and unit tests that use sighandlers
-        // are run with checksumming enabled, then
-        // they can catch errors here.
-        sigframe_size = 1152 /* Overestimate of kernel sigframe */ +
-                        128 /* Redzone */ +
-                        /* this returns 512 when XSAVE unsupported */
-                        xsave_area_size();
-      } else if (t->arch() == aarch64) {
-        sigframe_size = sizeof(ARM64Arch::rt_sigframe) +
-                        sizeof(ARM64Arch::user_fpsimd_state);
-      } else {
-        DEBUG_ASSERT(0 && "Add sigframe size for your architecture here");
+      sigframe_size = get_sigframe_size(t->arch());
+
+      /*
+       * If we're delivering a signal while in the extended jump patch, pretend we're in the
+       * unpatched code instead. That way, any unwinder that makes use of CFI for unwinding
+       * will see the correct unwind info of the patch site rather than that of the extended
+       * jump patch. The instruction sequence in the original code was of course altered by
+       * the patch, so if the signal handler inspects that, it might get confused. However,
+       * that is already a general problem with our patching strategy, in that the application
+       * is not allowed to read its own code.
+       * Naturally, we need to perform the inverse transformation in sigreturn.
+       *
+       * N.B.: We do this by modifying the sigframe after signal delivery, rather
+       * than modifying the registers during signal delivery, because on some platforms
+       * (e.g. aarch64), the kernel will adjust the pre-signal registers after the signal stop.
+       */
+      remote_ptr<ARM64Arch::rt_sigframe> sigframe = t->regs().sp().cast<ARM64Arch::rt_sigframe>();
+      remote_code_ptr ip = get_sigframe_ip(t, sigframe);
+      Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_jump_stub(ip, true);
+      if (ps) {
+        uint64_t translated_patch_offset = (ip - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix);
+        // We patch out the jump stub with nops, but of course, if we happen to find ourselves
+        // in the middle of the nop sled, we just want to end up at the end of the patch
+        // region.
+        size_t total_patch_region_size = ps->hook->patch_region_length +
+                                         rr::syscall_instruction_length(t->arch());
+        if (translated_patch_offset > total_patch_region_size) {
+          translated_patch_offset = total_patch_region_size;
+        }
+        set_sigframe_ip(t, sigframe, ps->patch_addr.as_int() + translated_patch_offset);
+        LOG(debug) << "Moved ip from extended jump patch to patch area";
       }
 
       t->ev().transform(EV_SIGNAL_HANDLER);
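In this hunk the translated offset is clamped so that an ip landing in the stub's trailing nop sled is redirected to the end of the patch region rather than past it. A small sketch of that clamp-and-translate step, with the stub/patch geometry supplied as assumed plain-integer inputs instead of being read from rr's patched_syscall and hook structures:

    #include <algorithm>
    #include <cstdint>

    // Clamp-and-translate, mirroring the sigframe rewrite above. All geometry
    // parameters are assumed inputs, not rr's actual types.
    uint64_t sigframe_ip_to_patch_ip(uint64_t stub_addr, uint64_t patch_addr,
                                     uint64_t stub_size, uint64_t safe_suffix,
                                     uint64_t patch_region_length,
                                     uint64_t syscall_insn_length,
                                     uint64_t ip_in_stub) {
      uint64_t offset = (ip_in_stub - stub_addr) - (stub_size - safe_suffix);
      uint64_t total_patch_region_size = patch_region_length + syscall_insn_length;
      // An ip anywhere in the trailing nop sled maps to the end of the patch region.
      offset = std::min(offset, total_patch_region_size);
      return patch_addr + offset;
    }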
@@ -1909,32 +2010,22 @@ static bool is_ptrace_any_sysemu(SupportedArch arch, int command)
 bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state,
                                           RecordResult* step_result,
                                           SupportedArch syscall_arch) {
-  if (const RecordTask::StashedSignal* sig = t->stashed_sig_not_synthetic_SIGCHLD()) {
-    // The only four cases where we allow a stashed signal to be pending on
-    // syscall entry are:
-    // -- the signal is a ptrace-related signal, in which case if it's generated
-    // during a blocking syscall, it does not interrupt the syscall
-    // -- rrcall_notify_syscall_hook_exit, which is effectively a noop and
-    // lets us dispatch signals afterward
-    // -- when we're entering a blocking untraced syscall. If it really blocks,
-    // we'll get the desched-signal notification and dispatch our stashed
-    // signal.
-    // -- when we're doing a privileged syscall that's internal to the preload
-    // logic
-    // We do not generally want to have stashed signals pending when we enter
-    // a syscall, because that will execute with a hacked signal mask
-    // (see RecordTask::will_resume_execution) which could make things go wrong.
-    ASSERT(t,
-           t->desched_rec() || is_rrcall_notify_syscall_hook_exit_syscall(
-                                   t->regs().original_syscallno(), t->arch()) ||
-               t->ip() ==
-                   t->vm()
-                       ->privileged_traced_syscall_ip()
-                       .increment_by_syscall_insn_length(t->arch()))
-        << "Stashed signal pending on syscall entry when it shouldn't be: "
-        << sig->siginfo << "; regs=" << t->regs()
-        << "; last_execution_resume=" << t->last_execution_resume()
-        << "; sig ip=" << sig->ip;
+  if (!t->is_in_syscallbuf() && t->stashed_sig_not_synthetic_SIGCHLD()) {
+    // If we have a pending signal, deliver it as if it had happened just before
+    // execution of the syscall instruction. To this end, kick us out of the
+    // current syscall again and set up the registers for a restart. Regular
+    // signal injection will do the rest.
+    LOG(debug) << "Entered syscall, but signal pending - setting up pre-syscall signal delivery";
+    Registers entry_regs = t->regs();
+    Registers r = entry_regs;
+    // Cause kernel processing to skip the syscall
+    r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
+    t->set_regs(r);
+    t->exit_syscall();
+    entry_regs.set_ip(entry_regs.ip().decrement_by_syscall_insn_length(syscall_arch));
+    entry_regs.set_syscallno(entry_regs.original_syscallno());
+    t->set_regs(entry_regs);
+    return false;
   }
 
   // We just entered a syscall.
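The rewind in this hunk restores the pre-syscall register state: the ip is stepped back over the syscall instruction and the syscall-number register is reloaded from original_syscallno, so the kernel re-executes the syscall from scratch once the stashed signal has been delivered. A rough sketch of the same idea, using an illustrative register struct whose field names are assumptions, not rr's Registers API:

    #include <cstdint>

    // Illustrative register file; the field names are hypothetical.
    struct FakeRegs {
      uint64_t ip;
      uint64_t syscallno;           // register the kernel reads the syscall number from
      uint64_t original_syscallno;  // syscall number saved at syscall entry
    };

    // Rewind to "just before the syscall instruction" so the syscall can be
    // restarted after the pending signal has been delivered.
    void rewind_to_syscall_entry(FakeRegs& regs, uint64_t syscall_insn_length) {
      regs.ip -= syscall_insn_length;            // back onto the syscall instruction
      regs.syscallno = regs.original_syscallno;  // restore the number clobbered on entry
    }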