diff --git a/src/runtime/extern.go b/src/runtime/extern.go index afadc3d17ec341..654031f50c6f39 100644 --- a/src/runtime/extern.go +++ b/src/runtime/extern.go @@ -320,3 +320,14 @@ const GOOS string = goos.GOOS // GOARCH is the running program's architecture target: // one of 386, amd64, arm, s390x, and so on. const GOARCH string = goarch.GOARCH + +// occlumentry is the syscall entry address read from the auxiliary vector. +// It is 0 by default and nonzero if the executable was loaded by a TEE +// libOS, for example Occlum; the syscall stubs test it against 0. +var occlumentry uintptr = 0x0 +var teeFlag = false + +// isTeeEnvironment reports whether the program is running in a TEE environment. +func isTeeEnvironment() bool { + return teeFlag +} diff --git a/src/runtime/internal/syscall/asm_linux_amd64.s b/src/runtime/internal/syscall/asm_linux_amd64.s index 3740ef1beb570d..982f631e2fd2fb 100644 --- a/src/runtime/internal/syscall/asm_linux_amd64.s +++ b/src/runtime/internal/syscall/asm_linux_amd64.s @@ -32,7 +32,14 @@ TEXT ·Syscall6(SB),NOSPLIT,$0 MOVQ CX, SI // a2 MOVQ BX, DI // a1 // num already in AX. - SYSCALL + CMPQ runtime·occlumentry(SB), $0x0 + JBE 12(PC) + BYTE $0x48; BYTE $0x8d; BYTE $0x0d; BYTE $0x0a; BYTE $0x00; BYTE $0x00; BYTE $0x00 + MOVQ runtime·occlumentry(SB), R11 + JMP R11 + PXOR X15, X15 + JMP 2(PC) + SYSCALL CMPQ AX, $0xfffffffffffff001 JLS ok NEGQ AX diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 7ff2190876dd1b..5faa10aab6dc94 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -608,7 +608,8 @@ func (h *mheap) sysAlloc(n uintptr, hintList **arenaHint, register bool) (v unsa } // Try to grow the heap at a hint address. - for *hintList != nil { + // In a TEE environment, skip growing the heap at hint addresses. 
+ for !teeFlag && *hintList != nil { hint := *hintList p := hint.addr if hint.down { @@ -662,13 +663,15 @@ func (h *mheap) sysAlloc(n uintptr, hintList **arenaHint, register bool) (v unsa return nil, 0 } - // Create new hints for extending this region. - hint := (*arenaHint)(h.arenaHintAlloc.alloc()) - hint.addr, hint.down = uintptr(v), true - hint.next, mheap_.arenaHints = mheap_.arenaHints, hint - hint = (*arenaHint)(h.arenaHintAlloc.alloc()) - hint.addr = uintptr(v) + size - hint.next, mheap_.arenaHints = mheap_.arenaHints, hint + if !teeFlag { + // Create new hints for extending this region. + hint := (*arenaHint)(h.arenaHintAlloc.alloc()) + hint.addr, hint.down = uintptr(v), true + hint.next, mheap_.arenaHints = mheap_.arenaHints, hint + hint = (*arenaHint)(h.arenaHintAlloc.alloc()) + hint.addr = uintptr(v) + size + hint.next, mheap_.arenaHints = mheap_.arenaHints, hint + } } // Check for bad pointers or pointers we can't use. diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go index 26db4a0cd94459..d2b7916e1c6c28 100644 --- a/src/runtime/os_linux.go +++ b/src/runtime/os_linux.go @@ -219,6 +219,7 @@ const ( _AT_SECURE = 23 // secure mode boolean _AT_RANDOM = 25 // introduced in 2.6.29 _AT_HWCAP2 = 26 // hardware capability bit vector 2 + _AT_OCCLUM = 48 // gnu syscall ABI entry address ) var procAuxv = []byte("/proc/self/auxv\x00") @@ -304,6 +305,10 @@ func sysauxv(auxv []uintptr) int { case _AT_SECURE: secureMode = val == 1 + + case _AT_OCCLUM: + occlumentry = val + teeFlag = val != 0 // a zero entry address means no TEE libOS loaded us } archauxv(tag, val) diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index c7a89ba5363405..5a382105bdd7ce 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -51,7 +51,7 @@ TEXT runtime·exit(SB),NOSPLIT,$0-4 MOVL code+0(FP), DI MOVL $SYS_exit_group, AX - SYSCALL + SYSCALL_ENHANCE RET // func exitThread(wait *atomic.Uint32) @@ -61,7 +61,7 @@ TEXT runtime·exitThread(SB),NOSPLIT,$0-8 MOVL $0, (AX) MOVL $0, DI // exit 
code MOVL $SYS_exit, AX - SYSCALL + SYSCALL_ENHANCE // We may not even have a stack any more. INT $3 JMP 0(PC) @@ -73,7 +73,7 @@ TEXT runtime·open(SB),NOSPLIT,$0-20 MOVL mode+8(FP), DX MOVL perm+12(FP), R10 MOVL $SYS_openat, AX - SYSCALL + SYSCALL_ENHANCE CMPQ AX, $0xfffffffffffff001 JLS 2(PC) MOVL $-1, AX @@ -83,7 +83,7 @@ TEXT runtime·open(SB),NOSPLIT,$0-20 TEXT runtime·closefd(SB),NOSPLIT,$0-12 MOVL fd+0(FP), DI MOVL $SYS_close, AX - SYSCALL + SYSCALL_ENHANCE CMPQ AX, $0xfffffffffffff001 JLS 2(PC) MOVL $-1, AX @@ -95,7 +95,7 @@ TEXT runtime·write1(SB),NOSPLIT,$0-28 MOVQ p+8(FP), SI MOVL n+16(FP), DX MOVL $SYS_write, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET @@ -104,7 +104,7 @@ TEXT runtime·read(SB),NOSPLIT,$0-28 MOVQ p+8(FP), SI MOVL n+16(FP), DX MOVL $SYS_read, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET @@ -113,7 +113,7 @@ TEXT runtime·pipe2(SB),NOSPLIT,$0-20 LEAQ r+8(FP), DI MOVL flags+0(FP), SI MOVL $SYS_pipe2, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, errno+16(FP) RET @@ -131,40 +131,40 @@ TEXT runtime·usleep(SB),NOSPLIT,$16 MOVQ SP, DI MOVL $0, SI MOVL $SYS_nanosleep, AX - SYSCALL + SYSCALL_ENHANCE RET TEXT runtime·gettid(SB),NOSPLIT,$0-4 MOVL $SYS_gettid, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+0(FP) RET TEXT runtime·raise(SB),NOSPLIT,$0 MOVL $SYS_getpid, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, R12 MOVL $SYS_gettid, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, SI // arg 2 tid MOVL R12, DI // arg 1 pid MOVL sig+0(FP), DX // arg 3 MOVL $SYS_tgkill, AX - SYSCALL + SYSCALL_ENHANCE RET TEXT runtime·raiseproc(SB),NOSPLIT,$0 MOVL $SYS_getpid, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, DI // arg 1 pid MOVL sig+0(FP), SI // arg 2 MOVL $SYS_kill, AX - SYSCALL + SYSCALL_ENHANCE RET TEXT ·getpid(SB),NOSPLIT,$0-8 MOVL $SYS_getpid, AX - SYSCALL + SYSCALL_ENHANCE MOVQ AX, ret+0(FP) RET @@ -173,7 +173,7 @@ TEXT ·tgkill(SB),NOSPLIT,$0 MOVQ tid+8(FP), SI MOVQ sig+16(FP), DX MOVL $SYS_tgkill, AX - SYSCALL + SYSCALL_ENHANCE RET TEXT 
runtime·setitimer(SB),NOSPLIT,$0-24 @@ -181,7 +181,7 @@ TEXT runtime·setitimer(SB),NOSPLIT,$0-24 MOVQ new+8(FP), SI MOVQ old+16(FP), DX MOVL $SYS_setittimer, AX - SYSCALL + SYSCALL_ENHANCE RET TEXT runtime·timer_create(SB),NOSPLIT,$0-28 @@ -189,7 +189,7 @@ TEXT runtime·timer_create(SB),NOSPLIT,$0-28 MOVQ sevp+8(FP), SI MOVQ timerid+16(FP), DX MOVL $SYS_timer_create, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET @@ -199,14 +199,14 @@ TEXT runtime·timer_settime(SB),NOSPLIT,$0-28 MOVQ new+8(FP), DX MOVQ old+16(FP), R10 MOVL $SYS_timer_settime, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET TEXT runtime·timer_delete(SB),NOSPLIT,$0-12 MOVL timerid+0(FP), DI MOVL $SYS_timer_delete, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+8(FP) RET @@ -215,7 +215,7 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-28 MOVQ n+8(FP), SI MOVQ dst+16(FP), DX MOVL $SYS_mincore, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET @@ -282,7 +282,7 @@ ret: RET fallback: MOVQ $SYS_clock_gettime, AX - SYSCALL + SYSCALL_ENHANCE JMP ret TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0-28 @@ -291,7 +291,7 @@ TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0-28 MOVQ old+16(FP), DX MOVL size+24(FP), R10 MOVL $SYS_rt_sigprocmask, AX - SYSCALL + SYSCALL_ENHANCE CMPQ AX, $0xfffffffffffff001 JLS 2(PC) MOVL $0xf1, 0xf1 // crash @@ -303,7 +303,7 @@ TEXT runtime·rt_sigaction(SB),NOSPLIT,$0-36 MOVQ old+16(FP), DX MOVQ size+24(FP), R10 MOVL $SYS_rt_sigaction, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+32(FP) RET @@ -466,7 +466,7 @@ sigtrampnog: // https://gcc.gnu.org/viewcvs/gcc/trunk/libgcc/config/i386/linux-unwind.h?revision=219188&view=markup TEXT runtime·sigreturn(SB),NOSPLIT,$0 MOVQ $SYS_rt_sigreturn, AX - SYSCALL + SYSCALL_ENHANCE INT $3 // not reached TEXT runtime·sysMmap(SB),NOSPLIT,$0 @@ -478,7 +478,7 @@ TEXT runtime·sysMmap(SB),NOSPLIT,$0 MOVL off+28(FP), R9 MOVL $SYS_mmap, AX - SYSCALL + SYSCALL_ENHANCE CMPQ AX, $0xfffffffffffff001 JLS ok NOTQ AX @@ -513,7 +513,7 @@ TEXT runtime·sysMunmap(SB),NOSPLIT,$0 
MOVQ addr+0(FP), DI MOVQ n+8(FP), SI MOVQ $SYS_munmap, AX - SYSCALL + SYSCALL_ENHANCE CMPQ AX, $0xfffffffffffff001 JLS 2(PC) MOVL $0xf1, 0xf1 // crash @@ -537,7 +537,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVQ n+8(FP), SI MOVL flags+16(FP), DX MOVQ $SYS_madvise, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET @@ -551,12 +551,15 @@ TEXT runtime·futex(SB),NOSPLIT,$0 MOVQ addr2+24(FP), R8 MOVL val3+32(FP), R9 MOVL $SYS_futex, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+40(FP) RET // int32 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void)); TEXT runtime·clone(SB),NOSPLIT,$0 + CMPQ runtime·occlumentry(SB), $0x0 + JNE occlum + MOVL flags+0(FP), DI MOVQ stk+8(FP), SI MOVQ $0, DX @@ -620,11 +623,110 @@ nog2: SYSCALL JMP -3(PC) // keep exiting +occlum: + MOVL flags+0(FP), DI + MOVQ stk+8(FP), SI + MOVQ $0, DX + MOVQ $0, R10 + MOVQ $0, R8 + // Copy mp, gp, fn off parent stack for use by child. + // Careful: Linux system call clobbers CX and R11. + MOVQ mp+16(FP), R13 + MOVQ gp+24(FP), R9 + MOVQ fn+32(FP), R12 + CMPQ R13, $0 // m + JEQ occlum_nog1 + CMPQ R9, $0 // g + JEQ occlum_nog1 + LEAQ m_tls(R13), R8 +#ifdef GOOS_android + // Android stores the TLS offset in runtime·tls_g. + SUBQ runtime·tls_g(SB), R8 +#else + ADDQ $8, R8 // ELF wants to use -8(FS) +#endif + ORQ $0x00080000, DI //add flag CLONE_SETTLS(0x00080000) to call clone +occlum_nog1: + // flags + MOVQ DI, -16(SI) + // fn + MOVQ R12, -24(SI) + // gp + MOVQ R9, -32(SI) + // m + MOVQ R13, -40(SI) + // lea 0x23(%rip),%rax + BYTE $0x48; BYTE $0x8d; BYTE $0x05; BYTE $0x23; BYTE $0x00; BYTE $0x00; BYTE $0x00 + // thread entry point + MOVQ AX, -8(SI) + + SUBQ $8, SI + + MOVL $SYS_clone, AX + // BYTE $0xcc + // lea 0xa(%rip),%rcx + BYTE $0x48; BYTE $0x8d; BYTE $0x0d; BYTE $0x0a; BYTE $0x00; BYTE $0x00; BYTE $0x00 + MOVQ runtime·occlumentry(SB), R11 + JMP R11 + + // In parent, return. 
+ MOVL AX, ret+40(FP) + RET + +thread_entrypoint: + // add 8, %rsp + // MOVQ SI, SP + ADDQ $8, SP + + MOVQ SP, SI + + // mov -24(%rsp), %r12 + // fn + BYTE $0x4c; BYTE $0x8b; BYTE $0x64; BYTE $0x24; BYTE $0xe8 + + // mov -32(%rsp), %r9 + // gp + BYTE $0x4c; BYTE $0x8b; BYTE $0x4c; BYTE $0x24; BYTE $0xe0 + + // mov -40(%rsp), %r13 + // m + BYTE $0x4c; BYTE $0x8b; BYTE $0x6c; BYTE $0x24; BYTE $0xd8 + + // BYTE $0xcc + + // If g or m are nil, skip Go-related setup. + CMPQ R13, $0 // m + JEQ occlum_nog2 + CMPQ R9, $0 // g + JEQ occlum_nog2 + + // Initialize m->procid to Linux tid + MOVL $SYS_gettid, AX + SYSCALL_ENHANCE + MOVQ AX, m_procid(R13) + + // In child, set up new stack + get_tls(CX) + MOVQ R13, g_m(R9) + MOVQ R9, g(CX) + MOVQ R9, R14 // set g register + CALL runtime·stackcheck(SB) + +occlum_nog2: + // Call fn + CALL R12 + + // It shouldn't return. If it does, exit that thread. + MOVL $111, DI + MOVL $SYS_exit, AX + SYSCALL_ENHANCE + JMP -3(PC) // keep exiting + TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 MOVQ new+0(FP), DI MOVQ old+8(FP), SI MOVQ $SYS_sigaltstack, AX - SYSCALL + SYSCALL_ENHANCE CMPQ AX, $0xfffffffffffff001 JLS 2(PC) MOVL $0xf1, 0xf1 // crash @@ -641,7 +743,7 @@ TEXT runtime·settls(SB),NOSPLIT,$32 MOVQ DI, SI MOVQ $0x1002, DI // ARCH_SET_FS MOVQ $SYS_arch_prctl, AX - SYSCALL + SYSCALL_ENHANCE CMPQ AX, $0xfffffffffffff001 JLS 2(PC) MOVL $0xf1, 0xf1 // crash @@ -649,7 +751,7 @@ TEXT runtime·settls(SB),NOSPLIT,$32 TEXT runtime·osyield(SB),NOSPLIT,$0 MOVL $SYS_sched_yield, AX - SYSCALL + SYSCALL_ENHANCE RET TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 @@ -657,7 +759,7 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 MOVQ len+8(FP), SI MOVQ buf+16(FP), DX MOVL $SYS_sched_getaffinity, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET @@ -669,7 +771,7 @@ TEXT runtime·access(SB),NOSPLIT,$0 MOVL mode+8(FP), DX MOVL $0, R10 MOVL $SYS_faccessat, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+16(FP) RET @@ -679,7 +781,7 @@ TEXT 
runtime·connect(SB),NOSPLIT,$0-28 MOVQ addr+8(FP), SI MOVL len+16(FP), DX MOVL $SYS_connect, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+24(FP) RET @@ -689,7 +791,7 @@ TEXT runtime·socket(SB),NOSPLIT,$0-20 MOVL typ+4(FP), SI MOVL prot+8(FP), DX MOVL $SYS_socket, AX - SYSCALL + SYSCALL_ENHANCE MOVL AX, ret+16(FP) RET @@ -698,6 +800,6 @@ TEXT runtime·sbrk0(SB),NOSPLIT,$0-8 // Implemented as brk(NULL). MOVQ $0, DI MOVL $SYS_brk, AX - SYSCALL + SYSCALL_ENHANCE MOVQ AX, ret+0(FP) RET diff --git a/src/runtime/textflag.h b/src/runtime/textflag.h index 214075e360ceb6..41cfcd38451681 100644 --- a/src/runtime/textflag.h +++ b/src/runtime/textflag.h @@ -37,3 +37,21 @@ #define TOPFRAME 2048 // Function is an ABI wrapper. #define ABIWRAPPER 4096 + +// SYSCALL_ENHANCE replaces a bare SYSCALL for SGX TEE support: it hooks the +// syscall and calls back into a special glibc or musl syscall stub. +// Step one: load the syscall return address into register RCX; Occlum uses +// RCX to know where to return. +// Step two: jump to the syscall entry address that Occlum provided when the +// Go binary was loaded. 
+// +// The BYTE sequence below encodes the instruction: lea 0xa(%rip),%rcx +#define SYSCALL_ENHANCE \ + CMPQ runtime·occlumentry(SB), $0x0 \ + JBE 12(PC) \ + BYTE $0x48; BYTE $0x8d; BYTE $0x0d; BYTE $0x0a; BYTE $0x00; BYTE $0x00; BYTE $0x00 \ + MOVQ runtime·occlumentry(SB), R11 \ + JMP R11 \ + PXOR X15, X15 \ + JMP 2(PC) \ + SYSCALL diff --git a/src/runtime/time_linux_amd64.s b/src/runtime/time_linux_amd64.s index 1416d232304f39..2867a8a67ca870 100644 --- a/src/runtime/time_linux_amd64.s +++ b/src/runtime/time_linux_amd64.s @@ -77,11 +77,11 @@ ret: fallback: MOVQ $SYS_clock_gettime, AX - SYSCALL + SYSCALL_ENHANCE MOVL $1, DI // CLOCK_MONOTONIC LEAQ 0(SP), SI MOVQ $SYS_clock_gettime, AX - SYSCALL + SYSCALL_ENHANCE JMP ret diff --git a/src/syscall/asm_linux_amd64.s b/src/syscall/asm_linux_amd64.s index 00d6fedc62b775..7e4b267303a0a6 100644 --- a/src/syscall/asm_linux_amd64.s +++ b/src/syscall/asm_linux_amd64.s @@ -21,7 +21,7 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40 MOVQ $0, R9 MOVQ trap+0(FP), AX // syscall entry POPQ R12 // preserve return address - SYSCALL + SYSCALL_ENHANCE PUSHQ R12 CMPQ AX, $0xfffffffffffff001 JLS ok2 @@ -40,7 +40,7 @@ TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48 MOVQ a2+16(FP), SI MOVQ a3+24(FP), DX MOVQ trap+0(FP), AX // syscall entry - SYSCALL + SYSCALL_ENHANCE MOVQ AX, r1+32(FP) MOVQ DX, r2+40(FP) RET @@ -61,7 +61,7 @@ ret: RET fallback: MOVL $SYS_gettimeofday, AX - SYSCALL + SYSCALL_ENHANCE JMP ret ok7: MOVQ $0, err+8(FP) diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go index dcb3d51b5f7298..0d2e10b5ecf48e 100644 --- a/src/syscall/exec_linux.go +++ b/src/syscall/exec_linux.go @@ -172,6 +172,9 @@ type caps struct { data [2]capData } +//go:linkname isTeeEnvironment runtime.isTeeEnvironment +func isTeeEnvironment() bool + // See CAP_TO_INDEX in linux/capability.h: func capToIndex(cap uintptr) uintptr { return cap >> 5 } @@ -289,15 +292,24 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att // No
more allocation or calls of non-assembly functions. runtime_BeforeFork() locked = true - if clone3 != nil { - r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3)) - } else { + // In a TEE environment use vfork instead of clone. vfork takes no + // parameters, so the arguments passed here are ignored. + // Occlum can create a new process via vfork and exec, but not via clone. + if isTeeEnvironment() { + const SYS_VFORK = 58 flags |= uintptr(SIGCHLD) - if runtime.GOARCH == "s390x" { - // On Linux/s390, the first two arguments of clone(2) are swapped. - r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags) + r1, err1 = rawVforkSyscall(SYS_VFORK, flags, 0) + } else { + if clone3 != nil { + r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3)) } else { - r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0) + flags |= uintptr(SIGCHLD) + if runtime.GOARCH == "s390x" { + // On Linux/s390, the first two arguments of clone(2) are swapped. + r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags) + } else { + r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0) + } } } if err1 != 0 || r1 != 0 {