diff --git a/src/execve/exit.c b/src/execve/exit.c index 7d959221..43dec57e 100644 --- a/src/execve/exit.c +++ b/src/execve/exit.c @@ -139,9 +139,14 @@ static int bind_proc_pid_auxv(const Tracee *ptracee) /** * Convert @mappings into load @script statements at the given @cursor - * position. This function returns the new cursor position. + * position. When @is_pic is true the PIE action variants are emitted + * so the loader will let the kernel choose the base address for the + * first segment (avoiding hardcoded EXEC_PIC_ADDRESS / INTERP_PIC_ADDRESS + * that may collide with vdso/kshare guard zones on some ARM64 kernels), + * then MAP_FIXED subsequent segments at the correct offsets. + * This function returns the new cursor position. */ -static void *transcript_mappings(void *cursor, const Mapping *mappings) +static void *transcript_mappings(void *cursor, const Mapping *mappings, bool is_pic) { size_t nb_mappings; size_t i; @@ -151,9 +156,9 @@ static void *transcript_mappings(void *cursor, const Mapping *mappings) LoadStatement *statement = cursor; if ((mappings[i].flags & MAP_ANONYMOUS) != 0) - statement->action = LOAD_ACTION_MMAP_ANON; + statement->action = is_pic ? LOAD_ACTION_MMAP_PIC_ANON : LOAD_ACTION_MMAP_ANON; else - statement->action = LOAD_ACTION_MMAP_FILE; + statement->action = is_pic ? LOAD_ACTION_MMAP_PIC_FILE : LOAD_ACTION_MMAP_FILE; statement->mmap.addr = mappings[i].addr; statement->mmap.length = mappings[i].length; @@ -208,7 +213,9 @@ static int transfer_load_script(Tracee *tracee) needs_executable_stack = (tracee->load_info->needs_executable_stack || ( tracee->load_info->interp != NULL && tracee->load_info->interp->needs_executable_stack)); - + bool exec_is_pic = IS_POSITION_INDENPENDANT(tracee->load_info->elf_header); + bool interp_is_pic = tracee->load_info->interp != NULL + && IS_POSITION_INDENPENDANT(tracee->load_info->interp->elf_header); /* Strings addresses are required to generate the load script, * for "open" actions. Since I want to generate it in one * pass, these strings will be put right below the current @@ -269,7 +276,7 @@ static int transfer_load_script(Tracee *tracee) cursor += LOAD_STATEMENT_SIZE(*statement, open); /* Load script statements: mmap. */ - cursor = transcript_mappings(cursor, tracee->load_info->mappings); + cursor = transcript_mappings(cursor, tracee->load_info->mappings, exec_is_pic); if (tracee->load_info->interp != NULL) { /* Load script statement: open. */ @@ -280,7 +287,7 @@ static int transfer_load_script(Tracee *tracee) cursor += LOAD_STATEMENT_SIZE(*statement, open); /* Load script statements: mmap. */ - cursor = transcript_mappings(cursor, tracee->load_info->interp->mappings); + cursor = transcript_mappings(cursor, tracee->load_info->interp->mappings, interp_is_pic); entry_point = ELF_FIELD(tracee->load_info->interp->elf_header, entry); } diff --git a/src/loader/loader.c b/src/loader/loader.c index 512c6bae..10abd1bb 100644 --- a/src/loader/loader.c +++ b/src/loader/loader.c @@ -43,9 +43,9 @@ # define MMAP_OFFSET_SHIFT 0 #endif -#define FATAL() do { \ - SYSCALL(EXIT, 1, 182); \ - __builtin_unreachable(); \ +#define FATAL() do { \ + SYSCALL(EXIT, 1, 182); \ + __builtin_unreachable(); \ } while (0) #define unlikely(expr) __builtin_expect(!!(expr), 0) @@ -113,6 +113,14 @@ void _start(void *cursor) bool traced = false; bool reset_at_base = true; word_t at_base = 0; + /* PIE relocation: the first segment of each PIE binary is mapped with + * addr=0 to let the kernel choose a conflict-free base. pic_delta is + * the difference between the kernel-chosen base and the original + * planned address; it is applied to all subsequent segments and to + * entry_point / auxv values at startup. */ + word_t pic_delta = 0; + word_t exec_pic_delta = 0; + bool has_interp = false; word_t fd = -1; word_t status; @@ -122,6 +130,9 @@ void _start(void *cursor) switch (stmt->action) { case LOAD_ACTION_OPEN_NEXT: + exec_pic_delta = pic_delta; + pic_delta = 0; + has_interp = true; status = SYSCALL(CLOSE, 1, fd); if (unlikely((int) status < 0)) FATAL(); @@ -160,6 +171,41 @@ void _start(void *cursor) cursor += LOAD_STATEMENT_SIZE(*stmt, mmap); break; + case LOAD_ACTION_MMAP_PIC_FILE: + if (reset_at_base) { + /* First segment of a PIE binary: let the kernel + * choose a conflict-free base address. */ + status = SYSCALL(MMAP, 6, 0, stmt->mmap.length, + stmt->mmap.prot, MAP_PRIVATE, fd, + stmt->mmap.offset >> MMAP_OFFSET_SHIFT); + /* Use IS_ERR_VALUE-style check: on 32-bit targets, + * valid high addresses (e.g. 0xb7...) look negative + * when cast to long; mmap errors are in [-4095,-1]. */ + if (unlikely(status >= (word_t)-4095)) + FATAL(); + pic_delta = status - stmt->mmap.addr; + at_base = status; + reset_at_base = false; + } else { + /* Subsequent segments: MAP_FIXED at delta-adjusted + * address within the kernel-assigned region. */ + word_t adjusted = stmt->mmap.addr + pic_delta; + status = SYSCALL(MMAP, 6, adjusted, stmt->mmap.length, + stmt->mmap.prot, MAP_PRIVATE | MAP_FIXED, fd, + stmt->mmap.offset >> MMAP_OFFSET_SHIFT); + if (unlikely(status != adjusted)) + FATAL(); + } + + if (stmt->mmap.clear_length != 0) { + word_t actual = stmt->mmap.addr + pic_delta; + clear(actual + stmt->mmap.length - stmt->mmap.clear_length, + actual + stmt->mmap.length); + } + + cursor += LOAD_STATEMENT_SIZE(*stmt, mmap); + break; + case LOAD_ACTION_MMAP_ANON: status = SYSCALL(MMAP, 6, stmt->mmap.addr, stmt->mmap.length, stmt->mmap.prot, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0); @@ -169,6 +215,29 @@ void _start(void *cursor) cursor += LOAD_STATEMENT_SIZE(*stmt, mmap); break; + case LOAD_ACTION_MMAP_PIC_ANON: + if (reset_at_base) { + /* First segment (anon) of a PIE binary: let the + * kernel choose the base address. */ + status = SYSCALL(MMAP, 6, 0, stmt->mmap.length, + stmt->mmap.prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + /* IS_ERR_VALUE-style: see LOAD_ACTION_MMAP_PIC_FILE. */ + if (unlikely(status >= (word_t)-4095)) + FATAL(); + pic_delta = status - stmt->mmap.addr; + at_base = status; + reset_at_base = false; + } else { + word_t adjusted = stmt->mmap.addr + pic_delta; + status = SYSCALL(MMAP, 6, adjusted, stmt->mmap.length, + stmt->mmap.prot, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0); + if (unlikely(status != adjusted)) + FATAL(); + } + + cursor += LOAD_STATEMENT_SIZE(*stmt, mmap); + break; + case LOAD_ACTION_MAKE_STACK_EXEC: SYSCALL(MPROTECT, 3, stmt->make_stack_exec.start, 1, @@ -185,6 +254,14 @@ void _start(void *cursor) word_t *cursor2 = (word_t *) stmt->start.stack_pointer; const word_t argc = cursor2[0]; const word_t at_execfn = cursor2[1]; + /* Apply PIE relocation deltas computed at mmap time. + * entry_point comes from the interp (if present) or exec, + * so it uses the current pic_delta. at_phdr and at_entry + * always reference the executable, so they use its delta. */ + const word_t eff_exec_delta = has_interp ? exec_pic_delta : pic_delta; + const word_t actual_entry = stmt->start.entry_point + pic_delta; + const word_t actual_at_phdr = stmt->start.at_phdr + eff_exec_delta; + const word_t actual_at_entry = stmt->start.at_entry + eff_exec_delta; word_t name; status = SYSCALL(CLOSE, 1, fd); @@ -209,7 +286,7 @@ void _start(void *cursor) do { switch (cursor2[0]) { case AT_PHDR: - cursor2[1] = stmt->start.at_phdr; + cursor2[1] = actual_at_phdr; break; case AT_PHENT: @@ -221,7 +298,7 @@ void _start(void *cursor) break; case AT_ENTRY: - cursor2[1] = stmt->start.at_entry; + cursor2[1] = actual_at_entry; break; case AT_BASE: @@ -248,9 +325,9 @@ void _start(void *cursor) if (unlikely(traced)) SYSCALL(EXECVE, 6, 1, stmt->start.stack_pointer, - stmt->start.entry_point, 2, 3, 4); + actual_entry, 2, 3, 4); else - BRANCH(stmt->start.stack_pointer, stmt->start.entry_point); + BRANCH(stmt->start.stack_pointer, actual_entry); FATAL(); } diff --git a/src/loader/script.h b/src/loader/script.h index 6ae76213..b1ddc0bb 100644 --- a/src/loader/script.h +++ b/src/loader/script.h @@ -74,5 +74,11 @@ typedef struct load_statement LoadStatement; #define LOAD_ACTION_MAKE_STACK_EXEC 4 #define LOAD_ACTION_START_TRACED 5 #define LOAD_ACTION_START 6 +/* PIE variants: loader performs mmap(addr=0) on the first segment to obtain a + * kernel-assigned base address, then applies a fixed delta to all subsequent + * segments. This avoids EXEC_PIC_ADDRESS / INTERP_PIC_ADDRESS falling inside + * device-specific protection zones (e.g. ~2 GB vdso guard region on some ARM64 kernels). */ +#define LOAD_ACTION_MMAP_PIC_FILE 7 +#define LOAD_ACTION_MMAP_PIC_ANON 8 #endif /* SCRIPT */ diff --git a/src/tracee/event.c b/src/tracee/event.c index 756aa7eb..7a75c6a3 100644 --- a/src/tracee/event.c +++ b/src/tracee/event.c @@ -404,6 +404,10 @@ int handle_tracee_event(Tracee *tracee, int tracee_status) signal = 0; if (WIFEXITED(tracee_status)) { + /* No vpid==1 guard here (unlike WIFSIGNALED below): this is + * upstream behavior, and normal WIFEXITED ordering has root + * exiting last — there is no bulk-SIGKILL cleanup phase that + * could overwrite the status as with WIFSIGNALED. */ last_exit_status = WEXITSTATUS(tracee_status); VERBOSE(tracee, 1, "vpid %" PRIu64 ": exited with status %d", @@ -411,10 +415,18 @@ int handle_tracee_event(Tracee *tracee, int tracee_status) terminate_tracee(tracee); } else if (WIFSIGNALED(tracee_status)) { + int termsig = WTERMSIG(tracee_status); check_architecture(tracee); + /* Only the root tracee (vpid 1) should determine proot's exit + * code. Child tracees killed during cleanup (e.g. SIGKILL after + * root exits) must not overwrite it — upstream never set + * last_exit_status in WIFSIGNALED at all, and blindly doing so + * causes proot to return 137 when children are reaped. */ + if (tracee->vpid == 1) + last_exit_status = 128 + termsig; VERBOSE(tracee, (int) (tracee->vpid != 1), "vpid %" PRIu64 ": terminated with signal %d", - tracee->vpid, WTERMSIG(tracee_status)); + tracee->vpid, termsig); terminate_tracee(tracee); } else if (WIFSTOPPED(tracee_status)) {