diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 0761a263..b94e7b3f 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -243,10 +243,48 @@ void translate_syscall(Tracee *tracee) tracee->restart_how = PTRACE_SYSCALL; } - /* Set syscall arguments to make it fail - * TODO: More reliable way to make invalid arguments - * For most syscalls we set all args to -1 - * Hoping there is among them invalid request/address/fd/value that will make syscall fail */ + /* Handle syscall rejection when the syscall number can't be modified. + * + * Normal path: proot sets the syscall number to PR_void so the + * kernel runs a harmless no-op, then overrides the return-value + * register at sysexit with the real error code. On some kernels + * the dedicated syscall-number regset is absent/refused (on + * arm64 this is PTRACE_SETREGSET(NT_ARM_SYSTEM_CALL) returning + * EINVAL; see push_specific_regs() in tracee/reg.c which bails + * out before even attempting the general-register push), and + * we land in this workaround branch. + * + * Legacy strategy was to poke all 6 syscall args to -1 and + * re-push the general register state while keeping the syscall + * number set to PR_void, so the kernel still saw an illegal + * syscall number and rejected the call with ENOSYS. That works + * on stock kernels, but on some kernels restarting with the + * syscall-number register set to PR_void triggers a + * non-standard signal delivery path that synthesizes a SIGSEGV + * and kills the tracee before it executes a single user-mode + * instruction. + * + * Correct strategy: restore the original syscall number so the + * kernel actually runs the rejected syscall, and poke all 6 + * args to -1 so the syscall fails naturally inside the kernel + * (EFAULT/EBADF/EINVAL). The real error code is written to the + * return-value register by proot at sysexit. + * + * Known limitation: + * syscalls that ignore arguments (e.g. 
getpid/sync) or take + * fewer than 6 args will not necessarily fail inside the + * kernel, so they will actually execute with whatever state + * the tracee already has. We accept this: (a) the legacy + * "keep sysnum=PR_void" path is strictly worse on affected + * kernels — it kills the tracee with SIGSEGV; (b) -1 in every + * arg slot already traps the overwhelming majority of + * side-effectful syscalls at the kernel's parameter-validation + * stage (EBADF/EFAULT/EINVAL); (c) we have no empirically + * grounded list of syscalls that both reach this suppression + * path and cause harmful side effects when run with poisoned + * args, so a speculative allow/deny list would be dead code. + * The real return value is still overridden at sysexit. */ + poke_reg(tracee, SYSARG_NUM, orig_sysnum); /* restore original sysnum; PR_void in the syscall-number register triggers a non-standard SIGSEGV path on some kernels */ poke_reg(tracee, SYSARG_1, -1); poke_reg(tracee, SYSARG_2, -1); poke_reg(tracee, SYSARG_3, -1); @@ -254,11 +292,6 @@ void translate_syscall(Tracee *tracee) poke_reg(tracee, SYSARG_5, -1); poke_reg(tracee, SYSARG_6, -1); - if (get_sysnum(tracee, ORIGINAL) == PR_brk) { - /* For brk() we pass 0 as first arg; this is used to query value without changing it */ - poke_reg(tracee, SYSARG_1, 0); - } - /* Push regs again without changing syscall */ push_regs_status = push_specific_regs(tracee, false); if (push_regs_status != 0) { diff --git a/src/tracee/reg.c b/src/tracee/reg.c index 3859f8e5..db5b4773 100644 --- a/src/tracee/reg.c +++ b/src/tracee/reg.c @@ -332,12 +332,30 @@ int push_specific_regs(Tracee *tracee, bool including_sysnum) /* Update syscall number if needed. On arm64, a new * subcommand has been added to PTRACE_{S,G}ETREGSET - * to allow write/read of current sycall number. */ + * to allow write/read of current syscall number. 
+ * + * Kernel-capability cache: NT_ARM_SYSTEM_CALL is a + * kernel-global feature — if it is rejected once with + * EINVAL, it will never succeed under the same running + * kernel. We short-circuit subsequent requests so the + * caller (see syscall.c unmodifiable-sysnum workaround) + * hits its fallback path immediately without paying the + * cost of a guaranteed-failing ptrace on every intercepted + * syscall. Only EINVAL is cached: ESRCH/EPERM/EFAULT are + * per-tracee state issues, not kernel capability. Memory + * only (not persisted); a new proot run re-probes. */ + static bool sysnum_regset_unavailable = false; if (including_sysnum && current_sysnum != REG(tracee, ORIGINAL, SYSARG_NUM)) { + if (sysnum_regset_unavailable) { + errno = EINVAL; + return -1; + } regs.iov_base = &current_sysnum; regs.iov_len = sizeof(current_sysnum); status = ptrace(PTRACE_SETREGSET, tracee->pid, NT_ARM_SYSTEM_CALL, &regs); if (status < 0) { + if (errno == EINVAL) + sysnum_regset_unavailable = true; //note(tracee, WARNING, SYSTEM, "can't set the syscall number"); return status; }