//go:build linux || android package proot import ( "errors" "fmt" "io" "os" "path/filepath" "runtime" "strconv" "strings" "sync" "syscall" "time" "sirherobrine23.com.br/go-bds/exec/v2/process" prootext "sirherobrine23.com.br/go-bds/exec/v2/proot/extensions/extensions" ) const atFDCWD = -100 // Linux ptrace tracers must wait with __WALL so clone/vfork/thread stops // generated by PTRACE_O_TRACE* are visible even when they are not normal // children according to wait4(2). Without it, helpers spawned by apt/dpkg can // disappear or stay stopped while the tracer blocks waiting for an event that // is hidden from plain wait4(-1, ...). const waitTraceOptions = syscall.WALL // Keep scratch strings/vectors outside the AMD64 red-zone and similar // architecture/compiler scratch area below the user stack pointer. Most path // rewrites are only needed for the duration of a single syscall, but writing // them immediately below SP can corrupt leaf-function red-zone locals in libc, // dpkg, or apt helpers. A small gap keeps the injected strings away from that // live user stack area without going far enough below SP to commonly hit an // unmapped guard page. const stackScratchGap = 256 const ( processGroupShutdownTimeout = 90 * time.Second processGroupKillGrace = 5 * time.Second ) var errProcessExited = errors.New("process already exited") type nativeProcess struct { process *os.Process pid int pgid int stdin io.Reader stdout, stderr io.Writer stdinW *os.File stdoutR *os.File stderrR *os.File copyWG sync.WaitGroup // stdin-only; never waited by Wait() outputWG sync.WaitGroup childClose []io.Closer parentClose []io.Closer cleanup []func() error stdioStarted bool tracer *tracer done chan error once sync.Once mu sync.Mutex exitCode int exited bool } func startNative(pr *Proot, config prootext.Config, cleanups []func() error, options *process.Exec) (*nativeProcess, error) { pm, err := newPathMapper(config.Rootfs, config.PathResolvers) if err != nil { return nil, err } cwd := options.Cwd if cwd == "" { cwd = "/" } cwd = cleanGuestPath(cwd) hostCwd := pm.GuestToHost(cwd) args := append([]string(nil), options.Arguments...) execPath := args[0] if resolution, handled, err := resolveExecExtensions(config.ExecResolvers, pm, cwd, args); err != nil { return nil, err } else if handled { execPath = resolution.ExecPath args = resolution.Args } else { guestCmd, hostCmd := pm.Translate(cwd, args[0]) if rw, changed := pm.resolveExec(hostCmd, guestCmd, args); changed { execPath = rw.ExecPath args = rw.Argv } else { execPath = hostCmd args[0] = hostCmd } } p := &nativeProcess{done: make(chan error, 1), exitCode: -1, cleanup: cleanups} if pr.proc != nil { p.stdin, p.stdout, p.stderr = pr.proc.stdin, pr.proc.stdout, pr.proc.stderr } if options.Stdin != nil { p.stdin = options.Stdin } if options.Stdout != nil { p.stdout = options.Stdout } if options.Stderr != nil { p.stderr = options.Stderr } files, err := p.prepareFiles() if err != nil { return nil, err } envOverlay := options.Environment if envOverlay == nil { envOverlay = process.Env{} } if lp := pm.libraryPath(); lp != "" { if old := envOverlay["LD_LIBRARY_PATH"]; old != "" { envOverlay["LD_LIBRARY_PATH"] = lp + ":" + old } else { envOverlay["LD_LIBRARY_PATH"] = lp } } envOverlay["PWD"] = cwd env := mergeEnv(os.Environ(), envOverlay) attr := &os.ProcAttr{ Dir: hostCwd, Env: env, Files: files, Sys: &syscall.SysProcAttr{ Ptrace: true, Setpgid: true, }, } proc, err := os.StartProcess(execPath, args, attr) if err != nil { p.closeAllPipes() return nil, err } // The parent must close the child-side pipe descriptors immediately. // Otherwise stdout/stderr copy loops never observe EOF because the tracer // process itself still owns a write end of the pipe. p.closeChildPipes() p.process = proc p.pid = proc.Pid p.pgid = proc.Pid p.startCopyLoops() p.tracer = newTracer(pm, cwd, uint32(pr.UID), uint32(pr.GID)) go func() { runtime.LockOSThread() defer runtime.UnlockOSThread() if options.Context != nil { go func() { <-options.Context.Done() p.interruptThenKillAfter(processGroupKillGrace) }() } err := p.tracer.loop(p.pid) // Only wait for helper processes after a clean ptrace loop. If tracer // setup itself failed (for example a transient ESRCH from // PTRACE_SETOPTIONS), the root can still be stopped under ptrace and // remain visible in the process group forever. Kill that incomplete // tracee tree immediately instead of waiting the normal 90-second // package-manager shutdown window. if err != nil { p.tracer.debugf("ptrace loop failed for root pid=%d: %v; terminating process group", p.pid, err) _ = syscall.Kill(-p.pgid, syscall.SIGKILL) p.reapProcessGroupChildren() } else { // The ptrace root can legitimately disappear before package-manager // helper children in the same process group have finished flushing // stdout/stderr and modifying the rootfs. Do not report process // completion until the whole process group is gone. if waitErr := p.waitProcessGroupGone(processGroupShutdownTimeout); waitErr != nil { err = waitErr } } // Stop the parent-side stdio plumbing before waiting for copy loops. // In the root-gone/detach paths the traced process tree may have already // disappeared without delivering the final pipe EOF in the usual order. // Closing stdout/stderr readers here wakes the copy goroutines so Wait() // cannot hang after the ptrace loop has already completed. p.closeParentInputPipes() p.closeParentOutputPipes() p.outputWG.Wait() err = errors.Join(err, p.cleanupExtensions()) p.mu.Lock() if p.tracer.exitCodeSet { p.exitCode = p.tracer.exitCode } p.exited = true p.mu.Unlock() _ = p.process.Release() p.done <- err }() return p, nil } func resolveExecExtensions(resolvers []prootext.ExecResolver, pm *pathMapper, cwd string, args []string) (prootext.ExecResolution, bool, error) { for i := len(resolvers) - 1; i >= 0; i-- { resolver := resolvers[i] resolution, err := resolver.ResolveExec(prootext.ExecRequest{ Cwd: cwd, Args: append([]string(nil), args...), Translate: func(cwd, name string, mode prootext.PathMode) (string, string) { return pm.TranslateMode(cwd, name, mode) }, }) if err != nil { return prootext.ExecResolution{}, false, err } if resolution.Handled { return resolution, true, nil } } return prootext.ExecResolution{}, false, nil } func (p *nativeProcess) cleanupExtensions() error { var err error for i := len(p.cleanup) - 1; i >= 0; i-- { err = errors.Join(err, p.cleanup[i]()) } p.cleanup = nil return err } func mergeEnv(base []string, overlay map[string]string) []string { idx := make(map[string]int, len(base)) out := append([]string(nil), base...) for i, kv := range out { if eq := strings.IndexByte(kv, '='); eq >= 0 { idx[kv[:eq]] = i } } for k, v := range overlay { kv := k + "=" + v if i, ok := idx[k]; ok { out[i] = kv } else { idx[k] = len(out) out = append(out, kv) } } return out } func (p *nativeProcess) prepareFiles() ([]*os.File, error) { files := []*os.File{os.Stdin, os.Stdout, os.Stderr} if p.stdin != nil { if f, ok := p.stdin.(*os.File); ok { files[0] = f } else { r, w, err := os.Pipe() if err != nil { return nil, err } p.stdinW = w p.childClose = append(p.childClose, r) p.parentClose = append(p.parentClose, w) files[0] = r } } if p.stdout != nil { r, w, err := os.Pipe() if err != nil { return nil, err } p.stdoutR = r p.childClose = append(p.childClose, w) p.parentClose = append(p.parentClose, r) files[1] = w } if p.stderr != nil { r, w, err := os.Pipe() if err != nil { return nil, err } p.stderrR = r p.childClose = append(p.childClose, w) p.parentClose = append(p.parentClose, r) files[2] = w } return files, nil } func (p *nativeProcess) startCopyLoops() { if p.stdioStarted { return } p.stdioStarted = true if p.stdin != nil && p.stdinW != nil { // Do not include stdin in the output wait path. A generic Reader can block // forever waiting for terminal input, and Wait() must report process exit // independently from stdin draining. p.copyWG.Add(1) go func() { defer p.copyWG.Done() _, _ = io.Copy(p.stdinW, p.stdin) _ = p.stdinW.Close() }() } if p.stdout != nil && p.stdoutR != nil { p.outputWG.Add(1) go func() { defer p.outputWG.Done() _, _ = io.Copy(p.stdout, p.stdoutR) }() } if p.stderr != nil && p.stderrR != nil { p.outputWG.Add(1) go func() { defer p.outputWG.Done() _, _ = io.Copy(p.stderr, p.stderrR) }() } } func (p *nativeProcess) waitProcessGroupGone(timeout time.Duration) error { if p.pgid <= 0 { return nil } deadline := time.Now().Add(timeout) interruptSent := false killSent := false for { p.reapProcessGroupChildren() if !p.processGroupExists() { return nil } now := time.Now() switch { case !interruptSent && now.After(deadline): p.tracer.debugf("process group %d still alive after %s; sending SIGINT", p.pgid, timeout) _ = syscall.Kill(-p.pgid, syscall.SIGINT) interruptSent = true deadline = now.Add(processGroupKillGrace) case interruptSent && !killSent && now.After(deadline): p.tracer.debugf("process group %d ignored SIGINT; sending SIGKILL", p.pgid) _ = syscall.Kill(-p.pgid, syscall.SIGKILL) killSent = true deadline = now.Add(time.Second) case killSent && now.After(deadline): p.reapProcessGroupChildren() if p.processGroupExists() { return fmt.Errorf("proot: process group %d did not exit after SIGKILL", p.pgid) } return nil } time.Sleep(20 * time.Millisecond) } } func (p *nativeProcess) processGroupExists() bool { return processGroupExists(p.pgid) } func processGroupExists(pgid int) bool { if pgid <= 0 { return false } live, err := processGroupLiveMembers(pgid) if err == nil { return len(live) > 0 } err = syscall.Kill(-pgid, 0) return err == nil || errors.Is(err, syscall.EPERM) } func processGroupLiveMembers(pgid int) ([]int, error) { entries, err := os.ReadDir("/proc") if err != nil { return nil, err } var live []int for _, entry := range entries { if !entry.Type().IsDir() { continue } pid, err := strconv.Atoi(entry.Name()) if err != nil { continue } procPGID, state, err := readProcStatProcessGroup(pid) if err != nil || procPGID != pgid || state == 'Z' { continue } live = append(live, pid) } return live, nil } func readProcStatProcessGroup(pid int) (int, byte, error) { data, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) if err != nil { return 0, 0, err } end := strings.LastIndexByte(string(data), ')') if end < 0 || end+2 >= len(data) { return 0, 0, fmt.Errorf("invalid /proc/%d/stat", pid) } state := data[end+2] fields := strings.Fields(string(data[end+3:])) if len(fields) < 2 { return 0, 0, fmt.Errorf("invalid /proc/%d/stat", pid) } pgrp, err := strconv.Atoi(fields[1]) if err != nil { return 0, 0, err } return pgrp, state, nil } func (p *nativeProcess) reapProcessGroupChildren() { for { var ws syscall.WaitStatus var ru syscall.Rusage pid, err := syscall.Wait4(-p.pgid, &ws, waitTraceOptions|syscall.WNOHANG, &ru) if err == syscall.EINTR { continue } if err != nil || pid <= 0 { return } if p.tracer != nil { p.tracer.debugf("reaped leftover process-group child pid=%d status=%#x", pid, int(ws)) } } } func (p *nativeProcess) interruptThenKillAfter(grace time.Duration) { _ = p.Signal(os.Interrupt) time.Sleep(grace) if !p.isExited() { _ = p.Kill() } } func (p *nativeProcess) isExited() bool { p.mu.Lock() defer p.mu.Unlock() return p.exited } func (p *nativeProcess) closeChildPipes() { for _, c := range p.childClose { _ = c.Close() } p.childClose = nil } func (p *nativeProcess) closeParentInputPipes() { if p.stdinW != nil { _ = p.stdinW.Close() } } func (p *nativeProcess) closeParentOutputPipes() { if p.stdoutR != nil { _ = p.stdoutR.Close() } if p.stderrR != nil { _ = p.stderrR.Close() } } func (p *nativeProcess) closeAllPipes() { p.closeChildPipes() for _, c := range p.parentClose { _ = c.Close() } p.parentClose = nil } func (p *nativeProcess) Kill() error { if p.process == nil { return errProcessExited } if p.pgid > 0 { return syscall.Kill(-p.pgid, syscall.SIGKILL) } return p.process.Kill() } func (p *nativeProcess) Signal(sig os.Signal) error { if p.process == nil { return errProcessExited } s, ok := sig.(syscall.Signal) if !ok { return fmt.Errorf("unsupported signal %v", sig) } if p.pgid > 0 { return syscall.Kill(-p.pgid, s) } return p.process.Signal(sig) } func (p *nativeProcess) Wait() error { if p.done == nil { return errProcessExited } err := <-p.done p.done = nil return err } func (p *nativeProcess) ExitCode() (int, error) { p.mu.Lock() if p.exited { code := p.exitCode p.mu.Unlock() return code, nil } p.mu.Unlock() if err := p.Wait(); err != nil { return -1, err } p.mu.Lock() defer p.mu.Unlock() return p.exitCode, nil } type tracer struct { pm *pathMapper rootPID int rootCwd string uid uint32 gid uint32 debug bool tracees map[int]*traceeState pendingTracees map[int]*traceeState rootGone bool rootGoneSince time.Time lostTracee bool lostTraceeSince time.Time exitCode int exitCodeSet bool } type traceeState struct { pid int parentPID int inSyscall bool optionsSet bool cwd string scratch uint64 pending *pendingExit creds fakeCreds // Keep the unmodified syscall-entry registers until the corresponding // exit/SIGSYS. On ARM/ARM64 the first argument and return value share r0/x0, // so an emulated exit can otherwise destroy the argument needed by the // outer-seccomp handler. originalRegs syscall.PtraceRegs originalSysno uint64 originalValid bool } type pendingExit struct { kind string guestPath string sourcePath string targetPath string buf uint64 size uint64 fd uint64 } type fakeCreds struct { ruid, euid, suid uint32 rgid, egid, sgid uint32 fsuid, fsgid uint32 groups []uint32 } func newFakeCreds(uid, gid uint32) fakeCreds { return fakeCreds{ ruid: uid, euid: uid, suid: uid, fsuid: uid, rgid: gid, egid: gid, sgid: gid, fsgid: gid, groups: []uint32{gid}, } } func cloneFakeCreds(c fakeCreds) fakeCreds { out := c if c.groups != nil { out.groups = append([]uint32(nil), c.groups...) } return out } func noCredID(v uint32) bool { return v == ^uint32(0) } func (c fakeCreds) uidPrivileged() bool { return c.euid == 0 } func (c fakeCreds) gidPrivileged() bool { return c.euid == 0 } func (c fakeCreds) canUseUID(v uint32) bool { return noCredID(v) || c.uidPrivileged() || v == c.ruid || v == c.euid || v == c.suid } func (c fakeCreds) canUseGID(v uint32) bool { return noCredID(v) || c.gidPrivileged() || v == c.rgid || v == c.egid || v == c.sgid } func (c fakeCreds) canSetUID(vals ...uint32) bool { for _, v := range vals { if !c.canUseUID(v) { return false } } return true } func (c fakeCreds) canSetGID(vals ...uint32) bool { for _, v := range vals { if !c.canUseGID(v) { return false } } return true } func (t *tracer) fakeErrno(st *traceeState, regs *syscall.PtraceRegs, errno syscall.Errno) error { setSysno(regs, sc.getpid) st.pending = &pendingExit{kind: "errno", size: uint64(errno)} return setRegs(st.pid, regs) } func newTracer(pm *pathMapper, cwd string, uid, gid uint32) *tracer { return &tracer{ pm: pm, rootCwd: cleanGuestPath(cwd), uid: uid, gid: gid, debug: os.Getenv("PROOT_GO_DEBUG") == "1", tracees: map[int]*traceeState{}, pendingTracees: map[int]*traceeState{}, exitCode: -1, } } func (t *tracer) debugf(format string, args ...any) { if t.debug { fmt.Fprintf(os.Stderr, "proot-go: "+format+"\n", args...) } } func (t *tracer) addTracee(pid int, parent *traceeState, cwd string) *traceeState { st := &traceeState{pid: pid, cwd: cleanGuestPath(cwd), creds: newFakeCreds(t.uid, t.gid)} if parent != nil { st.parentPID = parent.pid st.cwd = parent.cwd st.creds = cloneFakeCreds(parent.creds) } t.tracees[pid] = st delete(t.pendingTracees, pid) return st } func (t *tracer) queueTracee(pid int, parent *traceeState) { if pid <= 0 { return } // A newly forked task can report its initial ptrace stop before the // parent's PTRACE_EVENT_FORK/CLONE is consumed. In that ordering the task // has already been adopted with the conservative cwd "/". Merge the // parent state when the event finally arrives instead of returning early; // otherwise only some members of a shell pipeline inherit chdir(), e.g. // `cd /usr/share/ca-certificates; find . | sort`. inheritParent := func(st *traceeState) { if parent == nil { return } st.parentPID = parent.pid st.cwd = parent.cwd st.creds = cloneFakeCreds(parent.creds) } if st, ok := t.tracees[pid]; ok { inheritParent(st) t.debugf("pid=%d merged late parent pid=%d cwd=%q", pid, st.parentPID, st.cwd) return } if st, ok := t.pendingTracees[pid]; ok { inheritParent(st) return } st := &traceeState{pid: pid, cwd: "/", creds: newFakeCreds(t.uid, t.gid)} inheritParent(st) t.pendingTracees[pid] = st } func (t *tracer) adoptTracee(pid int) *traceeState { if st := t.pendingTracees[pid]; st != nil { delete(t.pendingTracees, pid) t.tracees[pid] = st return st } return t.addTracee(pid, nil, "/") } func (t *tracer) loop(rootPID int) error { t.rootPID = rootPID var ws syscall.WaitStatus var ru syscall.Rusage pid, err := syscall.Wait4(rootPID, &ws, waitTraceOptions, &ru) if err != nil { return err } root := t.addTracee(pid, nil, t.rootCwd) if ws.Exited() || ws.Signaled() { t.setExit(ws) return t.waitErr(ws) } if err := setPtraceOptionsRetry(pid, true); err != nil { t.debugf("pid=%d initial ptrace options failed: %v", pid, err) _ = syscall.Kill(pid, syscall.SIGKILL) _ = syscall.PtraceCont(pid, int(syscall.SIGKILL)) return err } root.optionsSet = true if !t.resumeTracee(pid, 0) { return syscall.ESRCH } _ = root for len(t.tracees) > 0 { waitOptions := waitTraceOptions if t.rootGone || t.lostTracee { waitOptions |= syscall.WNOHANG } pid, err = syscall.Wait4(-1, &ws, waitOptions, &ru) if err == syscall.EINTR { continue } if pid == 0 { if t.rootGone { t.pruneDeadTracees() if len(t.tracees) == 0 { if !t.exitCodeSet { t.exitCodeSet = true t.exitCode = 0 } break } if time.Since(t.rootGoneSince) > 2*time.Second { t.debugf("root tracee gone; detaching remaining tracees after idle wait: %v", t.traceePids()) t.detachRemainingTracees() if !t.exitCodeSet { t.exitCodeSet = true t.exitCode = 0 } break } time.Sleep(20 * time.Millisecond) continue } if t.lostTracee { t.pruneDeadTracees() if time.Since(t.lostTraceeSince) > 500*time.Millisecond { t.debugf("idle after lost tracee; waking possible waiters: %v", t.traceePids()) t.signalTracees(syscall.SIGCHLD) t.lostTraceeSince = time.Now() } time.Sleep(20 * time.Millisecond) continue } continue } if err != nil { if len(t.tracees) == 0 || err == syscall.ECHILD { break } return err } st := t.tracees[pid] if st == nil { st = t.adoptTracee(pid) } if ws.Exited() || ws.Signaled() { if pid == rootPID { t.setExit(ws) } delete(t.tracees, pid) continue } if !ws.Stopped() { if !t.resumeTracee(pid, 0) { t.markRootGone(pid, rootPID) } continue } if !st.optionsSet { if err := setPtraceOptionsRetry(pid, pid == rootPID); err == nil { st.optionsSet = true } else { t.debugf("pid=%d ptrace options failed: %v", pid, err) if isProcessGoneErr(err) { t.markRootGone(pid, rootPID) continue } } } sig := ws.StopSignal() event := ptraceEvent(ws) if sig == syscall.SIGTRAP && event != 0 { t.handleEvent(pid, st, event) if !t.resumeTracee(pid, 0) { t.markRootGone(pid, rootPID) } continue } if isSyscallStop(ws) { if err := t.handleSyscall(st); err != nil { if isProcessGoneErr(err) { // Some kernels/reporting paths can return ESRCH from PTRACE_PEEKDATA // while a tracee is in the syscall-stop we are handling. Do not // treat that as a fatal translation error by itself: first try to // resume the original syscall so the real parent/children can finish // normally. This is important for apt/dpkg where the root apt // process can briefly report ESRCH while dpkg helpers are still // unpacking packages. if pid == rootPID { t.debugf("pid=%d root syscall translation saw process-gone error, trying resume: %v", pid, err) } else { t.debugf("pid=%d syscall translation saw process-gone error, trying resume: %v", pid, err) } if t.resumeTracee(pid, 0) { continue } t.markRootGone(pid, rootPID) continue } // Do not kill the tracee for a best-effort translation failure; // continue and let the kernel report the original error if possible. t.debugf("pid=%d syscall translation failed: %v", pid, err) } if !t.resumeTracee(pid, 0) { t.markRootGone(pid, rootPID) } continue } if sig == syscall.SIGSYS { handled, err := t.handleSeccompSIGSYS(st) if err != nil { if isProcessGoneErr(err) { t.markRootGone(pid, rootPID) continue } t.debugf("pid=%d failed handling seccomp SIGSYS: %v", pid, err) } else if handled { // SECCOMP_RET_TRAP skips the syscall and reports SIGSYS instead of // producing the normal syscall-exit stop. Suppress the signal and // continue at the instruction following the syscall. if !t.resumeTracee(pid, 0) { t.markRootGone(pid, rootPID) } continue } } // Forward real signals, but do not reinject ptrace's synthetic SIGTRAP/SIGSTOP. forward := int(sig) if sig == syscall.SIGTRAP || sig == syscall.SIGSTOP { forward = 0 } if !t.resumeTracee(pid, forward) { t.markRootGone(pid, rootPID) } } if t.exitCodeSet && t.exitCode != 0 { return fmt.Errorf("exit status %d", t.exitCode) } return nil } func (t *tracer) traceeGone(pid, rootPID int, where string, err error) { t.debugf("pid=%d disappeared %s: %v", pid, where, err) delete(t.tracees, pid) t.markRootGone(pid, rootPID) } func (t *tracer) markRootGone(pid, rootPID int) { if pid != rootPID { return } if processExists(pid) { t.terminateLostRoot(pid, rootPID) } if !t.rootGone { t.rootGone = true t.rootGoneSince = time.Now() } // Once the root tracee has disappeared there is no command owner left to // wait for. Keeping the tracer attached to leftover helpers is unsafe here: // apt/dpkg can leave transient method/sqv/extracttemplate children that have // already notified their real parent, while the tracer no longer has a root // process capable of driving the tree forward. Detach the remaining tracees // immediately so Wait() can return instead of spinning in WNOHANG forever. delete(t.tracees, pid) if len(t.tracees) > 0 { t.debugf("root tracee gone; detaching remaining tracees immediately: %v", t.traceePids()) t.detachRemainingTracees() } // This flag is what makes the main wait loop switch to WNOHANG + pruning. // Without it, a process-gone root can leave the tracer blocked forever in // wait4(-1, __WALL) if one of the queued/adopted helper PIDs never delivers // another wait status. The log that ends with: // // root syscall translation saw process-gone error, trying resume: no such process // disappeared before resume // // hits exactly this path. if !t.rootGone { t.rootGone = true t.rootGoneSince = time.Now() } // If wait4 never delivered the root's final status, fall back to a clean exit. // Real non-zero exits/signals still win when they were observed via setExit(). if !t.exitCodeSet { t.exitCodeSet = true t.exitCode = 0 } } func (t *tracer) pruneDeadTracees() { for pid := range t.tracees { if err := syscall.Kill(pid, 0); err != nil && errors.Is(err, syscall.ESRCH) { t.debugf("pid=%d disappeared while pruning stale tracees", pid) delete(t.tracees, pid) } } } func (t *tracer) terminateLostRoot(pid, pgid int) { t.debugf("root tracee pid=%d is alive but no longer ptrace-controllable; sending SIGINT to process group %d", pid, pgid) _ = syscall.Kill(-pgid, syscall.SIGINT) if err := syscall.PtraceCont(pid, int(syscall.SIGINT)); err != nil { t.debugf("pid=%d ptrace continue with SIGINT failed: %v", pid, err) } time.Sleep(500 * time.Millisecond) if !processGroupExists(pgid) { return } t.debugf("root process group %d still alive after SIGINT; sending SIGKILL", pgid) _ = syscall.Kill(-pgid, syscall.SIGKILL) if err := syscall.PtraceCont(pid, int(syscall.SIGKILL)); err != nil { t.debugf("pid=%d ptrace continue with SIGKILL failed: %v", pid, err) if err := syscall.PtraceDetach(pid); err != nil && !isProcessGoneErr(err) { t.debugf("pid=%d ptrace detach after SIGKILL failed: %v", pid, err) } } } func (t *tracer) traceePids() []int { pids := make([]int, 0, len(t.tracees)) for pid := range t.tracees { pids = append(pids, pid) } return pids } func (t *tracer) detachRemainingTracees() { for pid := range t.tracees { if err := syscall.PtraceDetach(pid); err != nil { if isProcessGoneErr(err) { t.debugf("pid=%d disappeared before detach", pid) } else { t.debugf("pid=%d ptrace detach failed: %v", pid, err) } } delete(t.tracees, pid) } } func (t *tracer) noteLostTracee(pid int) { if !t.lostTracee { t.lostTracee = true } t.lostTraceeSince = time.Now() } func (t *tracer) signalTracees(sig syscall.Signal) { for pid := range t.tracees { if err := syscall.Kill(pid, sig); err != nil && !isProcessGoneErr(err) { t.debugf("pid=%d signal %s failed: %v", pid, sig, err) } } } func (t *tracer) signalTraceeParent(pid int, sig syscall.Signal) { st := t.tracees[pid] if st == nil || st.parentPID == 0 { return } if _, ok := t.tracees[st.parentPID]; !ok { return } if err := syscall.Kill(st.parentPID, sig); err != nil { if !isProcessGoneErr(err) { t.debugf("pid=%d signal parent %d with %s failed: %v", pid, st.parentPID, sig, err) } return } t.debugf("pid=%d disappeared; sent %s to parent pid=%d", pid, sig, st.parentPID) } func (t *tracer) reapGoneTracee(pid int) { // A ptrace resume can return ESRCH after a tracee has already reached its // final wait state. In that case, the real parent can remain blocked in // waitpid() until the tracer consumes that pending ptrace status. Drain any // immediately available status for this pid before forgetting it. for { var ws syscall.WaitStatus var ru syscall.Rusage r, err := syscall.Wait4(pid, &ws, waitTraceOptions|syscall.WNOHANG, &ru) if r == 0 { return } if err != nil { if err != syscall.ECHILD && err != syscall.EINTR { t.debugf("pid=%d reap after ESRCH failed: %v", pid, err) } return } if r != pid { return } if ws.Exited() || ws.Signaled() { t.debugf("pid=%d reaped after ESRCH: status=%#x", pid, int(ws)) delete(t.tracees, pid) return } if ws.Stopped() { // If it was merely stopped, release it; otherwise the real parent may wait // forever for a helper that the tracer no longer intends to manage. sig := 0 if s := ws.StopSignal(); s != syscall.SIGTRAP && s != syscall.SIGSTOP { sig = int(s) } if err := syscall.PtraceSyscall(pid, sig); err != nil && !isProcessGoneErr(err) { t.debugf("pid=%d resume while reaping after ESRCH failed: %v", pid, err) } return } } } func setPtraceOptions(pid int, traceExit bool) error { options := syscall.PTRACE_O_TRACESYSGOOD | syscall.PTRACE_O_TRACEFORK | syscall.PTRACE_O_TRACEVFORK | syscall.PTRACE_O_TRACECLONE | syscall.PTRACE_O_TRACEEXEC // Do not request PTRACE_O_TRACEEXIT for short-lived helper children. The // extra synthetic exit-stop can race with helpers such as `stty -a` spawned // by debconf: PTRACE_PEEKDATA/PTRACE_SYSCALL may report ESRCH before the // real parent observes completion, leaving apt/dpkg waiting for a child that // the tracer effectively consumed. The root tracee is different: enabling // TRACEEXIT only for it gives the tracer one last deterministic stop before // apt itself exits, avoiding the root "process-gone" path that can otherwise // make us terminate before reporting the real wait status. if traceExit { options |= syscall.PTRACE_O_TRACEEXIT } return syscall.PtraceSetOptions(pid, options) } func setPtraceOptionsRetry(pid int, traceExit bool) error { var err error for attempt := 0; attempt < 20; attempt++ { err = setPtraceOptions(pid, traceExit) if err == nil { return nil } if errors.Is(err, syscall.EINTR) { continue } if !isProcessGoneErr(err) || !processExists(pid) { return err } // Android kernels can briefly report ESRCH while the newly exec'd // tracee is transitioning into its first ptrace stop. The task still // exists and remains stopped, so retry the option write on the same // tracee instead of abandoning it. time.Sleep(5 * time.Millisecond) } return err } func isSyscallStop(ws syscall.WaitStatus) bool { return ws.Stopped() && int(ws.StopSignal()) == (int(syscall.SIGTRAP)|0x80) } func ptraceEvent(ws syscall.WaitStatus) int { return int(uint32(ws) >> 16) } func (t *tracer) handleEvent(pid int, st *traceeState, event int) { switch event { case syscall.PTRACE_EVENT_FORK, syscall.PTRACE_EVENT_VFORK, syscall.PTRACE_EVENT_CLONE: msg, err := syscall.PtraceGetEventMsg(pid) if err == nil && msg != 0 { child := int(msg) t.queueTracee(child, st) } case syscall.PTRACE_EVENT_EXIT: // Root-only TRACEEXIT is used as a deterministic pre-exit stop. Do not // set the final status here; resume and let the normal wait status report // the real exit code/signal. case syscall.PTRACE_EVENT_EXEC: // Keep the syscall entry/exit phase unchanged. With PTRACE_SYSCALL, // Linux may report PTRACE_EVENT_EXEC between execve-enter and // execve-exit. Resetting inSyscall here makes the following // execve-exit stop look like a new syscall-enter stop, which then // flips the phase for the freshly loaded dynamic linker. Once the // phase is inverted, the loader's openat() calls for the executable // itself are missed, producing errors such as: // // /usr/lib/apt/methods/http: error while loading shared libraries: // /usr/lib/apt/methods/http: cannot open shared object file // // The normal syscall-exit stop will clear st.inSyscall. } } func (t *tracer) setExit(ws syscall.WaitStatus) { t.exitCodeSet = true if ws.Exited() { t.exitCode = ws.ExitStatus() } else if ws.Signaled() { t.exitCode = 128 + int(ws.Signal()) } else { t.exitCode = -1 } } func (t *tracer) waitErr(ws syscall.WaitStatus) error { if ws.Exited() && ws.ExitStatus() == 0 { return nil } if ws.Exited() { return fmt.Errorf("exit status %d", ws.ExitStatus()) } if ws.Signaled() { return fmt.Errorf("process killed by %s", ws.Signal()) } return nil } func isProcessGoneErr(err error) bool { if err == nil { return false } return errors.Is(err, syscall.ESRCH) || strings.Contains(err.Error(), "no such process") } func processExists(pid int) bool { err := syscall.Kill(pid, 0) return err == nil || errors.Is(err, syscall.EPERM) } func (t *tracer) resumeTracee(pid int, sig int) bool { if err := syscall.PtraceSyscall(pid, sig); err != nil { if isProcessGoneErr(err) { if t.recoverTracee(pid, sig) { return true } t.debugf("pid=%d disappeared before resume", pid) t.reapGoneTracee(pid) t.signalTraceeParent(pid, syscall.SIGCHLD) t.noteLostTracee(pid) } else { t.debugf("pid=%d ptrace resume failed: %v", pid, err) } delete(t.tracees, pid) return false } return true } func (t *tracer) recoverTracee(pid int, sig int) bool { if !processExists(pid) { return false } t.debugf("pid=%d still exists after ptrace ESRCH; trying PTRACE_ATTACH recovery", pid) if err := syscall.PtraceAttach(pid); err != nil { t.debugf("pid=%d ptrace attach recovery failed: %v", pid, err) return false } var ws syscall.WaitStatus var ru syscall.Rusage for { r, err := syscall.Wait4(pid, &ws, waitTraceOptions, &ru) if err == syscall.EINTR { continue } if err != nil { t.debugf("pid=%d wait after ptrace attach recovery failed: %v", pid, err) _ = syscall.PtraceDetach(pid) return false } if r == pid { break } } if !ws.Stopped() { t.debugf("pid=%d was not stopped after ptrace attach recovery: status=%#x", pid, int(ws)) _ = syscall.PtraceDetach(pid) return false } st := t.tracees[pid] if st == nil { st = t.adoptTracee(pid) } st.inSyscall = false st.pending = nil if err := setPtraceOptions(pid, pid == t.rootPID); err != nil { t.debugf("pid=%d ptrace option recovery failed: %v", pid, err) _ = syscall.PtraceDetach(pid) return false } st.optionsSet = true if err := syscall.PtraceSyscall(pid, sig); err != nil { t.debugf("pid=%d ptrace resume after recovery failed: %v", pid, err) _ = syscall.PtraceDetach(pid) return false } t.debugf("pid=%d ptrace recovery succeeded", pid) return true } // isLocallyEmulatedSyscall reports whether onSyscallEnter replaces the // syscall with a harmless host call and synthesizes its guest-visible result. // An outer Android seccomp filter may still report SIGSYS for the original // syscall after that synthetic exit has already been processed. In that case // the result must be preserved instead of being overwritten with ENOSYS. func isLocallyEmulatedSyscall(nr uint64) bool { switch nr { case sc.chown, sc.lchown, sc.fchown, sc.fchownat, sc.getuid, sc.geteuid, sc.getgid, sc.getegid, sc.getresuid, sc.getresgid, sc.getgroups, sc.setuid, sc.setgid, sc.setreuid, sc.setregid, sc.setresuid, sc.setresgid, sc.setgroups, sc.setfsuid, sc.setfsgid, sc.chroot: return nr != noSys default: return false } } func (t *tracer) handleSeccompSIGSYS(st *traceeState) (bool, error) { info, err := ptraceGetSeccompSiginfo(st.pid) if err != nil { return false, err } if info.Code != sysSeccompCode { t.debugf("pid=%d received non-seccomp SIGSYS: si_code=%d", st.pid, info.Code) return false, nil } regs, err := getRegs(st.pid) if err != nil { return false, err } trappedSysno := uint64(uint32(info.Syscall)) logRegs := regs if st.originalValid && st.originalSysno == trappedSysno { // This is the Go equivalent of PRoot's ORIGINAL register snapshot. // In particular, it retains x0/r0 before a synthesized sysexit writes // the return value into the same register. logRegs = &st.originalRegs } name := "syscall" if trappedSysno == sc.setRobust { name = "set_robust_list" } // Some Android policies trap link/linkat instead of returning an errno. // The translated host paths were retained at syscall-entry, so perform the // same backup-copy fallback here and suppress the SIGSYS. if st.originalValid && st.originalSysno == trappedSysno && st.pending != nil && st.pending.kind == "hardlink" { p := st.pending if err := emulateHardLinkFallback(p.sourcePath, p.targetPath, p.size); err != nil { setRetval(regs, -int64(errnoFromError(err))) t.debugf("pid=%d seccomp hardlink fallback failed source=%q target=%q: %v", st.pid, p.sourcePath, p.targetPath, err) } else { setRetval(regs, 0) t.debugf("pid=%d seccomp hardlink fallback copied source=%q target=%q", st.pid, p.sourcePath, p.targetPath) } st.inSyscall = false st.pending = nil st.originalValid = false if err := setRegs(st.pid, regs); err != nil { return false, err } return true, nil } // When the syscall was already emulated by onSyscallEnter, the ordinary // syscall-exit handler may have run before this outer-seccomp SIGSYS. This // is the ordering handled by termux/proot's synthesized-sysexit path. Do not // replace the emulated success/error with ENOSYS. If the exit handler has // not run yet, run it now from the saved pending operation. if st.originalValid && st.originalSysno == trappedSysno && isLocallyEmulatedSyscall(trappedSysno) { if st.pending != nil { st.inSyscall = false if err := t.onSyscallExit(st, regs); err != nil { return false, err } t.debugf("pid=%d synthesized syscall exit for emulated syscall=%d before suppressing SIGSYS", st.pid, trappedSysno) } else { t.debugf("pid=%d preserving completed emulation for syscall=%d while suppressing SIGSYS", st.pid, trappedSysno) } st.inSyscall = false st.pending = nil st.originalValid = false return true, nil } t.debugf("pid=%d handling seccomp SIGSYS for %s=%d as ENOSYS (inSyscall=%t args=%d,%d,%d,%d,%d,%d)", st.pid, name, trappedSysno, st.inSyscall, arg(logRegs, 0), arg(logRegs, 1), arg(logRegs, 2), arg(logRegs, 3), arg(logRegs, 4), arg(logRegs, 5)) // The blocked syscall has no ordinary exit stop to process. Reset the // entry/exit state before resuming, discard any result rewrite associated // with that call, and expose the same ENOSYS fallback used by PRoot for // set_robust_list and unsupported outer-seccomp calls such as rseq. st.inSyscall = false st.pending = nil st.originalValid = false setRetval(regs, -int64(syscall.ENOSYS)) if err := setRegs(st.pid, regs); err != nil { return false, err } return true, nil } func (t *tracer) handleSyscall(st *traceeState) error { regs, err := getRegs(st.pid) if err != nil { return err } if !st.inSyscall { st.inSyscall = true st.scratch = stackPtr(regs) - stackScratchGap st.originalRegs = *regs st.originalSysno = sysno(regs) st.originalValid = true return t.onSyscallEnter(st, regs) } st.inSyscall = false // Keep ORIGINAL until the next syscall entry. Some Android kernels report // the outer-seccomp SIGSYS after the synthesized syscall-exit stop; clearing // it here would lose x0/r0 before the SIGSYS handler can inspect it. return t.onSyscallExit(st, regs) } func (t *tracer) onSyscallEnter(st *traceeState, regs *syscall.PtraceRegs) error { nr := sysno(regs) st.pending = nil switch nr { case sc.execve: return t.translateExecve(st, regs, 0, 1, atFDCWD) case sc.execveat: dirfd := int(int64(arg(regs, 0))) return t.translateExecve(st, regs, 1, 2, dirfd) case sc.open: return t.translateArgPathMode(st, regs, 0, atFDCWD, pathModeFromOpenFlags(arg(regs, 1))) case sc.openat: dirfd := int(int64(arg(regs, 0))) return t.translateArgPathMode(st, regs, 1, dirfd, pathModeFromOpenFlags(arg(regs, 2))) case sc.openat2: dirfd := int(int64(arg(regs, 0))) return t.translateArgPathMode(st, regs, 1, dirfd, t.pathModeFromOpenHow(st, arg(regs, 2))) case sc.access, sc.stat, sc.lstat, sc.statfs, sc.statfs64: return t.translateArgPath(st, regs, 0, atFDCWD) case sc.chmod, sc.truncate, sc.utime, sc.utimes: return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathWrite) case sc.rmdir: return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathDeleteDir) case sc.chown, sc.lchown, sc.fchown, sc.fchownat: setSysno(regs, sc.getpid) st.pending = &pendingExit{kind: "fakezero"} return setRegs(st.pid, regs) case sc.faccessat, sc.faccessat2, sc.newfstatat, sc.fstatat, sc.statx: dirfd := int(int64(arg(regs, 0))) return t.translateArgPath(st, regs, 1, dirfd) case sc.mkdirat, sc.mknodat: dirfd := int(int64(arg(regs, 0))) return t.translateArgPathMode(st, regs, 1, dirfd, prootext.PathCreate) case sc.unlinkat: dirfd := int(int64(arg(regs, 0))) mode := prootext.PathDelete if arg(regs, 2)&0x200 != 0 { mode = prootext.PathDeleteDir } return t.translateArgPathMode(st, regs, 1, dirfd, mode) case sc.fchmodat, sc.utimensat: dirfd := int(int64(arg(regs, 0))) return t.translateArgPathMode(st, regs, 1, dirfd, prootext.PathWrite) case sc.mkdir, sc.mknod: return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathCreate) case sc.unlink: return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathDelete) case sc.rename: if err := t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathWrite); err != nil { return err } return t.translateArgPathMode(st, regs, 1, atFDCWD, prootext.PathCreate) case sc.renameat, sc.renameat2: oldfd := int(int64(arg(regs, 0))) newfd := int(int64(arg(regs, 2))) if err := t.translateArgPathMode(st, regs, 1, oldfd, prootext.PathWrite); err != nil { return err } return t.translateArgPathMode(st, regs, 3, newfd, prootext.PathCreate) case sc.link: return t.translateHardLink(st, regs, 0, atFDCWD, 1, atFDCWD, 0) case sc.linkat: oldfd := int(int64(arg(regs, 0))) newfd := int(int64(arg(regs, 2))) return t.translateHardLink(st, regs, 1, oldfd, 3, newfd, arg(regs, 4)) case sc.symlink: if err := t.translateSymlinkTarget(st, regs, 0); err != nil { return err } return t.translateArgPathMode(st, regs, 1, atFDCWD, prootext.PathCreate) case sc.symlinkat: if err := t.translateSymlinkTarget(st, regs, 0); err != nil { return err } dirfd := int(int64(arg(regs, 1))) return t.translateArgPathMode(st, regs, 2, dirfd, prootext.PathCreate) case sc.readlink: guest, err := t.translateArgPathReturnGuest(st, regs, 0, atFDCWD) if err != nil { return err } st.pending = &pendingExit{kind: "readlink", guestPath: guest, buf: arg(regs, 1), size: arg(regs, 2)} return nil case sc.readlinkat: dirfd := int(int64(arg(regs, 0))) guest, err := t.translateArgPathReturnGuest(st, regs, 1, dirfd) if err != nil { return err } st.pending = &pendingExit{kind: "readlink", guestPath: guest, buf: arg(regs, 2), size: arg(regs, 3)} return nil case sc.chdir: guest, err := t.translateArgPathReturnGuest(st, regs, 0, atFDCWD) if err != nil { return err } st.pending = &pendingExit{kind: "chdir", guestPath: guest} return nil case sc.fchdir: st.pending = &pendingExit{kind: "fchdir", fd: arg(regs, 0)} return nil case sc.getcwd: st.pending = &pendingExit{kind: "getcwd", buf: arg(regs, 0), size: arg(regs, 1)} return nil case sc.getuid: st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.ruid)} return nil case sc.geteuid: st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.euid)} return nil case sc.getgid: st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.rgid)} return nil case sc.getegid: st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.egid)} return nil case sc.getresuid: st.pending = &pendingExit{kind: "getresuid", buf: arg(regs, 0), size: arg(regs, 1), fd: arg(regs, 2)} return nil case sc.getresgid: st.pending = &pendingExit{kind: "getresgid", buf: arg(regs, 0), size: arg(regs, 1), fd: arg(regs, 2)} return nil case sc.getgroups: st.pending = &pendingExit{kind: "getgroups", size: arg(regs, 0), buf: arg(regs, 1)} return nil case sc.setuid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) uid := uint32(arg(regs, 0)) if !st.creds.canUseUID(uid) { t.debugf("pid=%d deny setuid(%d): r/e/s=%d/%d/%d", st.pid, uid, st.creds.ruid, st.creds.euid, st.creds.suid) return t.fakeErrno(st, regs, syscall.EPERM) } kind := "setuid" if !st.creds.uidPrivileged() { kind = "seteuid" } st.pending = &pendingExit{kind: kind, size: uint64(uid)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.setgid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) gid := uint32(arg(regs, 0)) if !st.creds.canUseGID(gid) { t.debugf("pid=%d deny setgid(%d): r/e/s=%d/%d/%d", st.pid, gid, st.creds.rgid, st.creds.egid, st.creds.sgid) return t.fakeErrno(st, regs, syscall.EPERM) } kind := "setgid" if !st.creds.gidPrivileged() { kind = "setegid" } st.pending = &pendingExit{kind: kind, size: uint64(gid)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.setreuid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) ruid, euid := uint32(arg(regs, 0)), uint32(arg(regs, 1)) if !st.creds.canSetUID(ruid, euid) { t.debugf("pid=%d deny setreuid(%d,%d): r/e/s=%d/%d/%d", st.pid, ruid, euid, st.creds.ruid, st.creds.euid, st.creds.suid) return t.fakeErrno(st, regs, syscall.EPERM) } st.pending = &pendingExit{kind: "setreuid", buf: uint64(ruid), size: uint64(euid)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.setregid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) rgid, egid := uint32(arg(regs, 0)), uint32(arg(regs, 1)) if !st.creds.canSetGID(rgid, egid) { t.debugf("pid=%d deny setregid(%d,%d): r/e/s=%d/%d/%d", st.pid, rgid, egid, st.creds.rgid, st.creds.egid, st.creds.sgid) return t.fakeErrno(st, regs, syscall.EPERM) } st.pending = &pendingExit{kind: "setregid", buf: uint64(rgid), size: uint64(egid)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.setresuid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) ruid, euid, suid := uint32(arg(regs, 0)), uint32(arg(regs, 1)), uint32(arg(regs, 2)) if !st.creds.canSetUID(ruid, euid, suid) { t.debugf("pid=%d deny setresuid(%d,%d,%d): r/e/s=%d/%d/%d", st.pid, ruid, euid, suid, st.creds.ruid, st.creds.euid, st.creds.suid) return t.fakeErrno(st, regs, syscall.EPERM) } st.pending = &pendingExit{kind: "setresuid", buf: uint64(ruid), size: uint64(euid), fd: uint64(suid)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.setresgid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) rgid, egid, sgid := uint32(arg(regs, 0)), uint32(arg(regs, 1)), uint32(arg(regs, 2)) if !st.creds.canSetGID(rgid, egid, sgid) { t.debugf("pid=%d deny setresgid(%d,%d,%d): r/e/s=%d/%d/%d", st.pid, rgid, egid, sgid, st.creds.rgid, st.creds.egid, st.creds.sgid) return t.fakeErrno(st, regs, syscall.EPERM) } st.pending = &pendingExit{kind: "setresgid", buf: uint64(rgid), size: uint64(egid), fd: uint64(sgid)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.setgroups: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) if !st.creds.gidPrivileged() { t.debugf("pid=%d deny setgroups: euid=%d", st.pid, st.creds.euid) return t.fakeErrno(st, regs, syscall.EPERM) } groups, err := readTraceeUint32Array(st.pid, uintptr(arg(regs, 1)), arg(regs, 0)) if err != nil { // setgroups() is part of apt's privilege drop. If the group vector // cannot be read because the tracee is between exec mappings or uses a // special pointer, still emulate success instead of letting the real // unprivileged syscall run and fail with EPERM. t.debugf("pid=%d failed reading setgroups vector: %v", st.pid, err) groups = nil } st.creds.groups = groups setSysno(regs, sc.getpid) st.pending = &pendingExit{kind: "fakezero"} return setRegs(st.pid, regs) case sc.setfsuid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) old := st.creds.fsuid uid := uint32(arg(regs, 0)) if st.creds.canUseUID(uid) { st.creds.fsuid = uid } st.pending = &pendingExit{kind: "ret", size: uint64(old)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.setfsgid: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) old := st.creds.fsgid gid := uint32(arg(regs, 0)) if st.creds.canUseGID(gid) { st.creds.fsgid = gid } st.pending = &pendingExit{kind: "ret", size: uint64(old)} setSysno(regs, sc.getpid) return setRegs(st.pid, regs) case sc.chroot: t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2)) setSysno(regs, sc.getpid) st.pending = &pendingExit{kind: "fakezero"} return setRegs(st.pid, regs) } return nil } func (t *tracer) onSyscallExit(st *traceeState, regs *syscall.PtraceRegs) error { if st.pending == nil { return nil } p := st.pending st.pending = nil retv := retval(regs) switch p.kind { case "hardlink": if retv == 0 { return nil } if !shouldFallbackHardLink(retv) { return nil } if err := emulateHardLinkFallback(p.sourcePath, p.targetPath, p.size); err != nil { t.debugf("pid=%d hardlink fallback failed source=%q target=%q: %v", st.pid, p.sourcePath, p.targetPath, err) setRetval(regs, -int64(errnoFromError(err))) return setRegs(st.pid, regs) } t.debugf("pid=%d hardlink fallback copied source=%q target=%q after native errno=%d", st.pid, p.sourcePath, p.targetPath, -retv) setRetval(regs, 0) return setRegs(st.pid, regs) case "chdir": if retv == 0 { st.cwd = cleanGuestPath(p.guestPath) } case "fchdir": if retv == 0 { if guest, ok := t.fdGuestPath(st.pid, int(p.fd)); ok { st.cwd = cleanGuestPath(guest) } } case "ret": setRetval(regs, int64(uint32(p.size))) return setRegs(st.pid, regs) case "errno": setRetval(regs, -int64(syscall.Errno(p.size))) return setRegs(st.pid, regs) case "fakezero": setRetval(regs, 0) return setRegs(st.pid, regs) case "setuid": uid := uint32(p.size) st.creds.ruid, st.creds.euid, st.creds.suid, st.creds.fsuid = uid, uid, uid, uid setRetval(regs, 0) return setRegs(st.pid, regs) case "seteuid": uid := uint32(p.size) st.creds.euid, st.creds.fsuid = uid, uid setRetval(regs, 0) return setRegs(st.pid, regs) case "setgid": gid := uint32(p.size) st.creds.rgid, st.creds.egid, st.creds.sgid, st.creds.fsgid = gid, gid, gid, gid setRetval(regs, 0) return setRegs(st.pid, regs) case "setegid": gid := uint32(p.size) st.creds.egid, st.creds.fsgid = gid, gid setRetval(regs, 0) return setRegs(st.pid, regs) case "setreuid": if v := uint32(p.buf); v != ^uint32(0) { st.creds.ruid = v } if v := uint32(p.size); v != ^uint32(0) { st.creds.euid = v st.creds.fsuid = v } st.creds.suid = st.creds.euid setRetval(regs, 0) return setRegs(st.pid, regs) case "setregid": if v := uint32(p.buf); v != ^uint32(0) { st.creds.rgid = v } if v := uint32(p.size); v != ^uint32(0) { st.creds.egid = v st.creds.fsgid = v } st.creds.sgid = st.creds.egid setRetval(regs, 0) return setRegs(st.pid, regs) case "setresuid": if v := uint32(p.buf); v != ^uint32(0) { st.creds.ruid = v } if v := uint32(p.size); v != ^uint32(0) { st.creds.euid = v st.creds.fsuid = v } if v := uint32(p.fd); v != ^uint32(0) { st.creds.suid = v } setRetval(regs, 0) return setRegs(st.pid, regs) case "setresgid": if v := uint32(p.buf); v != ^uint32(0) { st.creds.rgid = v } if v := uint32(p.size); v != ^uint32(0) { st.creds.egid = v st.creds.fsgid = v } if v := uint32(p.fd); v != ^uint32(0) { st.creds.sgid = v } setRetval(regs, 0) return setRegs(st.pid, regs) case "getresuid": _ = writeTraceeUint32(st.pid, uintptr(p.buf), st.creds.ruid) _ = writeTraceeUint32(st.pid, uintptr(p.size), st.creds.euid) _ = writeTraceeUint32(st.pid, uintptr(p.fd), st.creds.suid) setRetval(regs, 0) return setRegs(st.pid, regs) case "getresgid": _ = writeTraceeUint32(st.pid, uintptr(p.buf), st.creds.rgid) _ = writeTraceeUint32(st.pid, uintptr(p.size), st.creds.egid) _ = writeTraceeUint32(st.pid, uintptr(p.fd), st.creds.sgid) setRetval(regs, 0) return setRegs(st.pid, regs) case "getgroups": if p.size == 0 { setRetval(regs, int64(len(st.creds.groups))) return setRegs(st.pid, regs) } if p.size < uint64(len(st.creds.groups)) { setRetval(regs, -int64(syscall.EINVAL)) return setRegs(st.pid, regs) } for i, g := range st.creds.groups { _ = writeTraceeUint32(st.pid, uintptr(p.buf)+uintptr(i*4), g) } setRetval(regs, int64(len(st.creds.groups))) return setRegs(st.pid, regs) case "getcwd": if p.size == 0 { setRetval(regs, -int64(syscall.ERANGE)) return setRegs(st.pid, regs) } data := append([]byte(cleanGuestPath(st.cwd)), 0) if uint64(len(data)) > p.size { setRetval(regs, -int64(syscall.ERANGE)) return setRegs(st.pid, regs) } if _, err := syscall.PtracePokeData(st.pid, uintptr(p.buf), data); err != nil { return err } setRetval(regs, int64(len(data))) return setRegs(st.pid, regs) case "readlink": if retv <= 0 { return nil } buf := make([]byte, retv) if _, err := syscall.PtracePeekData(st.pid, uintptr(p.buf), buf); err != nil { return nil } target := string(buf) if strings.HasPrefix(target, t.pm.root) || filepath.IsAbs(target) { guest := t.pm.HostToGuest(target) if guest != target && guest != "" { out := []byte(guest) if uint64(len(out)) > p.size { out = out[:p.size] } if _, err := syscall.PtracePokeData(st.pid, uintptr(p.buf), out); err == nil { setRetval(regs, int64(len(out))) return setRegs(st.pid, regs) } } } } return nil } func (t *tracer) translateSymlinkTarget(st *traceeState, regs *syscall.PtraceRegs, n int) error { addr := arg(regs, n) if addr == 0 { return nil } old, err := readTraceeString(st.pid, uintptr(addr), 4096) if err != nil || old == "" { return err } // The link name is translated separately. The link *target*, however, is // stored verbatim by the kernel. If a guest creates an absolute symlink such // as /tmp/apt-dpkg-install/00-foo.deb -> /var/cache/apt/archives/foo.deb and // we leave the target unchanged, the host kernel later follows it outside the // rootfs and dpkg sees ENOENT. Store the host target for absolute links, and // translate it back on readlink() exit so guest-visible semantics remain // /var/cache/apt/archives/foo.deb. if !filepath.IsAbs(old) { return nil } guest, host, special := t.translateSpecialProcPath(st, "/", old) if !special { guest, host = t.pm.Translate("/", old) } t.debugf("pid=%d symlink-target old=%q guest=%q host=%q", st.pid, old, guest, host) newAddr, err := writeTraceeString(st, uintptr(addr), old, host) if err != nil { return err } setArg(regs, n, uint64(newAddr)) return setRegs(st.pid, regs) } const ( atSymlinkFollow = 0x400 atEmptyPath = 0x1000 ) // translateHardLink rewrites both link/linkat path arguments and retains the // translated host paths until syscall-exit. Android kernels and filesystems can // reject otherwise valid hard links from an unprivileged tracee; dpkg relies on // those links for rollback files, so the exit handler can create a regular copy // when the native operation fails with a policy/filesystem error. func (t *tracer) translateHardLink(st *traceeState, regs *syscall.PtraceRegs, oldArg, oldDirfd, newArg, newDirfd int, flags uint64) error { if err := t.translateArgPath(st, regs, oldArg, oldDirfd); err != nil { return err } source, err := readTraceeString(st.pid, uintptr(arg(regs, oldArg)), 4096) if err != nil { return err } if source == "" && flags&atEmptyPath != 0 && oldDirfd >= 0 { source, err = os.Readlink(fmt.Sprintf("/proc/%d/fd/%d", st.pid, oldDirfd)) if err != nil { return err } } if err := t.translateArgPathMode(st, regs, newArg, newDirfd, prootext.PathCreate); err != nil { return err } target, err := readTraceeString(st.pid, uintptr(arg(regs, newArg)), 4096) if err != nil { return err } st.pending = &pendingExit{ kind: "hardlink", sourcePath: source, targetPath: target, size: flags, } return nil } func shouldFallbackHardLink(retv int64) bool { if retv >= 0 { return false } switch syscall.Errno(-retv) { case syscall.EPERM, syscall.EACCES, syscall.EXDEV, syscall.ENOSYS, syscall.EOPNOTSUPP: return true default: return false } } // emulateHardLinkFallback implements the backup semantics needed by dpkg when // the host rejects link/linkat. A regular file is copied with O_EXCL so the // operation keeps link(2)'s no-overwrite behavior. Symlinks are recreated when // linkat was not asked to follow them. This is deliberately a fallback: native // hard links remain the primary path and retain full inode-sharing semantics. func emulateHardLinkFallback(source, target string, flags uint64) (retErr error) { info, err := os.Lstat(source) if err != nil { return err } if info.IsDir() { return syscall.EPERM } if info.Mode()&os.ModeSymlink != 0 && flags&atSymlinkFollow == 0 { value, err := os.Readlink(source) if err != nil { return err } return os.Symlink(value, target) } if info.Mode()&os.ModeSymlink != 0 { info, err = os.Stat(source) if err != nil { return err } } if !info.Mode().IsRegular() { return syscall.EPERM } src, err := os.Open(source) if err != nil { return err } defer src.Close() dst, err := os.OpenFile(target, os.O_WRONLY|os.O_CREATE|os.O_EXCL, info.Mode().Perm()) if err != nil { return err } created := true defer func() { if closeErr := dst.Close(); retErr == nil && closeErr != nil { retErr = closeErr } if retErr != nil && created { _ = os.Remove(target) } }() if _, err := io.Copy(dst, src); err != nil { return err } if err := dst.Sync(); err != nil { return err } if err := dst.Chmod(info.Mode().Perm()); err != nil { return err } created = false return nil } func errnoFromError(err error) syscall.Errno { var errno syscall.Errno if errors.As(err, &errno) { return errno } return syscall.EIO } func (t *tracer) translateArgPath(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int) error { _, err := t.translateArgPathReturnGuestMode(st, regs, n, dirfd, prootext.PathRead) return err } func (t *tracer) translateArgPathMode(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int, mode prootext.PathMode) error { _, err := t.translateArgPathReturnGuestMode(st, regs, n, dirfd, mode) return err } func (t *tracer) translateArgPathReturnGuest(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int) (string, error) { return t.translateArgPathReturnGuestMode(st, regs, n, dirfd, prootext.PathRead) } func (t *tracer) translateArgPathReturnGuestMode(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int, mode prootext.PathMode) (string, error) { addr := arg(regs, n) if addr == 0 { return "", nil } old, err := readTraceeString(st.pid, uintptr(addr), 4096) if err != nil { return "", err } if old == "" { return "", nil } cwd := st.cwd if dirfd != atFDCWD && !strings.HasPrefix(old, "/") { if guest, ok := t.fdGuestPath(st.pid, dirfd); ok { cwd = guest } } guest, host, special := t.translateSpecialProcPath(st, cwd, old) if !special { guest, host = t.pm.TranslateMode(cwd, old, mode) } t.debugf("pid=%d path mode=%d cwd=%q old=%q guest=%q host=%q", st.pid, mode, cwd, old, guest, host) newAddr, err := writeTraceeString(st, uintptr(addr), old, host) if err != nil { return guest, err } setArg(regs, n, uint64(newAddr)) return guest, setRegs(st.pid, regs) } func pathModeFromOpenFlags(flags uint64) prootext.PathMode { const accessMode = uint64(os.O_RDONLY | os.O_WRONLY | os.O_RDWR) switch flags & accessMode { case uint64(os.O_WRONLY), uint64(os.O_RDWR): if flags&uint64(os.O_CREATE|os.O_TRUNC) != 0 { return prootext.PathCreate } return prootext.PathWrite default: return prootext.PathRead } } func (t *tracer) pathModeFromOpenHow(st *traceeState, addr uint64) prootext.PathMode { flags, err := readTraceeUint64(st.pid, uintptr(addr)) if err != nil { return prootext.PathWrite } return pathModeFromOpenFlags(flags) } func (t *tracer) fdGuestPath(pid int, fd int) (string, bool) { if fd < 0 { return "", false } link := fmt.Sprintf("/proc/%d/fd/%d", pid, fd) target, err := os.Readlink(link) if err != nil { return "", false } return t.pm.HostToGuest(target), true } func (t *tracer) translateSpecialProcPath(st *traceeState, cwd, p string) (guest, host string, ok bool) { guest = joinGuest(cwd, p) // /proc is intentionally not bound by default because walking the host // procfs from commands such as `find /` races disappearing tasks. Some // programs, however, require the process-local procfs aliases, especially // apt/dpkg probing /proc/self/fd. Provide just those aliases without // exposing the entire host /proc tree as a guest bind. if guest == "/proc/self" || strings.HasPrefix(guest, "/proc/self/") { suffix := strings.TrimPrefix(guest, "/proc/self") return guest, filepath.Clean(filepath.Join("/proc", strconv.Itoa(st.pid), suffix)), true } if guest == "/proc/thread-self" || strings.HasPrefix(guest, "/proc/thread-self/") { suffix := strings.TrimPrefix(guest, "/proc/thread-self") return guest, filepath.Clean(filepath.Join("/proc", strconv.Itoa(st.pid), "task", strconv.Itoa(st.pid), suffix)), true } // If the tracee uses the real PID returned by getpid(), keep self-like // entries usable even with DefaultBinds not containing /proc. Do not map // arbitrary /proc/ paths; that would reintroduce the host procfs // traversal problem this backend tries to avoid by default. pidPrefix := "/proc/" + strconv.Itoa(st.pid) if guest == pidPrefix || strings.HasPrefix(guest, pidPrefix+"/") { suffix := strings.TrimPrefix(guest, pidPrefix) switch { case suffix == "", suffix == "/fd", strings.HasPrefix(suffix, "/fd/"), suffix == "/fdinfo", strings.HasPrefix(suffix, "/fdinfo/"), suffix == "/cwd", suffix == "/exe", suffix == "/root", suffix == "/status": return guest, filepath.Clean(filepath.Join("/proc", strconv.Itoa(st.pid), suffix)), true } } return "", "", false } func readTraceeString(pid int, addr uintptr, max int) (string, error) { if addr == 0 { return "", nil } var out []byte buf := make([]byte, 256) for len(out) < max { n, err := syscall.PtracePeekData(pid, addr+uintptr(len(out)), buf) if err != nil { return "", err } for i := 0; i < n; i++ { if buf[i] == 0 { return string(out), nil } out = append(out, buf[i]) if len(out) >= max { break } } } return string(out), nil } func writeTraceeString(st *traceeState, oldAddr uintptr, old, new string) (uintptr, error) { data := append([]byte(new), 0) addr := uintptr(st.scratch - uint64(len(data)+16)) addr &^= uintptr(15) st.scratch = uint64(addr) _, err := syscall.PtracePokeData(st.pid, addr, data) return addr, err } func (t *tracer) translateExecve(st *traceeState, regs *syscall.PtraceRegs, pathArg, argvArg int, dirfd int) error { pathAddr := arg(regs, pathArg) if pathAddr == 0 { return nil } old, err := readTraceeString(st.pid, uintptr(pathAddr), 4096) if err != nil || old == "" { return err } cwd := st.cwd if dirfd != atFDCWD && !strings.HasPrefix(old, "/") { if guest, ok := t.fdGuestPath(st.pid, dirfd); ok { cwd = guest } } guest, host, special := t.translateSpecialProcPath(st, cwd, old) if !special { guest, host = t.pm.Translate(cwd, old) } t.debugf("pid=%d exec cwd=%q old=%q guest=%q host=%q", st.pid, cwd, old, guest, host) argv, _ := readTraceeStringVector(st.pid, uintptr(arg(regs, argvArg)), 4096) if len(argv) == 0 { argv = []string{old} } if rw, changed := t.pm.resolveExec(host, guest, argv); changed { t.debugf("pid=%d exec rewrite execPath=%q argv=%q", st.pid, rw.ExecPath, strings.Join(rw.Argv, " ")) newPath, err := writeScratchString(st, rw.ExecPath) if err != nil { return err } argvPtr, err := writeTraceeStringVector(st, rw.Argv) if err != nil { return err } setArg(regs, pathArg, uint64(newPath)) setArg(regs, argvArg, uint64(argvPtr)) return setRegs(st.pid, regs) } newAddr, err := writeTraceeString(st, uintptr(pathAddr), old, host) if err != nil { return err } setArg(regs, pathArg, uint64(newAddr)) return setRegs(st.pid, regs) } func writeTraceeUint32(pid int, addr uintptr, v uint32) error { if addr == 0 { return nil } data := make([]byte, 4) putTraceeUint32(data, v) _, err := syscall.PtracePokeData(pid, addr, data) return err } func readTraceeUint32(pid int, addr uintptr) (uint32, error) { buf := make([]byte, 4) if _, err := syscall.PtracePeekData(pid, addr, buf); err != nil { return 0, err } return traceeUint32(buf), nil } func readTraceeUint64(pid int, addr uintptr) (uint64, error) { buf := make([]byte, 8) if _, err := syscall.PtracePeekData(pid, addr, buf); err != nil { return 0, err } if nativeLittleEndian { var v uint64 for i := range buf { v |= uint64(buf[i]) << (8 * uint(i)) } return v, nil } var v uint64 for i := range buf { v = (v << 8) | uint64(buf[i]) } return v, nil } func readTraceeUint32Array(pid int, addr uintptr, count uint64) ([]uint32, error) { if count == 0 { return nil, nil } if addr == 0 { return nil, syscall.EFAULT } out := make([]uint32, 0, count) for i := uint64(0); i < count; i++ { v, err := readTraceeUint32(pid, addr+uintptr(i*4)) if err != nil { return nil, err } out = append(out, v) } return out, nil } func readTraceeStringVector(pid int, addr uintptr, max int) ([]string, error) { if addr == 0 { return nil, nil } out := []string{} for i := 0; i < max; i++ { p, err := readTraceePtr(pid, addr+uintptr(i*ptrSize)) if err != nil { return out, err } if p == 0 { return out, nil } s, err := readTraceeString(pid, uintptr(p), 4096) if err != nil { return out, err } out = append(out, s) } return out, nil } func readTraceePtr(pid int, addr uintptr) (uint64, error) { buf := make([]byte, ptrSize) _, err := syscall.PtracePeekData(pid, addr, buf) if err != nil { return 0, err } return traceePtr(buf), nil } func writeScratchString(st *traceeState, s string) (uintptr, error) { data := append([]byte(s), 0) addr := uintptr(st.scratch - uint64(len(data)+16)) addr &^= uintptr(15) st.scratch = uint64(addr) _, err := syscall.PtracePokeData(st.pid, addr, data) return addr, err } func writeTraceeStringVector(st *traceeState, values []string) (uintptr, error) { ptrs := make([]uint64, 0, len(values)+1) for _, s := range values { addr, err := writeScratchString(st, s) if err != nil { return 0, err } ptrs = append(ptrs, uint64(addr)) } ptrs = append(ptrs, 0) data := make([]byte, len(ptrs)*ptrSize) for i, p := range ptrs { base := i * ptrSize putTraceePtr(data[base:base+ptrSize], p) } addr := uintptr(st.scratch - uint64(len(data)+16)) addr &^= uintptr(15) st.scratch = uint64(addr) _, err := syscall.PtracePokeData(st.pid, addr, data) return addr, err } func traceeUint32(buf []byte) uint32 { if nativeLittleEndian { return uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 } return uint32(buf[3]) | uint32(buf[2])<<8 | uint32(buf[1])<<16 | uint32(buf[0])<<24 } func putTraceeUint32(buf []byte, v uint32) { if nativeLittleEndian { buf[0] = byte(v) buf[1] = byte(v >> 8) buf[2] = byte(v >> 16) buf[3] = byte(v >> 24) return } buf[0] = byte(v >> 24) buf[1] = byte(v >> 16) buf[2] = byte(v >> 8) buf[3] = byte(v) } func traceePtr(buf []byte) uint64 { var v uint64 if nativeLittleEndian { for i := 0; i < ptrSize; i++ { v |= uint64(buf[i]) << (8 * uint(i)) } return v } for i := 0; i < ptrSize; i++ { v = (v << 8) | uint64(buf[i]) } return v } func putTraceePtr(buf []byte, v uint64) { if nativeLittleEndian { for i := 0; i < ptrSize; i++ { buf[i] = byte(v >> (8 * uint(i))) } return } for i := 0; i < ptrSize; i++ { buf[i] = byte(v >> (8 * uint(ptrSize-1-i))) } }