Files
exec/proot/native_linux.go
T

2334 lines
67 KiB
Go

//go:build linux || android
package proot
import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"
"sirherobrine23.com.br/go-bds/exec/v2/process"
prootext "sirherobrine23.com.br/go-bds/exec/v2/proot/extensions/extensions"
)
// atFDCWD has the numeric value of Linux's AT_FDCWD (-100), the dirfd sentinel
// that makes the *at() syscall family resolve relative paths against the
// current working directory.
const atFDCWD = -100
// Linux ptrace tracers must wait with __WALL so clone/vfork/thread stops
// generated by PTRACE_O_TRACE* are visible even when they are not normal
// children according to wait4(2). Without it, helpers spawned by apt/dpkg can
// disappear or stay stopped while the tracer blocks waiting for an event that
// is hidden from plain wait4(-1, ...).
const waitTraceOptions = syscall.WALL
// Keep scratch strings/vectors outside the AMD64 red-zone and similar
// architecture/compiler scratch area below the user stack pointer. Most path
// rewrites are only needed for the duration of a single syscall, but writing
// them immediately below SP can corrupt leaf-function red-zone locals in libc,
// dpkg, or apt helpers. A small gap keeps the injected strings away from that
// live user stack area without going far enough below SP to commonly hit an
// unmapped guard page.
const stackScratchGap = 256
const (
// processGroupShutdownTimeout is how long the process group is given to exit
// on its own after the ptrace loop finished cleanly, before SIGINT is sent.
processGroupShutdownTimeout = 90 * time.Second
// processGroupKillGrace is the delay between an interrupt (SIGINT) and the
// follow-up SIGKILL, both on shutdown and on context cancellation.
processGroupKillGrace = 5 * time.Second
)
// errProcessExited is returned by Kill and Signal when no process handle is
// available, and by Wait when there is no result left to receive.
var errProcessExited = errors.New("process already exited")
// nativeProcess is one ptrace-traced command together with its stdio plumbing
// and the tracer that drives it.
type nativeProcess struct {
// process is the handle returned by os.StartProcess; pid and pgid are both
// initialised to its PID (the child is started with Setpgid).
process *os.Process
pid int
pgid int
// stdin, stdout and stderr are the caller-supplied streams (may be nil).
stdin io.Reader
stdout, stderr io.Writer
// stdinW is the parent's write end of the stdin pipe; stdoutR and stderrR are
// the parent's read ends of the output pipes. Each is nil when no pipe was
// created for that stream.
stdinW *os.File
stdoutR *os.File
stderrR *os.File
copyWG sync.WaitGroup // stdin-only; never waited by Wait()
// outputWG tracks the stdout/stderr copy goroutines; it is waited on before
// the final result is published.
outputWG sync.WaitGroup
// childClose holds the pipe ends handed to the child, which the parent closes
// once the child has been started. parentClose holds the parent's own ends,
// closed only when start-up is abandoned.
childClose []io.Closer
parentClose []io.Closer
// cleanup holds extension cleanup callbacks, run in reverse order.
cleanup []func() error
// stdioStarted makes startCopyLoops a no-op on repeated calls.
stdioStarted bool
tracer *tracer
// done receives the final error exactly once (buffered, capacity 1).
done chan error
// NOTE(review): once is not referenced in the visible part of the file.
once sync.Once
// mu guards exitCode and exited.
mu sync.Mutex
exitCode int
exited bool
}
// startNative resolves the guest command described by options, spawns it as a
// ptrace-traced child in its own process group and starts the tracer loop that
// drives it until the whole process group is gone.
//
// cleanups are stored on the returned process and run (in reverse order) once
// the tracer goroutine finishes. An error is returned when options.Arguments is
// empty, when the path mapper or an exec resolver fails, when the stdio pipes
// cannot be created, or when the child cannot be started.
//
// The child is started from the same locked OS thread that later issues every
// ptrace request. Linux binds a tracee to the thread that created it, and the
// syscall package documents that SysProcAttr.Ptrace requires
// runtime.LockOSThread; forking on the caller's thread and tracing from another
// one makes ptrace calls fail with ESRCH.
func startNative(pr *Proot, config prootext.Config, cleanups []func() error, options *process.Exec) (*nativeProcess, error) {
	if len(options.Arguments) == 0 {
		return nil, errors.New("proot: no command specified")
	}
	pm, err := newPathMapper(config.Rootfs, config.PathResolvers)
	if err != nil {
		return nil, err
	}
	cwd := options.Cwd
	if cwd == "" {
		cwd = "/"
	}
	cwd = cleanGuestPath(cwd)
	hostCwd := pm.GuestToHost(cwd)

	// Work on a private copy so resolvers may rewrite argv freely.
	args := append([]string(nil), options.Arguments...)
	execPath := args[0]
	if resolution, handled, err := resolveExecExtensions(config.ExecResolvers, pm, cwd, args); err != nil {
		return nil, err
	} else if handled {
		execPath = resolution.ExecPath
		args = resolution.Args
	} else {
		guestCmd, hostCmd := pm.Translate(cwd, args[0])
		if rw, changed := pm.resolveExec(hostCmd, guestCmd, args); changed {
			execPath = rw.ExecPath
			args = rw.Argv
		} else {
			execPath = hostCmd
			args[0] = hostCmd
		}
	}

	p := &nativeProcess{done: make(chan error, 1), exitCode: -1, cleanup: cleanups}
	if pr.proc != nil {
		p.stdin, p.stdout, p.stderr = pr.proc.stdin, pr.proc.stdout, pr.proc.stderr
	}
	if options.Stdin != nil {
		p.stdin = options.Stdin
	}
	if options.Stdout != nil {
		p.stdout = options.Stdout
	}
	if options.Stderr != nil {
		p.stderr = options.Stderr
	}
	files, err := p.prepareFiles()
	if err != nil {
		return nil, err
	}

	// Copy the caller's environment overlay instead of writing
	// LD_LIBRARY_PATH/PWD into the map owned by the caller.
	envOverlay := make(process.Env, len(options.Environment)+2)
	for k, v := range options.Environment {
		envOverlay[k] = v
	}
	if lp := pm.libraryPath(); lp != "" {
		if old := envOverlay["LD_LIBRARY_PATH"]; old != "" {
			envOverlay["LD_LIBRARY_PATH"] = lp + ":" + old
		} else {
			envOverlay["LD_LIBRARY_PATH"] = lp
		}
	}
	envOverlay["PWD"] = cwd
	env := mergeEnv(os.Environ(), envOverlay)
	attr := &os.ProcAttr{
		Dir:   hostCwd,
		Env:   env,
		Files: files,
		Sys: &syscall.SysProcAttr{
			Ptrace:  true,
			Setpgid: true,
		},
	}
	p.tracer = newTracer(pm, cwd, uint32(pr.UID), uint32(pr.GID))

	// started reports the outcome of os.StartProcess back to the caller.
	started := make(chan error, 1)
	go func() {
		// Fork, ptrace and wait4 must all happen on this one OS thread.
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()

		proc, err := os.StartProcess(execPath, args, attr)
		if err != nil {
			started <- err
			return
		}
		// The parent must close the child-side pipe descriptors immediately.
		// Otherwise stdout/stderr copy loops never observe EOF because the tracer
		// process itself still owns a write end of the pipe.
		p.closeChildPipes()
		p.process = proc
		p.pid = proc.Pid
		p.pgid = proc.Pid
		p.startCopyLoops()
		// Everything the caller may touch is initialised; the channel send
		// publishes those writes.
		started <- nil

		// finished releases the context watcher once the process is done, so it
		// neither leaks nor signals a process group that no longer exists.
		finished := make(chan struct{})
		defer close(finished)
		if options.Context != nil {
			go func() {
				select {
				case <-options.Context.Done():
					p.interruptThenKillAfter(processGroupKillGrace)
				case <-finished:
				}
			}()
		}

		err = p.tracer.loop(p.pid)
		// Only wait for helper processes after a clean ptrace loop. If tracer
		// setup itself failed (for example a transient ESRCH from
		// PTRACE_SETOPTIONS), the root can still be stopped under ptrace and
		// remain visible in the process group forever. Kill that incomplete
		// tracee tree immediately instead of waiting the normal 90-second
		// package-manager shutdown window.
		if err != nil {
			p.tracer.debugf("ptrace loop failed for root pid=%d: %v; terminating process group", p.pid, err)
			_ = syscall.Kill(-p.pgid, syscall.SIGKILL)
			p.reapProcessGroupChildren()
		} else {
			// The ptrace root can legitimately disappear before package-manager
			// helper children in the same process group have finished flushing
			// stdout/stderr and modifying the rootfs. Do not report process
			// completion until the whole process group is gone.
			if waitErr := p.waitProcessGroupGone(processGroupShutdownTimeout); waitErr != nil {
				err = waitErr
			}
		}
		// Stop the parent-side stdio plumbing before waiting for copy loops.
		// In the root-gone/detach paths the traced process tree may have already
		// disappeared without delivering the final pipe EOF in the usual order.
		// Closing stdout/stderr readers here wakes the copy goroutines so Wait()
		// cannot hang after the ptrace loop has already completed.
		p.closeParentInputPipes()
		p.closeParentOutputPipes()
		p.outputWG.Wait()
		err = errors.Join(err, p.cleanupExtensions())
		p.mu.Lock()
		if p.tracer.exitCodeSet {
			p.exitCode = p.tracer.exitCode
		}
		p.exited = true
		p.mu.Unlock()
		_ = p.process.Release()
		p.done <- err
	}()

	if err := <-started; err != nil {
		p.closeAllPipes()
		return nil, err
	}
	return p, nil
}
// resolveExecExtensions offers the command to each exec resolver, starting with
// the last one in the slice, and returns the first resolution marked Handled.
// Every resolver receives its own copy of args. The boolean result reports
// whether any resolver handled the command; a resolver error aborts the search.
func resolveExecExtensions(resolvers []prootext.ExecResolver, pm *pathMapper, cwd string, args []string) (prootext.ExecResolution, bool, error) {
	translate := func(cwd, name string, mode prootext.PathMode) (string, string) {
		return pm.TranslateMode(cwd, name, mode)
	}
	for offset := range resolvers {
		// Walk from the end of the slice towards the front.
		current := resolvers[len(resolvers)-1-offset]
		request := prootext.ExecRequest{
			Cwd:       cwd,
			Args:      append([]string(nil), args...),
			Translate: translate,
		}
		result, err := current.ResolveExec(request)
		if err != nil {
			return prootext.ExecResolution{}, false, err
		}
		if !result.Handled {
			continue
		}
		return result, true, nil
	}
	return prootext.ExecResolution{}, false, nil
}
// cleanupExtensions runs the registered cleanup callbacks from last to first,
// joins whatever errors they return, and clears the list so they cannot run
// a second time.
func (p *nativeProcess) cleanupExtensions() error {
	var joined error
	for remaining := len(p.cleanup); remaining > 0; remaining-- {
		fn := p.cleanup[remaining-1]
		joined = errors.Join(joined, fn())
	}
	p.cleanup = nil
	return joined
}
// mergeEnv returns a copy of base (a list of KEY=VALUE strings) with every
// overlay entry applied. A key already present in base is replaced in place;
// new keys are appended. base itself is never modified.
//
// When base contains the same key more than once, the FIRST occurrence is the
// one replaced: environment lookups such as getenv(3) scan from the start, so
// overwriting a later duplicate would leave the stale value in effect.
// Entries without '=' are kept as-is and never matched.
func mergeEnv(base []string, overlay map[string]string) []string {
	out := append([]string(nil), base...)
	first := make(map[string]int, len(out))
	for i, kv := range out {
		key, _, ok := strings.Cut(kv, "=")
		if !ok {
			continue
		}
		if _, seen := first[key]; !seen {
			first[key] = i
		}
	}
	for k, v := range overlay {
		kv := k + "=" + v
		if i, ok := first[k]; ok {
			out[i] = kv
			continue
		}
		first[k] = len(out)
		out = append(out, kv)
	}
	return out
}
// prepareFiles builds the three-entry file table (stdin, stdout, stderr) for
// the child. Streams that were not configured inherit the tracer's own
// descriptors. A stdin that already is an *os.File is passed through directly;
// every other configured stream gets a pipe whose child end is recorded in
// childClose and whose parent end is recorded in parentClose.
//
// If creating a pipe fails, every pipe created so far is closed and the pipe
// fields are reset, so a failed call does not leak file descriptors.
func (p *nativeProcess) prepareFiles() ([]*os.File, error) {
	files := []*os.File{os.Stdin, os.Stdout, os.Stderr}
	// fail releases the descriptors created before the error occurred.
	fail := func(err error) ([]*os.File, error) {
		p.closeAllPipes()
		p.stdinW, p.stdoutR, p.stderrR = nil, nil, nil
		return nil, err
	}
	if p.stdin != nil {
		if f, ok := p.stdin.(*os.File); ok {
			files[0] = f
		} else {
			r, w, err := os.Pipe()
			if err != nil {
				return fail(err)
			}
			p.stdinW = w
			p.childClose = append(p.childClose, r)
			p.parentClose = append(p.parentClose, w)
			files[0] = r
		}
	}
	if p.stdout != nil {
		r, w, err := os.Pipe()
		if err != nil {
			return fail(err)
		}
		p.stdoutR = r
		p.childClose = append(p.childClose, w)
		p.parentClose = append(p.parentClose, r)
		files[1] = w
	}
	if p.stderr != nil {
		r, w, err := os.Pipe()
		if err != nil {
			return fail(err)
		}
		p.stderrR = r
		p.childClose = append(p.childClose, w)
		p.parentClose = append(p.parentClose, r)
		files[2] = w
	}
	return files, nil
}
// startCopyLoops launches the goroutines that shuttle data between the caller's
// streams and the child's pipes. It does nothing on a second call.
func (p *nativeProcess) startCopyLoops() {
	if p.stdioStarted {
		return
	}
	p.stdioStarted = true

	if p.stdin != nil && p.stdinW != nil {
		// Do not include stdin in the output wait path. A generic Reader can block
		// forever waiting for terminal input, and Wait() must report process exit
		// independently from stdin draining.
		p.copyWG.Add(1)
		go func() {
			defer p.copyWG.Done()
			_, _ = io.Copy(p.stdinW, p.stdin)
			_ = p.stdinW.Close()
		}()
	}

	// drain copies one child output pipe into the caller's writer and is
	// tracked by outputWG.
	drain := func(dst io.Writer, src io.Reader) {
		p.outputWG.Add(1)
		go func() {
			defer p.outputWG.Done()
			_, _ = io.Copy(dst, src)
		}()
	}
	if p.stdout != nil && p.stdoutR != nil {
		drain(p.stdout, p.stdoutR)
	}
	if p.stderr != nil && p.stderrR != nil {
		drain(p.stderr, p.stderrR)
	}
}
// waitProcessGroupGone polls every 20ms until no live member of the process
// group remains. After timeout it escalates: first SIGINT, then SIGKILL after
// processGroupKillGrace, and one second later it gives up with an error if the
// group still exists. A non-positive pgid returns nil immediately.
func (p *nativeProcess) waitProcessGroupGone(timeout time.Duration) error {
	if p.pgid <= 0 {
		return nil
	}
	const (
		stageWaiting = iota
		stageInterrupted
		stageKilled
	)
	stage := stageWaiting
	deadline := time.Now().Add(timeout)
	for {
		p.reapProcessGroupChildren()
		if !p.processGroupExists() {
			return nil
		}
		if now := time.Now(); now.After(deadline) {
			switch stage {
			case stageWaiting:
				p.tracer.debugf("process group %d still alive after %s; sending SIGINT", p.pgid, timeout)
				_ = syscall.Kill(-p.pgid, syscall.SIGINT)
				stage = stageInterrupted
				deadline = now.Add(processGroupKillGrace)
			case stageInterrupted:
				p.tracer.debugf("process group %d ignored SIGINT; sending SIGKILL", p.pgid)
				_ = syscall.Kill(-p.pgid, syscall.SIGKILL)
				stage = stageKilled
				deadline = now.Add(time.Second)
			case stageKilled:
				// Last chance: reap once more before declaring failure.
				p.reapProcessGroupChildren()
				if p.processGroupExists() {
					return fmt.Errorf("proot: process group %d did not exit after SIGKILL", p.pgid)
				}
				return nil
			}
		}
		time.Sleep(20 * time.Millisecond)
	}
}
// processGroupExists reports whether this process's group still has members,
// as determined by the package-level processGroupExists.
func (p *nativeProcess) processGroupExists() bool {
	group := p.pgid
	return processGroupExists(group)
}
// processGroupExists reports whether process group pgid still has members.
// It prefers the /proc scan, which ignores zombies; if that scan fails it falls
// back to probing the group with signal 0, treating EPERM as "exists".
func processGroupExists(pgid int) bool {
	if pgid <= 0 {
		return false
	}
	if members, scanErr := processGroupLiveMembers(pgid); scanErr == nil {
		return len(members) > 0
	}
	probeErr := syscall.Kill(-pgid, 0)
	if probeErr == nil {
		return true
	}
	return errors.Is(probeErr, syscall.EPERM)
}
// processGroupLiveMembers scans /proc and returns the PIDs whose stat file
// reports process group pgid and a state other than 'Z' (zombie). Entries that
// are not numeric directories, or whose stat file cannot be read or parsed, are
// skipped. An error is returned only when /proc itself cannot be listed.
func processGroupLiveMembers(pgid int) ([]int, error) {
	listing, err := os.ReadDir("/proc")
	if err != nil {
		return nil, err
	}
	var members []int
	for _, item := range listing {
		if !item.Type().IsDir() {
			continue
		}
		candidate, convErr := strconv.Atoi(item.Name())
		if convErr != nil {
			continue
		}
		group, state, statErr := readProcStatProcessGroup(candidate)
		if statErr == nil && group == pgid && state != 'Z' {
			members = append(members, candidate)
		}
	}
	return members, nil
}
// readProcStatProcessGroup reads /proc/<pid>/stat and returns the process group
// ID and the one-byte state code. Parsing starts after the last ')' so that a
// command name containing spaces or parentheses cannot shift the fields: the
// state follows ") ", and the process group is the second whitespace-separated
// field after the state.
func readProcStatProcessGroup(pid int) (int, byte, error) {
	raw, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
	if err != nil {
		return 0, 0, err
	}
	stat := string(raw)
	closing := strings.LastIndexByte(stat, ')')
	if closing < 0 || closing+2 >= len(stat) {
		return 0, 0, fmt.Errorf("invalid /proc/%d/stat", pid)
	}
	state := stat[closing+2]
	rest := strings.Fields(stat[closing+3:])
	if len(rest) < 2 {
		return 0, 0, fmt.Errorf("invalid /proc/%d/stat", pid)
	}
	// rest[0] is the parent PID, rest[1] the process group.
	group, err := strconv.Atoi(rest[1])
	if err != nil {
		return 0, 0, err
	}
	return group, state, nil
}
// reapProcessGroupChildren collects, without blocking, every wait status that
// is currently available for children in this process group. It stops as soon
// as wait4 reports nothing more to collect or fails with anything but EINTR.
func (p *nativeProcess) reapProcessGroupChildren() {
	for {
		var (
			status syscall.WaitStatus
			usage  syscall.Rusage
		)
		reaped, err := syscall.Wait4(-p.pgid, &status, waitTraceOptions|syscall.WNOHANG, &usage)
		switch {
		case err == syscall.EINTR:
			continue
		case err != nil, reaped <= 0:
			return
		}
		if p.tracer != nil {
			p.tracer.debugf("reaped leftover process-group child pid=%d status=%#x", reaped, int(status))
		}
	}
}
// interruptThenKillAfter sends an interrupt, sleeps for the whole grace period
// and then kills the process unless it has been marked as exited meanwhile.
// Errors from both signals are deliberately ignored (best effort).
func (p *nativeProcess) interruptThenKillAfter(grace time.Duration) {
	_ = p.Signal(os.Interrupt)
	time.Sleep(grace)
	if p.isExited() {
		return
	}
	_ = p.Kill()
}
// isExited returns the exited flag, read under the mutex.
func (p *nativeProcess) isExited() bool {
	p.mu.Lock()
	finished := p.exited
	p.mu.Unlock()
	return finished
}
// closeChildPipes closes every pipe end that was handed to the child and
// forgets them. Close errors are ignored.
func (p *nativeProcess) closeChildPipes() {
	pending := p.childClose
	for i := range pending {
		_ = pending[i].Close()
	}
	p.childClose = nil
}
// closeParentInputPipes closes the parent's write end of the stdin pipe, if one
// was created. The close error is ignored.
func (p *nativeProcess) closeParentInputPipes() {
	if p.stdinW == nil {
		return
	}
	_ = p.stdinW.Close()
}
// closeParentOutputPipes closes the parent's read ends of the stdout and stderr
// pipes, skipping whichever was never created. Close errors are ignored.
func (p *nativeProcess) closeParentOutputPipes() {
	for _, reader := range []*os.File{p.stdoutR, p.stderrR} {
		if reader == nil {
			continue
		}
		_ = reader.Close()
	}
}
// closeAllPipes closes both the child-side and the parent-side pipe ends and
// clears both lists. Close errors are ignored.
func (p *nativeProcess) closeAllPipes() {
	p.closeChildPipes()
	own := p.parentClose
	for i := range own {
		_ = own[i].Close()
	}
	p.parentClose = nil
}
// Kill sends SIGKILL to the whole process group when a group ID is known and
// otherwise kills just the root process. It returns errProcessExited when no
// process handle exists.
func (p *nativeProcess) Kill() error {
	switch {
	case p.process == nil:
		return errProcessExited
	case p.pgid > 0:
		return syscall.Kill(-p.pgid, syscall.SIGKILL)
	default:
		return p.process.Kill()
	}
}
// Signal delivers sig to the whole process group when a group ID is known and
// otherwise to the root process only. It returns errProcessExited when no
// process handle exists, and an error for signals that are not syscall.Signal.
func (p *nativeProcess) Signal(sig os.Signal) error {
	if p.process == nil {
		return errProcessExited
	}
	native, isNative := sig.(syscall.Signal)
	switch {
	case !isNative:
		return fmt.Errorf("unsupported signal %v", sig)
	case p.pgid > 0:
		return syscall.Kill(-p.pgid, native)
	default:
		return p.process.Signal(sig)
	}
}
// Wait blocks until the tracer goroutine publishes its final result and returns
// it. The result is delivered to exactly one caller; every other call, whether
// it happens later or concurrently, returns errProcessExited.
//
// The done channel is read under the mutex and closed once its single value has
// been consumed. Without that, a second goroutine already blocked on the
// channel (for example ExitCode racing a direct Wait) would wait forever for a
// value that is never sent again.
func (p *nativeProcess) Wait() error {
	p.mu.Lock()
	done := p.done
	p.mu.Unlock()
	if done == nil {
		return errProcessExited
	}
	err, ok := <-done
	if !ok {
		// Another Wait call already consumed the result and closed the channel.
		return errProcessExited
	}
	p.mu.Lock()
	p.done = nil
	p.mu.Unlock()
	// Only the receiver of the single value reaches this point, so the channel
	// is closed exactly once and after the only send has completed.
	close(done)
	return err
}
// ExitCode returns the recorded exit code. If the process has not been marked
// as exited yet it waits for completion first; a Wait error is returned with
// code -1.
func (p *nativeProcess) ExitCode() (int, error) {
	p.mu.Lock()
	finished, code := p.exited, p.exitCode
	p.mu.Unlock()
	if finished {
		return code, nil
	}
	if err := p.Wait(); err != nil {
		return -1, err
	}
	p.mu.Lock()
	code = p.exitCode
	p.mu.Unlock()
	return code, nil
}
// tracer holds the state of one ptrace session: the root command plus every
// task adopted from it.
type tracer struct {
pm *pathMapper
// rootPID is the PID handed to loop; rootCwd is the cleaned guest working
// directory the root tracee starts in.
rootPID int
rootCwd string
// uid and gid seed the fake credentials of tracees that have no parent state.
uid uint32
gid uint32
// debug enables debugf output; set when PROOT_GO_DEBUG=1.
debug bool
// tracees maps PIDs of tasks currently being traced to their state.
// pendingTracees holds state for children announced by a fork/vfork/clone
// event that have not yet reported a stop of their own.
tracees map[int]*traceeState
pendingTracees map[int]*traceeState
// rootGone / rootGoneSince record that (and when) the root tracee was lost;
// the wait loop then polls with WNOHANG.
rootGone bool
rootGoneSince time.Time
// lostTracee / lostTraceeSince record that (and when) a resume failed because
// the tracee had disappeared.
lostTracee bool
lostTraceeSince time.Time
// exitCode is meaningful only when exitCodeSet is true.
exitCode int
exitCodeSet bool
}
// traceeState is the per-task bookkeeping kept for every traced PID.
type traceeState struct {
pid int
// parentPID is the PID of the tracee this task inherited state from, or 0.
parentPID int
// inSyscall tracks the syscall entry/exit phase of the task.
inSyscall bool
// optionsSet is true once the ptrace options were applied to this task.
optionsSet bool
// cwd is the task's guest working directory.
cwd string
// NOTE(review): scratch is not used in the visible part of the file; the name
// suggests a tracee-memory scratch address — confirm.
scratch uint64
// pending describes work to finish when the current syscall completes.
pending *pendingExit
// creds are the emulated credentials reported to the task.
creds fakeCreds
// Keep the unmodified syscall-entry registers until the corresponding
// exit/SIGSYS. On ARM/ARM64 the first argument and return value share r0/x0,
// so an emulated exit can otherwise destroy the argument needed by the
// outer-seccomp handler.
originalRegs syscall.PtraceRegs
originalSysno uint64
originalValid bool
}
// pendingExit records what has to be done for a syscall once it completes.
// kind selects the action; the visible code uses "errno" (size carries the
// errno value) and "hardlink" (sourcePath, targetPath and size are passed to
// the hard-link fallback).
type pendingExit struct {
kind string
guestPath string
sourcePath string
targetPath string
// NOTE(review): buf, size and fd look like saved syscall arguments; their
// exact meaning depends on kind and is not fully visible here.
buf uint64
size uint64
fd uint64
}
// fakeCreds is the emulated credential set of a tracee: real, effective and
// saved user/group IDs, the filesystem IDs, and the supplementary group list.
type fakeCreds struct {
ruid, euid, suid uint32
rgid, egid, sgid uint32
fsuid, fsgid uint32
// groups is the supplementary group list; cloneFakeCreds copies it so tracees
// do not share the backing array.
groups []uint32
}
// newFakeCreds returns credentials in which every user ID equals uid, every
// group ID equals gid, and gid is the only supplementary group.
func newFakeCreds(uid, gid uint32) fakeCreds {
	creds := fakeCreds{groups: []uint32{gid}}
	creds.ruid, creds.euid, creds.suid, creds.fsuid = uid, uid, uid, uid
	creds.rgid, creds.egid, creds.sgid, creds.fsgid = gid, gid, gid, gid
	return creds
}
// cloneFakeCreds returns a copy of c whose supplementary group list does not
// share storage with the original. A nil group list stays nil.
func cloneFakeCreds(c fakeCreds) fakeCreds {
	dup := c
	if c.groups == nil {
		return dup
	}
	dup.groups = append([]uint32(nil), c.groups...)
	return dup
}
// noCredID reports whether v is the all-ones value (uint32 -1), which the
// credential checks accept as a "leave this ID unchanged" placeholder.
func noCredID(v uint32) bool {
	return ^v == 0
}
// uidPrivileged reports whether the emulated effective user is root, which
// lets the tracee switch to any user ID.
func (c fakeCreds) uidPrivileged() bool {
	const rootUID = 0
	return c.euid == rootUID
}
// gidPrivileged reports whether the tracee may switch to any group ID.
// NOTE(review): this checks the effective *user* ID, not egid — presumably
// because group-changing privilege belongs to root; confirm this is intended.
func (c fakeCreds) gidPrivileged() bool {
	const rootUID = 0
	return c.euid == rootUID
}
// canUseUID reports whether the tracee may set a user ID to v: always for the
// "unchanged" placeholder or a privileged tracee, otherwise only when v equals
// the current real, effective or saved user ID.
func (c fakeCreds) canUseUID(v uint32) bool {
	switch {
	case noCredID(v), c.uidPrivileged():
		return true
	case v == c.ruid, v == c.euid, v == c.suid:
		return true
	}
	return false
}
// canUseGID reports whether the tracee may set a group ID to v: always for the
// "unchanged" placeholder or a privileged tracee, otherwise only when v equals
// the current real, effective or saved group ID.
func (c fakeCreds) canUseGID(v uint32) bool {
	switch {
	case noCredID(v), c.gidPrivileged():
		return true
	case v == c.rgid, v == c.egid, v == c.sgid:
		return true
	}
	return false
}
// canSetUID reports whether every value in vals passes canUseUID. An empty
// list is allowed.
func (c fakeCreds) canSetUID(vals ...uint32) bool {
	for i := range vals {
		if allowed := c.canUseUID(vals[i]); !allowed {
			return false
		}
	}
	return true
}
// canSetGID reports whether every value in vals passes canUseGID. An empty
// list is allowed.
func (c fakeCreds) canSetGID(vals ...uint32) bool {
	for i := range vals {
		if allowed := c.canUseGID(vals[i]); !allowed {
			return false
		}
	}
	return true
}
// fakeErrno swaps the tracee's syscall number for getpid and records a pending
// "errno" action carrying errno in its size field. It returns the error from
// writing the modified registers back to the tracee.
func (t *tracer) fakeErrno(st *traceeState, regs *syscall.PtraceRegs, errno syscall.Errno) error {
	st.pending = &pendingExit{
		kind: "errno",
		size: uint64(errno),
	}
	setSysno(regs, sc.getpid)
	return setRegs(st.pid, regs)
}
// newTracer creates a tracer with empty tracee tables and no exit code yet
// (exitCode -1). cwd is cleaned and stored as the root's guest working
// directory. Debug output is on when PROOT_GO_DEBUG is exactly "1".
func newTracer(pm *pathMapper, cwd string, uid, gid uint32) *tracer {
	t := new(tracer)
	t.pm = pm
	t.rootCwd = cleanGuestPath(cwd)
	t.uid, t.gid = uid, gid
	t.debug = os.Getenv("PROOT_GO_DEBUG") == "1"
	t.tracees = make(map[int]*traceeState)
	t.pendingTracees = make(map[int]*traceeState)
	t.exitCode = -1
	return t
}
// debugf writes one "proot-go: "-prefixed line to stderr when debugging is
// enabled and does nothing otherwise.
func (t *tracer) debugf(format string, args ...any) {
	if !t.debug {
		return
	}
	fmt.Fprintf(os.Stderr, "proot-go: "+format+"\n", args...)
}
// addTracee registers pid as an active tracee and drops any pending entry for
// it. With a parent, the new state takes the parent's PID, cwd and a copy of
// its credentials; without one it uses the cleaned cwd and fresh credentials
// built from the tracer's uid/gid.
func (t *tracer) addTracee(pid int, parent *traceeState, cwd string) *traceeState {
	state := &traceeState{
		pid:   pid,
		cwd:   cleanGuestPath(cwd),
		creds: newFakeCreds(t.uid, t.gid),
	}
	if parent != nil {
		state.parentPID, state.cwd = parent.pid, parent.cwd
		state.creds = cloneFakeCreds(parent.creds)
	}
	delete(t.pendingTracees, pid)
	t.tracees[pid] = state
	return state
}
// queueTracee records that parent created task pid. Non-positive PIDs are
// ignored. If the task is already active or already pending, the parent's
// state is merged into the existing entry; otherwise a new pending entry with
// cwd "/" is created and then given the parent's state.
func (t *tracer) queueTracee(pid int, parent *traceeState) {
	if pid <= 0 {
		return
	}
	// A newly forked task can report its initial ptrace stop before the
	// parent's PTRACE_EVENT_FORK/CLONE is consumed. In that ordering the task
	// has already been adopted with the conservative cwd "/". Merge the
	// parent state when the event finally arrives instead of returning early;
	// otherwise only some members of a shell pipeline inherit chdir(), e.g.
	// `cd /usr/share/ca-certificates; find . | sort`.
	merge := func(child *traceeState) {
		if parent != nil {
			child.parentPID = parent.pid
			child.cwd = parent.cwd
			child.creds = cloneFakeCreds(parent.creds)
		}
	}

	if active, found := t.tracees[pid]; found {
		merge(active)
		t.debugf("pid=%d merged late parent pid=%d cwd=%q", pid, active.parentPID, active.cwd)
		return
	}
	if queued, found := t.pendingTracees[pid]; found {
		merge(queued)
		return
	}

	fresh := &traceeState{pid: pid, cwd: "/", creds: newFakeCreds(t.uid, t.gid)}
	merge(fresh)
	t.pendingTracees[pid] = fresh
}
// adoptTracee makes pid an active tracee. A pending entry is promoted as it
// is; an unknown PID gets a fresh state with no parent and cwd "/".
func (t *tracer) adoptTracee(pid int) *traceeState {
	queued := t.pendingTracees[pid]
	if queued == nil {
		return t.addTracee(pid, nil, "/")
	}
	delete(t.pendingTracees, pid)
	t.tracees[pid] = queued
	return queued
}
// loop is the tracer's main event loop. It waits for the root tracee's first
// stop, applies the ptrace options, resumes it and then dispatches every wait
// status (exit, ptrace event, syscall stop, SIGSYS, ordinary signal) until no
// tracee remains.
//
// It returns nil when the recorded exit code is 0 or was never set, an
// "exit status N" error for a non-zero exit code, and the underlying error
// when the initial wait, the initial option setup or a later wait4 fails.
func (t *tracer) loop(rootPID int) error {
t.rootPID = rootPID
var ws syscall.WaitStatus
var ru syscall.Rusage
// Wait for the root's first stop before touching it with ptrace.
pid, err := syscall.Wait4(rootPID, &ws, waitTraceOptions, &ru)
if err != nil {
return err
}
root := t.addTracee(pid, nil, t.rootCwd)
// The root may already be dead before it ever stopped.
if ws.Exited() || ws.Signaled() {
t.setExit(ws)
return t.waitErr(ws)
}
if err := setPtraceOptionsRetry(pid, true); err != nil {
t.debugf("pid=%d initial ptrace options failed: %v", pid, err)
_ = syscall.Kill(pid, syscall.SIGKILL)
_ = syscall.PtraceCont(pid, int(syscall.SIGKILL))
return err
}
root.optionsSet = true
if !t.resumeTracee(pid, 0) {
return syscall.ESRCH
}
_ = root
for len(t.tracees) > 0 {
// Block in wait4 normally; poll once the root or any tracee was lost so the
// loop can prune and time out instead of hanging.
waitOptions := waitTraceOptions
if t.rootGone || t.lostTracee {
waitOptions |= syscall.WNOHANG
}
pid, err = syscall.Wait4(-1, &ws, waitOptions, &ru)
if err == syscall.EINTR {
continue
}
// pid == 0 means WNOHANG found nothing to report.
if pid == 0 {
if t.rootGone {
t.pruneDeadTracees()
if len(t.tracees) == 0 {
if !t.exitCodeSet {
t.exitCodeSet = true
t.exitCode = 0
}
break
}
if time.Since(t.rootGoneSince) > 2*time.Second {
t.debugf("root tracee gone; detaching remaining tracees after idle wait: %v", t.traceePids())
t.detachRemainingTracees()
if !t.exitCodeSet {
t.exitCodeSet = true
t.exitCode = 0
}
break
}
time.Sleep(20 * time.Millisecond)
continue
}
if t.lostTracee {
t.pruneDeadTracees()
if time.Since(t.lostTraceeSince) > 500*time.Millisecond {
t.debugf("idle after lost tracee; waking possible waiters: %v", t.traceePids())
t.signalTracees(syscall.SIGCHLD)
t.lostTraceeSince = time.Now()
}
time.Sleep(20 * time.Millisecond)
continue
}
continue
}
if err != nil {
// ECHILD (no children left) ends the loop normally.
if len(t.tracees) == 0 || err == syscall.ECHILD {
break
}
return err
}
// A status for an unknown PID belongs to a task that stopped before its
// parent's fork/clone event was seen; adopt it.
st := t.tracees[pid]
if st == nil {
st = t.adoptTracee(pid)
}
if ws.Exited() || ws.Signaled() {
// Only the root's status becomes the command's exit code.
if pid == rootPID {
t.setExit(ws)
}
delete(t.tracees, pid)
continue
}
if !ws.Stopped() {
if !t.resumeTracee(pid, 0) {
t.markRootGone(pid, rootPID)
}
continue
}
if !st.optionsSet {
if err := setPtraceOptionsRetry(pid, pid == rootPID); err == nil {
st.optionsSet = true
} else {
t.debugf("pid=%d ptrace options failed: %v", pid, err)
if isProcessGoneErr(err) {
t.markRootGone(pid, rootPID)
continue
}
}
}
sig := ws.StopSignal()
event := ptraceEvent(ws)
// SIGTRAP with a non-zero event code is a PTRACE_EVENT_* stop.
if sig == syscall.SIGTRAP && event != 0 {
t.handleEvent(pid, st, event)
if !t.resumeTracee(pid, 0) {
t.markRootGone(pid, rootPID)
}
continue
}
if isSyscallStop(ws) {
if err := t.handleSyscall(st); err != nil {
if isProcessGoneErr(err) {
// Some kernels/reporting paths can return ESRCH from PTRACE_PEEKDATA
// while a tracee is in the syscall-stop we are handling. Do not
// treat that as a fatal translation error by itself: first try to
// resume the original syscall so the real parent/children can finish
// normally. This is important for apt/dpkg where the root apt
// process can briefly report ESRCH while dpkg helpers are still
// unpacking packages.
if pid == rootPID {
t.debugf("pid=%d root syscall translation saw process-gone error, trying resume: %v", pid, err)
} else {
t.debugf("pid=%d syscall translation saw process-gone error, trying resume: %v", pid, err)
}
if t.resumeTracee(pid, 0) {
continue
}
t.markRootGone(pid, rootPID)
continue
}
// Do not kill the tracee for a best-effort translation failure;
// continue and let the kernel report the original error if possible.
t.debugf("pid=%d syscall translation failed: %v", pid, err)
}
if !t.resumeTracee(pid, 0) {
t.markRootGone(pid, rootPID)
}
continue
}
if sig == syscall.SIGSYS {
handled, err := t.handleSeccompSIGSYS(st)
if err != nil {
if isProcessGoneErr(err) {
t.markRootGone(pid, rootPID)
continue
}
t.debugf("pid=%d failed handling seccomp SIGSYS: %v", pid, err)
} else if handled {
// SECCOMP_RET_TRAP skips the syscall and reports SIGSYS instead of
// producing the normal syscall-exit stop. Suppress the signal and
// continue at the instruction following the syscall.
if !t.resumeTracee(pid, 0) {
t.markRootGone(pid, rootPID)
}
continue
}
}
// Forward real signals, but do not reinject ptrace's synthetic SIGTRAP/SIGSTOP.
forward := int(sig)
if sig == syscall.SIGTRAP || sig == syscall.SIGSTOP {
forward = 0
}
if !t.resumeTracee(pid, forward) {
t.markRootGone(pid, rootPID)
}
}
if t.exitCodeSet && t.exitCode != 0 {
return fmt.Errorf("exit status %d", t.exitCode)
}
return nil
}
// traceeGone logs that pid vanished during the step named by where, forgets the
// tracee, and runs the root-gone handling (which only acts when pid is the
// root).
func (t *tracer) traceeGone(pid, rootPID int, where string, err error) {
	const message = "pid=%d disappeared %s: %v"
	t.debugf(message, pid, where, err)
	delete(t.tracees, pid)
	t.markRootGone(pid, rootPID)
}
// markRootGone handles the loss of the root tracee; it does nothing when pid
// is not rootPID. If the root process still exists but can no longer be driven
// through ptrace, its process group is terminated first. The tracer is then
// switched into root-gone mode, the remaining tracees are detached, and a
// clean exit code is assumed unless a real status was already recorded.
func (t *tracer) markRootGone(pid, rootPID int) {
	if pid != rootPID {
		return
	}
	if processExists(pid) {
		t.terminateLostRoot(pid, rootPID)
	}
	// This flag is what makes the main wait loop switch to WNOHANG + pruning.
	// Without it, a process-gone root can leave the tracer blocked forever in
	// wait4(-1, __WALL) if one of the queued/adopted helper PIDs never delivers
	// another wait status. The log that ends with:
	//
	//	root syscall translation saw process-gone error, trying resume: no such process
	//	disappeared before resume
	//
	// hits exactly this path. The timestamp is only taken the first time so the
	// idle timeout in the wait loop is measured from the original loss.
	if !t.rootGone {
		t.rootGone = true
		t.rootGoneSince = time.Now()
	}
	// Once the root tracee has disappeared there is no command owner left to
	// wait for. Keeping the tracer attached to leftover helpers is unsafe here:
	// apt/dpkg can leave transient method/sqv/extracttemplate children that have
	// already notified their real parent, while the tracer no longer has a root
	// process capable of driving the tree forward. Detach the remaining tracees
	// immediately so Wait() can return instead of spinning in WNOHANG forever.
	delete(t.tracees, pid)
	if len(t.tracees) > 0 {
		t.debugf("root tracee gone; detaching remaining tracees immediately: %v", t.traceePids())
		t.detachRemainingTracees()
	}
	// If wait4 never delivered the root's final status, fall back to a clean exit.
	// Real non-zero exits/signals still win when they were observed via setExit().
	if !t.exitCodeSet {
		t.exitCodeSet = true
		t.exitCode = 0
	}
}
// pruneDeadTracees drops every tracee whose PID no longer exists, detected by
// a signal-0 probe failing with ESRCH. Other probe errors leave the entry
// alone.
func (t *tracer) pruneDeadTracees() {
	for pid := range t.tracees {
		probeErr := syscall.Kill(pid, 0)
		if !errors.Is(probeErr, syscall.ESRCH) {
			continue
		}
		t.debugf("pid=%d disappeared while pruning stale tracees", pid)
		delete(t.tracees, pid)
	}
}
// terminateLostRoot shuts down a root tracee that is still alive but no longer
// answers ptrace requests. It sends SIGINT to process group pgid, waits 500ms,
// and escalates to SIGKILL if the group is still there. When even the SIGKILL
// continue fails, the tracee is detached as a last resort.
func (t *tracer) terminateLostRoot(pid, pgid int) {
	t.debugf("root tracee pid=%d is alive but no longer ptrace-controllable; sending SIGINT to process group %d", pid, pgid)
	_ = syscall.Kill(-pgid, syscall.SIGINT)
	if contErr := syscall.PtraceCont(pid, int(syscall.SIGINT)); contErr != nil {
		t.debugf("pid=%d ptrace continue with SIGINT failed: %v", pid, contErr)
	}
	time.Sleep(500 * time.Millisecond)
	if stillAlive := processGroupExists(pgid); !stillAlive {
		return
	}

	t.debugf("root process group %d still alive after SIGINT; sending SIGKILL", pgid)
	_ = syscall.Kill(-pgid, syscall.SIGKILL)
	contErr := syscall.PtraceCont(pid, int(syscall.SIGKILL))
	if contErr == nil {
		return
	}
	t.debugf("pid=%d ptrace continue with SIGKILL failed: %v", pid, contErr)
	detachErr := syscall.PtraceDetach(pid)
	if detachErr != nil && !isProcessGoneErr(detachErr) {
		t.debugf("pid=%d ptrace detach after SIGKILL failed: %v", pid, detachErr)
	}
}
// traceePids returns the PIDs of all active tracees in map iteration order
// (unspecified). The result is never nil.
func (t *tracer) traceePids() []int {
	pids := make([]int, len(t.tracees))
	next := 0
	for pid := range t.tracees {
		pids[next] = pid
		next++
	}
	return pids
}
// detachRemainingTracees detaches from every active tracee and forgets it,
// whether or not the detach succeeded. Failures are only logged.
func (t *tracer) detachRemainingTracees() {
	for pid := range t.tracees {
		switch err := syscall.PtraceDetach(pid); {
		case err == nil:
			// Detached cleanly; nothing to report.
		case isProcessGoneErr(err):
			t.debugf("pid=%d disappeared before detach", pid)
		default:
			t.debugf("pid=%d ptrace detach failed: %v", pid, err)
		}
		delete(t.tracees, pid)
	}
}
// noteLostTracee records that a tracee was lost and restarts the idle timer
// used by the wait loop. The pid argument is currently unused.
func (t *tracer) noteLostTracee(pid int) {
	t.lostTracee = true
	t.lostTraceeSince = time.Now()
}
// signalTracees sends sig to every active tracee. Failures are logged unless
// they only mean the process is already gone.
func (t *tracer) signalTracees(sig syscall.Signal) {
	for pid := range t.tracees {
		err := syscall.Kill(pid, sig)
		if err == nil || isProcessGoneErr(err) {
			continue
		}
		t.debugf("pid=%d signal %s failed: %v", pid, sig, err)
	}
}
// signalTraceeParent sends sig to the recorded parent of pid, but only when
// pid is a known tracee with a parent that is itself still being traced.
// A delivery failure is logged unless the parent is simply gone.
func (t *tracer) signalTraceeParent(pid int, sig syscall.Signal) {
	child := t.tracees[pid]
	if child == nil || child.parentPID == 0 {
		return
	}
	parentPID := child.parentPID
	if _, traced := t.tracees[parentPID]; !traced {
		return
	}
	err := syscall.Kill(parentPID, sig)
	switch {
	case err == nil:
		t.debugf("pid=%d disappeared; sent %s to parent pid=%d", pid, sig, parentPID)
	case !isProcessGoneErr(err):
		t.debugf("pid=%d signal parent %d with %s failed: %v", pid, parentPID, sig, err)
	}
}
// reapGoneTracee drains, without blocking, a wait status that may still be
// queued for pid after a resume failed with ESRCH. A final status removes the
// tracee; a stop is released with a resume so the task does not stay frozen.
func (t *tracer) reapGoneTracee(pid int) {
	// A ptrace resume can return ESRCH after a tracee has already reached its
	// final wait state. In that case, the real parent can remain blocked in
	// waitpid() until the tracer consumes that pending ptrace status. Drain any
	// immediately available status for this pid before forgetting it.
	for {
		var (
			status syscall.WaitStatus
			usage  syscall.Rusage
		)
		got, err := syscall.Wait4(pid, &status, waitTraceOptions|syscall.WNOHANG, &usage)
		switch {
		case got == 0:
			// Nothing queued.
			return
		case err != nil:
			if err != syscall.ECHILD && err != syscall.EINTR {
				t.debugf("pid=%d reap after ESRCH failed: %v", pid, err)
			}
			return
		case got != pid:
			return
		case status.Exited() || status.Signaled():
			t.debugf("pid=%d reaped after ESRCH: status=%#x", pid, int(status))
			delete(t.tracees, pid)
			return
		case status.Stopped():
			// If it was merely stopped, release it; otherwise the real parent may wait
			// forever for a helper that the tracer no longer intends to manage.
			inject := 0
			if stop := status.StopSignal(); stop != syscall.SIGTRAP && stop != syscall.SIGSTOP {
				inject = int(stop)
			}
			if resumeErr := syscall.PtraceSyscall(pid, inject); resumeErr != nil && !isProcessGoneErr(resumeErr) {
				t.debugf("pid=%d resume while reaping after ESRCH failed: %v", pid, resumeErr)
			}
			return
		}
		// Any other status (for example "continued") is skipped; poll again.
	}
}
// setPtraceOptions applies the tracer's ptrace options to pid: syscall-stop
// marking plus fork, vfork, clone and exec events. traceExit additionally
// requests the exit event.
func setPtraceOptions(pid int, traceExit bool) error {
	const baseOptions = syscall.PTRACE_O_TRACESYSGOOD |
		syscall.PTRACE_O_TRACEFORK |
		syscall.PTRACE_O_TRACEVFORK |
		syscall.PTRACE_O_TRACECLONE |
		syscall.PTRACE_O_TRACEEXEC
	// Do not request PTRACE_O_TRACEEXIT for short-lived helper children. The
	// extra synthetic exit-stop can race with helpers such as `stty -a` spawned
	// by debconf: PTRACE_PEEKDATA/PTRACE_SYSCALL may report ESRCH before the
	// real parent observes completion, leaving apt/dpkg waiting for a child that
	// the tracer effectively consumed. The root tracee is different: enabling
	// TRACEEXIT only for it gives the tracer one last deterministic stop before
	// apt itself exits, avoiding the root "process-gone" path that can otherwise
	// make us terminate before reporting the real wait status.
	if !traceExit {
		return syscall.PtraceSetOptions(pid, baseOptions)
	}
	return syscall.PtraceSetOptions(pid, baseOptions|syscall.PTRACE_O_TRACEEXIT)
}
// setPtraceOptionsRetry calls setPtraceOptions up to 20 times. EINTR retries
// immediately; a process-gone error retries after 5ms as long as the process
// still exists; every other failure is returned at once. After the last
// attempt the most recent error is returned.
func setPtraceOptionsRetry(pid int, traceExit bool) error {
	const maxAttempts = 20
	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		lastErr = setPtraceOptions(pid, traceExit)
		switch {
		case lastErr == nil:
			return nil
		case errors.Is(lastErr, syscall.EINTR):
			continue
		case !isProcessGoneErr(lastErr), !processExists(pid):
			return lastErr
		}
		// Android kernels can briefly report ESRCH while the newly exec'd
		// tracee is transitioning into its first ptrace stop. The task still
		// exists and remains stopped, so retry the option write on the same
		// tracee instead of abandoning it.
		time.Sleep(5 * time.Millisecond)
	}
	return lastErr
}
// isSyscallStop reports whether ws is a stop whose signal is SIGTRAP with bit
// 0x80 set, which is how syscall stops are marked under PTRACE_O_TRACESYSGOOD.
func isSyscallStop(ws syscall.WaitStatus) bool {
	const syscallTrap = syscall.SIGTRAP | 0x80
	if !ws.Stopped() {
		return false
	}
	return ws.StopSignal() == syscallTrap
}
// ptraceEvent returns the bits of ws above bit 15, where an event stop carries
// its PTRACE_EVENT_* code. It is 0 when no event is encoded.
func ptraceEvent(ws syscall.WaitStatus) int {
	raw := uint32(ws)
	return int(raw >> 16)
}
// handleEvent reacts to a PTRACE_EVENT_* stop of pid. Fork, vfork and clone
// events queue the new child with st as its parent; exit and exec events need
// no bookkeeping and are intentionally left untouched.
func (t *tracer) handleEvent(pid int, st *traceeState, event int) {
	switch event {
	case syscall.PTRACE_EVENT_FORK, syscall.PTRACE_EVENT_VFORK, syscall.PTRACE_EVENT_CLONE:
		// The event message carries the PID of the newly created task.
		childPID, err := syscall.PtraceGetEventMsg(pid)
		if err != nil || childPID == 0 {
			return
		}
		t.queueTracee(int(childPID), st)
	case syscall.PTRACE_EVENT_EXIT:
		// Root-only TRACEEXIT is used as a deterministic pre-exit stop. Do not
		// set the final status here; resume and let the normal wait status report
		// the real exit code/signal.
	case syscall.PTRACE_EVENT_EXEC:
		// Keep the syscall entry/exit phase unchanged. With PTRACE_SYSCALL,
		// Linux may report PTRACE_EVENT_EXEC between execve-enter and
		// execve-exit. Resetting inSyscall here makes the following
		// execve-exit stop look like a new syscall-enter stop, which then
		// flips the phase for the freshly loaded dynamic linker. Once the
		// phase is inverted, the loader's openat() calls for the executable
		// itself are missed, producing errors such as:
		//
		//	/usr/lib/apt/methods/http: error while loading shared libraries:
		//	/usr/lib/apt/methods/http: cannot open shared object file
		//
		// The normal syscall-exit stop will clear st.inSyscall.
	}
}
// setExit records the command's exit code from ws: the exit status for a
// normal exit, 128 plus the signal number for death by signal, and -1 for any
// other status.
func (t *tracer) setExit(ws syscall.WaitStatus) {
	code := -1
	switch {
	case ws.Exited():
		code = ws.ExitStatus()
	case ws.Signaled():
		code = 128 + int(ws.Signal())
	}
	t.exitCode = code
	t.exitCodeSet = true
}
// waitErr converts ws into an error: nil for exit status 0, "exit status N"
// for a non-zero exit, "process killed by SIGNAL" for death by signal, and nil
// for any other status.
func (t *tracer) waitErr(ws syscall.WaitStatus) error {
	switch {
	case ws.Exited():
		if code := ws.ExitStatus(); code != 0 {
			return fmt.Errorf("exit status %d", code)
		}
		return nil
	case ws.Signaled():
		return fmt.Errorf("process killed by %s", ws.Signal())
	}
	return nil
}
// isProcessGoneErr reports whether err means the target process does not
// exist: either it wraps ESRCH or its message contains "no such process".
// A nil error is never a process-gone error.
func isProcessGoneErr(err error) bool {
	switch {
	case err == nil:
		return false
	case errors.Is(err, syscall.ESRCH):
		return true
	}
	return strings.Contains(err.Error(), "no such process")
}
// processExists probes pid with signal 0. It reports true when the probe
// succeeds or is refused with EPERM (the process exists but may not be
// signalled), and false for every other error.
func processExists(pid int) bool {
	probeErr := syscall.Kill(pid, 0)
	if probeErr == nil {
		return true
	}
	return errors.Is(probeErr, syscall.EPERM)
}
// resumeTracee restarts pid until its next syscall stop, injecting sig (0 for
// none). It returns true when the tracee is running under the tracer again.
// On a process-gone error it first attempts a re-attach recovery; if that
// fails it drains the pending status, nudges the parent with SIGCHLD and notes
// the loss. On every failure the tracee is forgotten and false is returned.
func (t *tracer) resumeTracee(pid int, sig int) bool {
	err := syscall.PtraceSyscall(pid, sig)
	if err == nil {
		return true
	}
	if !isProcessGoneErr(err) {
		t.debugf("pid=%d ptrace resume failed: %v", pid, err)
		delete(t.tracees, pid)
		return false
	}
	if t.recoverTracee(pid, sig) {
		return true
	}
	t.debugf("pid=%d disappeared before resume", pid)
	t.reapGoneTracee(pid)
	t.signalTraceeParent(pid, syscall.SIGCHLD)
	t.noteLostTracee(pid)
	delete(t.tracees, pid)
	return false
}
// recoverTracee tries to regain control of a tracee whose resume failed with a
// process-gone error although the process still exists. It re-attaches, waits
// for the resulting stop, resets the per-tracee syscall phase, re-applies the
// ptrace options and resumes the tracee with sig. It returns true only when
// every step succeeded; on a failure after the attach the tracee is detached
// again.
func (t *tracer) recoverTracee(pid int, sig int) bool {
// Nothing to recover when the process is really gone.
if !processExists(pid) {
return false
}
t.debugf("pid=%d still exists after ptrace ESRCH; trying PTRACE_ATTACH recovery", pid)
if err := syscall.PtraceAttach(pid); err != nil {
t.debugf("pid=%d ptrace attach recovery failed: %v", pid, err)
return false
}
var ws syscall.WaitStatus
var ru syscall.Rusage
// Wait (blocking) for the stop that follows the attach, retrying on EINTR.
for {
r, err := syscall.Wait4(pid, &ws, waitTraceOptions, &ru)
if err == syscall.EINTR {
continue
}
if err != nil {
t.debugf("pid=%d wait after ptrace attach recovery failed: %v", pid, err)
_ = syscall.PtraceDetach(pid)
return false
}
if r == pid {
break
}
}
if !ws.Stopped() {
t.debugf("pid=%d was not stopped after ptrace attach recovery: status=%#x", pid, int(ws))
_ = syscall.PtraceDetach(pid)
return false
}
st := t.tracees[pid]
if st == nil {
st = t.adoptTracee(pid)
}
// Start from a clean phase: any half-processed syscall state is discarded.
st.inSyscall = false
st.pending = nil
if err := setPtraceOptions(pid, pid == t.rootPID); err != nil {
t.debugf("pid=%d ptrace option recovery failed: %v", pid, err)
_ = syscall.PtraceDetach(pid)
return false
}
st.optionsSet = true
if err := syscall.PtraceSyscall(pid, sig); err != nil {
t.debugf("pid=%d ptrace resume after recovery failed: %v", pid, err)
_ = syscall.PtraceDetach(pid)
return false
}
t.debugf("pid=%d ptrace recovery succeeded", pid)
return true
}
// isLocallyEmulatedSyscall reports whether onSyscallEnter replaces the
// syscall with a harmless host call and synthesizes its guest-visible result.
// An outer Android seccomp filter may still report SIGSYS for the original
// syscall after that synthetic exit has already been processed. In that case
// the result must be preserved instead of being overwritten with ENOSYS.
func isLocallyEmulatedSyscall(nr uint64) bool {
switch nr {
case sc.chown, sc.lchown, sc.fchown, sc.fchownat,
sc.getuid, sc.geteuid, sc.getgid, sc.getegid,
sc.getresuid, sc.getresgid, sc.getgroups,
sc.setuid, sc.setgid, sc.setreuid, sc.setregid,
sc.setresuid, sc.setresgid, sc.setgroups,
sc.setfsuid, sc.setfsgid, sc.chroot:
return nr != noSys
default:
return false
}
}
// handleSeccompSIGSYS examines a SIGSYS stop of the tracee st. It returns
// (false, nil) when the signal's si_code is not sysSeccompCode, and
// (true, nil) once a seccomp-generated SIGSYS has been dealt with by one of
// three strategies, tried in order:
//
//  1. a pending "hardlink" operation for the trapped syscall is completed
//     with emulateHardLinkFallback and its result stored as return value;
//  2. a syscall that is emulated locally keeps its emulated result (running
//     the pending exit handler first if it has not run yet);
//  3. everything else gets ENOSYS as its return value.
//
// In all handled cases the per-tracee syscall state (inSyscall, pending,
// originalValid) is reset. Errors from reading the siginfo or from
// reading/writing registers are returned with a false result.
func (t *tracer) handleSeccompSIGSYS(st *traceeState) (bool, error) {
info, err := ptraceGetSeccompSiginfo(st.pid)
if err != nil {
return false, err
}
if info.Code != sysSeccompCode {
t.debugf("pid=%d received non-seccomp SIGSYS: si_code=%d", st.pid, info.Code)
return false, nil
}
regs, err := getRegs(st.pid)
if err != nil {
return false, err
}
// The siginfo syscall number is narrowed to 32 bits before being widened,
// so a negative value does not sign-extend.
trappedSysno := uint64(uint32(info.Syscall))
// logRegs is only used for the diagnostic message below.
logRegs := regs
if st.originalValid && st.originalSysno == trappedSysno {
// This is the Go equivalent of PRoot's ORIGINAL register snapshot.
// In particular, it retains x0/r0 before a synthesized sysexit writes
// the return value into the same register.
logRegs = &st.originalRegs
}
name := "syscall"
if trappedSysno == sc.setRobust {
name = "set_robust_list"
}
// Some Android policies trap link/linkat instead of returning an errno.
// The translated host paths were retained at syscall-entry, so perform the
// same backup-copy fallback here and suppress the SIGSYS.
if st.originalValid && st.originalSysno == trappedSysno && st.pending != nil && st.pending.kind == "hardlink" {
p := st.pending
if err := emulateHardLinkFallback(p.sourcePath, p.targetPath, p.size); err != nil {
setRetval(regs, -int64(errnoFromError(err)))
t.debugf("pid=%d seccomp hardlink fallback failed source=%q target=%q: %v", st.pid, p.sourcePath, p.targetPath, err)
} else {
setRetval(regs, 0)
t.debugf("pid=%d seccomp hardlink fallback copied source=%q target=%q", st.pid, p.sourcePath, p.targetPath)
}
st.inSyscall = false
st.pending = nil
st.originalValid = false
if err := setRegs(st.pid, regs); err != nil {
return false, err
}
return true, nil
}
// When the syscall was already emulated by onSyscallEnter, the ordinary
// syscall-exit handler may have run before this outer-seccomp SIGSYS. This
// is the ordering handled by termux/proot's synthesized-sysexit path. Do not
// replace the emulated success/error with ENOSYS. If the exit handler has
// not run yet, run it now from the saved pending operation.
if st.originalValid && st.originalSysno == trappedSysno && isLocallyEmulatedSyscall(trappedSysno) {
if st.pending != nil {
st.inSyscall = false
if err := t.onSyscallExit(st, regs); err != nil {
return false, err
}
t.debugf("pid=%d synthesized syscall exit for emulated syscall=%d before suppressing SIGSYS", st.pid, trappedSysno)
} else {
t.debugf("pid=%d preserving completed emulation for syscall=%d while suppressing SIGSYS", st.pid, trappedSysno)
}
st.inSyscall = false
st.pending = nil
st.originalValid = false
return true, nil
}
t.debugf("pid=%d handling seccomp SIGSYS for %s=%d as ENOSYS (inSyscall=%t args=%d,%d,%d,%d,%d,%d)",
st.pid, name, trappedSysno, st.inSyscall,
arg(logRegs, 0), arg(logRegs, 1), arg(logRegs, 2),
arg(logRegs, 3), arg(logRegs, 4), arg(logRegs, 5))
// The blocked syscall has no ordinary exit stop to process. Reset the
// entry/exit state before resuming, discard any result rewrite associated
// with that call, and expose the same ENOSYS fallback used by PRoot for
// set_robust_list and unsupported outer-seccomp calls such as rseq.
st.inSyscall = false
st.pending = nil
st.originalValid = false
setRetval(regs, -int64(syscall.ENOSYS))
if err := setRegs(st.pid, regs); err != nil {
return false, err
}
return true, nil
}
// handleSyscall processes one syscall stop of st. Stops alternate between
// syscall entry and syscall exit; st.inSyscall records which one is expected
// next and the stop is forwarded to onSyscallEnter or onSyscallExit.
func (t *tracer) handleSyscall(st *traceeState) error {
	regs, err := getRegs(st.pid)
	if err != nil {
		return err
	}

	if st.inSyscall {
		// Exit stop. The ORIGINAL snapshot is deliberately left in place until
		// the next entry: some Android kernels deliver the outer-seccomp SIGSYS
		// after the synthesized syscall-exit stop, and the SIGSYS handler still
		// needs the saved x0/r0 at that point.
		st.inSyscall = false
		return t.onSyscallExit(st, regs)
	}

	// Entry stop: snapshot the untouched registers and place the scratch
	// cursor a safety gap below the tracee's stack pointer.
	st.inSyscall = true
	st.originalValid = true
	st.originalSysno = sysno(regs)
	st.originalRegs = *regs
	st.scratch = stackPtr(regs) - stackScratchGap
	return t.onSyscallEnter(st, regs)
}
// onSyscallEnter is the syscall-entry hook for the tracee st. Depending on the
// syscall number found in regs it does one of the following:
//
//   - rewrites path arguments from guest paths to host paths (open*, stat*,
//     rename*, link*, symlink*, unlink*, exec*, ...);
//   - records a pendingExit in st.pending so that onSyscallExit can post-process
//     or replace the result (readlink, chdir, fchdir, getcwd, credential
//     getters);
//   - fakes ownership/credential syscalls and chroot: the syscall number is
//     replaced by getpid and the real outcome is synthesized at exit from the
//     emulated credentials in st.creds.
//
// Syscalls that are not listed are left untouched. st.pending is always
// cleared first, so a stale entry from a previous syscall never leaks.
func (t *tracer) onSyscallEnter(st *traceeState, regs *syscall.PtraceRegs) error {
nr := sysno(regs)
st.pending = nil
switch nr {
case sc.execve:
return t.translateExecve(st, regs, 0, 1, atFDCWD)
case sc.execveat:
dirfd := int(int64(arg(regs, 0)))
return t.translateExecve(st, regs, 1, 2, dirfd)
case sc.open:
return t.translateArgPathMode(st, regs, 0, atFDCWD, pathModeFromOpenFlags(arg(regs, 1)))
case sc.openat:
dirfd := int(int64(arg(regs, 0)))
return t.translateArgPathMode(st, regs, 1, dirfd, pathModeFromOpenFlags(arg(regs, 2)))
case sc.openat2:
dirfd := int(int64(arg(regs, 0)))
return t.translateArgPathMode(st, regs, 1, dirfd, t.pathModeFromOpenHow(st, arg(regs, 2)))
case sc.access, sc.stat, sc.lstat, sc.statfs, sc.statfs64:
return t.translateArgPath(st, regs, 0, atFDCWD)
case sc.chmod, sc.truncate, sc.utime, sc.utimes:
return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathWrite)
case sc.rmdir:
return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathDeleteDir)
// Ownership changes are never executed: the call is swapped for getpid and
// the exit handler reports success ("fakezero").
case sc.chown, sc.lchown, sc.fchown, sc.fchownat:
setSysno(regs, sc.getpid)
st.pending = &pendingExit{kind: "fakezero"}
return setRegs(st.pid, regs)
case sc.faccessat, sc.faccessat2, sc.newfstatat, sc.fstatat, sc.statx:
dirfd := int(int64(arg(regs, 0)))
return t.translateArgPath(st, regs, 1, dirfd)
case sc.mkdirat, sc.mknodat:
dirfd := int(int64(arg(regs, 0)))
return t.translateArgPathMode(st, regs, 1, dirfd, prootext.PathCreate)
case sc.unlinkat:
dirfd := int(int64(arg(regs, 0)))
mode := prootext.PathDelete
// 0x200 is the flag bit that turns unlinkat into a directory removal
// (AT_REMOVEDIR on Linux).
if arg(regs, 2)&0x200 != 0 {
mode = prootext.PathDeleteDir
}
return t.translateArgPathMode(st, regs, 1, dirfd, mode)
case sc.fchmodat, sc.utimensat:
dirfd := int(int64(arg(regs, 0)))
return t.translateArgPathMode(st, regs, 1, dirfd, prootext.PathWrite)
case sc.mkdir, sc.mknod:
return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathCreate)
case sc.unlink:
return t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathDelete)
case sc.rename:
if err := t.translateArgPathMode(st, regs, 0, atFDCWD, prootext.PathWrite); err != nil {
return err
}
return t.translateArgPathMode(st, regs, 1, atFDCWD, prootext.PathCreate)
case sc.renameat, sc.renameat2:
oldfd := int(int64(arg(regs, 0)))
newfd := int(int64(arg(regs, 2)))
if err := t.translateArgPathMode(st, regs, 1, oldfd, prootext.PathWrite); err != nil {
return err
}
return t.translateArgPathMode(st, regs, 3, newfd, prootext.PathCreate)
case sc.link:
return t.translateHardLink(st, regs, 0, atFDCWD, 1, atFDCWD, 0)
case sc.linkat:
oldfd := int(int64(arg(regs, 0)))
newfd := int(int64(arg(regs, 2)))
return t.translateHardLink(st, regs, 1, oldfd, 3, newfd, arg(regs, 4))
case sc.symlink:
if err := t.translateSymlinkTarget(st, regs, 0); err != nil {
return err
}
return t.translateArgPathMode(st, regs, 1, atFDCWD, prootext.PathCreate)
case sc.symlinkat:
if err := t.translateSymlinkTarget(st, regs, 0); err != nil {
return err
}
dirfd := int(int64(arg(regs, 1)))
return t.translateArgPathMode(st, regs, 2, dirfd, prootext.PathCreate)
// readlink/readlinkat remember the guest buffer and its size so the exit
// handler can translate a host target back into a guest path.
case sc.readlink:
guest, err := t.translateArgPathReturnGuest(st, regs, 0, atFDCWD)
if err != nil {
return err
}
st.pending = &pendingExit{kind: "readlink", guestPath: guest, buf: arg(regs, 1), size: arg(regs, 2)}
return nil
case sc.readlinkat:
dirfd := int(int64(arg(regs, 0)))
guest, err := t.translateArgPathReturnGuest(st, regs, 1, dirfd)
if err != nil {
return err
}
st.pending = &pendingExit{kind: "readlink", guestPath: guest, buf: arg(regs, 2), size: arg(regs, 3)}
return nil
case sc.chdir:
guest, err := t.translateArgPathReturnGuest(st, regs, 0, atFDCWD)
if err != nil {
return err
}
st.pending = &pendingExit{kind: "chdir", guestPath: guest}
return nil
case sc.fchdir:
st.pending = &pendingExit{kind: "fchdir", fd: arg(regs, 0)}
return nil
case sc.getcwd:
st.pending = &pendingExit{kind: "getcwd", buf: arg(regs, 0), size: arg(regs, 1)}
return nil
// Credential getters run natively; only their result is replaced at exit
// with the emulated value carried in the pendingExit.
case sc.getuid:
st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.ruid)}
return nil
case sc.geteuid:
st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.euid)}
return nil
case sc.getgid:
st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.rgid)}
return nil
case sc.getegid:
st.pending = &pendingExit{kind: "ret", size: uint64(st.creds.egid)}
return nil
// For getresuid/getresgid the three pendingExit fields buf, size and fd
// carry the three user-space result pointers, in argument order.
case sc.getresuid:
st.pending = &pendingExit{kind: "getresuid", buf: arg(regs, 0), size: arg(regs, 1), fd: arg(regs, 2)}
return nil
case sc.getresgid:
st.pending = &pendingExit{kind: "getresgid", buf: arg(regs, 0), size: arg(regs, 1), fd: arg(regs, 2)}
return nil
case sc.getgroups:
st.pending = &pendingExit{kind: "getgroups", size: arg(regs, 0), buf: arg(regs, 1)}
return nil
// Credential setters: permission is checked against the emulated
// credentials, the real syscall is replaced by getpid, and the emulated
// credentials are updated by the exit handler.
case sc.setuid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
uid := uint32(arg(regs, 0))
if !st.creds.canUseUID(uid) {
t.debugf("pid=%d deny setuid(%d): r/e/s=%d/%d/%d", st.pid, uid, st.creds.ruid, st.creds.euid, st.creds.suid)
return t.fakeErrno(st, regs, syscall.EPERM)
}
// A privileged caller changes all uids ("setuid"); an unprivileged one
// only the effective (and filesystem) uid ("seteuid").
kind := "setuid"
if !st.creds.uidPrivileged() {
kind = "seteuid"
}
st.pending = &pendingExit{kind: kind, size: uint64(uid)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
case sc.setgid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
gid := uint32(arg(regs, 0))
if !st.creds.canUseGID(gid) {
t.debugf("pid=%d deny setgid(%d): r/e/s=%d/%d/%d", st.pid, gid, st.creds.rgid, st.creds.egid, st.creds.sgid)
return t.fakeErrno(st, regs, syscall.EPERM)
}
kind := "setgid"
if !st.creds.gidPrivileged() {
kind = "setegid"
}
st.pending = &pendingExit{kind: kind, size: uint64(gid)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
case sc.setreuid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
ruid, euid := uint32(arg(regs, 0)), uint32(arg(regs, 1))
if !st.creds.canSetUID(ruid, euid) {
t.debugf("pid=%d deny setreuid(%d,%d): r/e/s=%d/%d/%d", st.pid, ruid, euid, st.creds.ruid, st.creds.euid, st.creds.suid)
return t.fakeErrno(st, regs, syscall.EPERM)
}
st.pending = &pendingExit{kind: "setreuid", buf: uint64(ruid), size: uint64(euid)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
case sc.setregid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
rgid, egid := uint32(arg(regs, 0)), uint32(arg(regs, 1))
if !st.creds.canSetGID(rgid, egid) {
t.debugf("pid=%d deny setregid(%d,%d): r/e/s=%d/%d/%d", st.pid, rgid, egid, st.creds.rgid, st.creds.egid, st.creds.sgid)
return t.fakeErrno(st, regs, syscall.EPERM)
}
st.pending = &pendingExit{kind: "setregid", buf: uint64(rgid), size: uint64(egid)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
case sc.setresuid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
ruid, euid, suid := uint32(arg(regs, 0)), uint32(arg(regs, 1)), uint32(arg(regs, 2))
if !st.creds.canSetUID(ruid, euid, suid) {
t.debugf("pid=%d deny setresuid(%d,%d,%d): r/e/s=%d/%d/%d", st.pid, ruid, euid, suid, st.creds.ruid, st.creds.euid, st.creds.suid)
return t.fakeErrno(st, regs, syscall.EPERM)
}
st.pending = &pendingExit{kind: "setresuid", buf: uint64(ruid), size: uint64(euid), fd: uint64(suid)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
case sc.setresgid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
rgid, egid, sgid := uint32(arg(regs, 0)), uint32(arg(regs, 1)), uint32(arg(regs, 2))
if !st.creds.canSetGID(rgid, egid, sgid) {
t.debugf("pid=%d deny setresgid(%d,%d,%d): r/e/s=%d/%d/%d", st.pid, rgid, egid, sgid, st.creds.rgid, st.creds.egid, st.creds.sgid)
return t.fakeErrno(st, regs, syscall.EPERM)
}
st.pending = &pendingExit{kind: "setresgid", buf: uint64(rgid), size: uint64(egid), fd: uint64(sgid)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
case sc.setgroups:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
if !st.creds.gidPrivileged() {
t.debugf("pid=%d deny setgroups: euid=%d", st.pid, st.creds.euid)
return t.fakeErrno(st, regs, syscall.EPERM)
}
groups, err := readTraceeUint32Array(st.pid, uintptr(arg(regs, 1)), arg(regs, 0))
if err != nil {
// setgroups() is part of apt's privilege drop. If the group vector
// cannot be read because the tracee is between exec mappings or uses a
// special pointer, still emulate success instead of letting the real
// unprivileged syscall run and fail with EPERM.
t.debugf("pid=%d failed reading setgroups vector: %v", st.pid, err)
groups = nil
}
st.creds.groups = groups
setSysno(regs, sc.getpid)
st.pending = &pendingExit{kind: "fakezero"}
return setRegs(st.pid, regs)
// setfsuid/setfsgid report the previous value as their result; the new
// value is only stored when the emulated credentials allow it.
case sc.setfsuid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
old := st.creds.fsuid
uid := uint32(arg(regs, 0))
if st.creds.canUseUID(uid) {
st.creds.fsuid = uid
}
st.pending = &pendingExit{kind: "ret", size: uint64(old)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
case sc.setfsgid:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
old := st.creds.fsgid
gid := uint32(arg(regs, 0))
if st.creds.canUseGID(gid) {
st.creds.fsgid = gid
}
st.pending = &pendingExit{kind: "ret", size: uint64(old)}
setSysno(regs, sc.getpid)
return setRegs(st.pid, regs)
// chroot is accepted but has no effect: it is replaced by getpid and
// reported as successful.
case sc.chroot:
t.debugf("pid=%d fake credential syscall nr=%d args=%d,%d,%d", st.pid, nr, arg(regs, 0), arg(regs, 1), arg(regs, 2))
setSysno(regs, sc.getpid)
st.pending = &pendingExit{kind: "fakezero"}
return setRegs(st.pid, regs)
}
return nil
}
// onSyscallExit is the syscall-exit hook for the tracee st. It consumes the
// pendingExit recorded at syscall entry (st.pending is cleared before any
// work is done) and, depending on its kind, tracks the guest working
// directory, updates the emulated credentials in st.creds, writes results
// into tracee memory and/or replaces the syscall's return value in regs.
// When nothing is pending the registers are left untouched.
//
// The meaning of the pendingExit fields buf, size and fd depends on the kind;
// for the credential kinds they carry ids or result pointers rather than a
// buffer, a length and a file descriptor.
func (t *tracer) onSyscallExit(st *traceeState, regs *syscall.PtraceRegs) error {
if st.pending == nil {
return nil
}
p := st.pending
st.pending = nil
retv := retval(regs)
switch p.kind {
case "hardlink":
// A native success, or an errno that is not worth retrying, is passed
// through unchanged; otherwise the link is emulated by copying.
if retv == 0 {
return nil
}
if !shouldFallbackHardLink(retv) {
return nil
}
if err := emulateHardLinkFallback(p.sourcePath, p.targetPath, p.size); err != nil {
t.debugf("pid=%d hardlink fallback failed source=%q target=%q: %v", st.pid, p.sourcePath, p.targetPath, err)
setRetval(regs, -int64(errnoFromError(err)))
return setRegs(st.pid, regs)
}
t.debugf("pid=%d hardlink fallback copied source=%q target=%q after native errno=%d", st.pid, p.sourcePath, p.targetPath, -retv)
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "chdir":
// Track the guest-visible working directory only when the host chdir
// succeeded.
if retv == 0 {
st.cwd = cleanGuestPath(p.guestPath)
}
case "fchdir":
if retv == 0 {
if guest, ok := t.fdGuestPath(st.pid, int(p.fd)); ok {
st.cwd = cleanGuestPath(guest)
}
}
case "ret":
// p.size carries a 32-bit id that becomes the return value.
setRetval(regs, int64(uint32(p.size)))
return setRegs(st.pid, regs)
case "errno":
// p.size carries the errno to report.
setRetval(regs, -int64(syscall.Errno(p.size)))
return setRegs(st.pid, regs)
case "fakezero":
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "setuid":
uid := uint32(p.size)
st.creds.ruid, st.creds.euid, st.creds.suid, st.creds.fsuid = uid, uid, uid, uid
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "seteuid":
uid := uint32(p.size)
st.creds.euid, st.creds.fsuid = uid, uid
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "setgid":
gid := uint32(p.size)
st.creds.rgid, st.creds.egid, st.creds.sgid, st.creds.fsgid = gid, gid, gid, gid
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "setegid":
gid := uint32(p.size)
st.creds.egid, st.creds.fsgid = gid, gid
setRetval(regs, 0)
return setRegs(st.pid, regs)
// In the set*id kinds below an id of ^uint32(0) (i.e. -1) means "leave
// this id unchanged".
case "setreuid":
if v := uint32(p.buf); v != ^uint32(0) {
st.creds.ruid = v
}
if v := uint32(p.size); v != ^uint32(0) {
st.creds.euid = v
st.creds.fsuid = v
}
st.creds.suid = st.creds.euid
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "setregid":
if v := uint32(p.buf); v != ^uint32(0) {
st.creds.rgid = v
}
if v := uint32(p.size); v != ^uint32(0) {
st.creds.egid = v
st.creds.fsgid = v
}
st.creds.sgid = st.creds.egid
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "setresuid":
if v := uint32(p.buf); v != ^uint32(0) {
st.creds.ruid = v
}
if v := uint32(p.size); v != ^uint32(0) {
st.creds.euid = v
st.creds.fsuid = v
}
if v := uint32(p.fd); v != ^uint32(0) {
st.creds.suid = v
}
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "setresgid":
if v := uint32(p.buf); v != ^uint32(0) {
st.creds.rgid = v
}
if v := uint32(p.size); v != ^uint32(0) {
st.creds.egid = v
st.creds.fsgid = v
}
if v := uint32(p.fd); v != ^uint32(0) {
st.creds.sgid = v
}
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "getresuid":
// buf, size and fd hold the three result pointers. Write failures are
// ignored and the call still reports success.
_ = writeTraceeUint32(st.pid, uintptr(p.buf), st.creds.ruid)
_ = writeTraceeUint32(st.pid, uintptr(p.size), st.creds.euid)
_ = writeTraceeUint32(st.pid, uintptr(p.fd), st.creds.suid)
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "getresgid":
_ = writeTraceeUint32(st.pid, uintptr(p.buf), st.creds.rgid)
_ = writeTraceeUint32(st.pid, uintptr(p.size), st.creds.egid)
_ = writeTraceeUint32(st.pid, uintptr(p.fd), st.creds.sgid)
setRetval(regs, 0)
return setRegs(st.pid, regs)
case "getgroups":
// A size of zero only queries the number of groups; a non-zero size
// that is too small for the list yields EINVAL.
if p.size == 0 {
setRetval(regs, int64(len(st.creds.groups)))
return setRegs(st.pid, regs)
}
if p.size < uint64(len(st.creds.groups)) {
setRetval(regs, -int64(syscall.EINVAL))
return setRegs(st.pid, regs)
}
for i, g := range st.creds.groups {
_ = writeTraceeUint32(st.pid, uintptr(p.buf)+uintptr(i*4), g)
}
setRetval(regs, int64(len(st.creds.groups)))
return setRegs(st.pid, regs)
case "getcwd":
// The guest cwd (with its NUL terminator) replaces whatever the host
// call produced; the result is the length including the terminator.
if p.size == 0 {
setRetval(regs, -int64(syscall.ERANGE))
return setRegs(st.pid, regs)
}
data := append([]byte(cleanGuestPath(st.cwd)), 0)
if uint64(len(data)) > p.size {
setRetval(regs, -int64(syscall.ERANGE))
return setRegs(st.pid, regs)
}
if _, err := syscall.PtracePokeData(st.pid, uintptr(p.buf), data); err != nil {
return err
}
setRetval(regs, int64(len(data)))
return setRegs(st.pid, regs)
case "readlink":
// On success retv is the number of bytes the kernel stored in the
// guest buffer. Host targets are mapped back to guest paths; if any
// step fails the native result is left as it is.
if retv <= 0 {
return nil
}
buf := make([]byte, retv)
if _, err := syscall.PtracePeekData(st.pid, uintptr(p.buf), buf); err != nil {
return nil
}
target := string(buf)
if strings.HasPrefix(target, t.pm.root) || filepath.IsAbs(target) {
guest := t.pm.HostToGuest(target)
if guest != target && guest != "" {
out := []byte(guest)
// Truncate to the guest buffer size; no NUL terminator is written.
if uint64(len(out)) > p.size {
out = out[:p.size]
}
if _, err := syscall.PtracePokeData(st.pid, uintptr(p.buf), out); err == nil {
setRetval(regs, int64(len(out)))
return setRegs(st.pid, regs)
}
}
}
}
return nil
}
// translateSymlinkTarget rewrites the link *target* argument (argument n) of
// symlink/symlinkat when it is an absolute guest path. NULL, empty and
// relative targets are left untouched.
func (t *tracer) translateSymlinkTarget(st *traceeState, regs *syscall.PtraceRegs, n int) error {
	targetAddr := arg(regs, n)
	if targetAddr == 0 {
		return nil
	}
	target, err := readTraceeString(st.pid, uintptr(targetAddr), 4096)
	switch {
	case err != nil:
		return err
	case target == "":
		return nil
	case !filepath.IsAbs(target):
		// Relative targets resolve inside the rootfs on their own.
		return nil
	}

	// The kernel stores a symlink target verbatim. An absolute guest target
	// such as /var/cache/apt/archives/foo.deb would later be followed on the
	// host, outside the rootfs, and dpkg would see ENOENT. Store the host
	// path instead; the readlink() exit handler translates it back so the
	// guest keeps seeing its own path.
	guestPath, hostPath, isSpecial := t.translateSpecialProcPath(st, "/", target)
	if !isSpecial {
		guestPath, hostPath = t.pm.Translate("/", target)
	}
	t.debugf("pid=%d symlink-target old=%q guest=%q host=%q", st.pid, target, guestPath, hostPath)

	rewritten, err := writeTraceeString(st, uintptr(targetAddr), target, hostPath)
	if err != nil {
		return err
	}
	setArg(regs, n, uint64(rewritten))
	return setRegs(st.pid, regs)
}
// Bits of the linkat flags argument inspected by the hard-link translation
// and its copy fallback. The values correspond to Linux's AT_SYMLINK_FOLLOW
// and AT_EMPTY_PATH.
const (
// atSymlinkFollow: the link source is dereferenced if it is a symlink.
atSymlinkFollow = 0x400
// atEmptyPath: an empty source path refers to the directory fd itself.
atEmptyPath = 0x1000
)
// translateHardLink translates the two path arguments of link/linkat and
// remembers the resulting host paths in a "hardlink" pendingExit. Android
// kernels and filesystems may refuse perfectly valid hard links made by an
// unprivileged tracee, while dpkg depends on them for its rollback files; the
// retained paths let the exit handler fall back to a plain copy when the
// native call fails with a policy or filesystem error.
//
// oldArg/newArg are the argument indexes of the two paths, oldDirfd/newDirfd
// the directory descriptors they are relative to, and flags the linkat flags
// (0 for link).
func (t *tracer) translateHardLink(st *traceeState, regs *syscall.PtraceRegs, oldArg, oldDirfd, newArg, newDirfd int, flags uint64) error {
	// Existing file first: translate, then read back what the tracee will
	// actually pass to the kernel.
	if err := t.translateArgPath(st, regs, oldArg, oldDirfd); err != nil {
		return err
	}
	hostSource, err := readTraceeString(st.pid, uintptr(arg(regs, oldArg)), 4096)
	if err != nil {
		return err
	}

	// With AT_EMPTY_PATH and an empty path the source is the descriptor
	// itself; resolve it through the tracee's /proc fd entry.
	linksDescriptor := hostSource == "" && flags&atEmptyPath != 0 && oldDirfd >= 0
	if linksDescriptor {
		hostSource, err = os.Readlink(fmt.Sprintf("/proc/%d/fd/%d", st.pid, oldDirfd))
		if err != nil {
			return err
		}
	}

	// Then the name of the link to be created.
	if err := t.translateArgPathMode(st, regs, newArg, newDirfd, prootext.PathCreate); err != nil {
		return err
	}
	hostTarget, err := readTraceeString(st.pid, uintptr(arg(regs, newArg)), 4096)
	if err != nil {
		return err
	}

	pending := &pendingExit{kind: "hardlink"}
	pending.sourcePath = hostSource
	pending.targetPath = hostTarget
	pending.size = flags
	st.pending = pending
	return nil
}
// shouldFallbackHardLink reports whether retv, the raw return value of a
// native link/linkat, is an error for which the copy fallback should be
// tried: EPERM, EACCES, EXDEV, ENOSYS or EOPNOTSUPP. Success and every other
// errno yield false.
func shouldFallbackHardLink(retv int64) bool {
	if retv < 0 {
		switch errno := syscall.Errno(-retv); errno {
		case syscall.EPERM, syscall.EACCES, syscall.EXDEV, syscall.ENOSYS, syscall.EOPNOTSUPP:
			return true
		}
	}
	return false
}
// emulateHardLinkFallback provides the backup semantics dpkg needs when the
// host refuses link/linkat. It is strictly a fallback: native hard links stay
// the primary path and are the only ones that share an inode.
//
//   - Directories and other non-regular sources are rejected with EPERM.
//   - A symlink source is recreated as a symlink unless flags contains
//     atSymlinkFollow, in which case the file it points to is copied.
//   - A regular file is copied into a target opened with O_EXCL, preserving
//     link(2)'s refusal to overwrite, and receives the source's permission
//     bits. A partially written target is removed when copying fails.
func emulateHardLinkFallback(source, target string, flags uint64) (retErr error) {
	srcInfo, err := os.Lstat(source)
	if err != nil {
		return err
	}
	if srcInfo.IsDir() {
		return syscall.EPERM
	}

	if isSymlink := srcInfo.Mode()&os.ModeSymlink != 0; isSymlink {
		if flags&atSymlinkFollow == 0 {
			linkValue, err := os.Readlink(source)
			if err != nil {
				return err
			}
			return os.Symlink(linkValue, target)
		}
		// Following was requested: judge the file behind the link.
		if srcInfo, err = os.Stat(source); err != nil {
			return err
		}
	}
	if !srcInfo.Mode().IsRegular() {
		return syscall.EPERM
	}
	perm := srcInfo.Mode().Perm()

	in, err := os.Open(source)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.OpenFile(target, os.O_WRONLY|os.O_CREATE|os.O_EXCL, perm)
	if err != nil {
		return err
	}

	// Copy, flush and fix up the mode (the creation mode is subject to the
	// umask); stop at the first failing step.
	_, retErr = io.Copy(out, in)
	if retErr == nil {
		retErr = out.Sync()
	}
	if retErr == nil {
		retErr = out.Chmod(perm)
	}

	closeErr := out.Close()
	if retErr != nil {
		// Do not leave a half-written copy behind.
		_ = os.Remove(target)
		return retErr
	}
	return closeErr
}
// errnoFromError extracts the syscall.Errno wrapped anywhere in err's chain.
// Errors that carry no errno (including nil) are reported as EIO.
func errnoFromError(err error) syscall.Errno {
	if found := syscall.Errno(0); errors.As(err, &found) {
		return found
	}
	return syscall.EIO
}
// translateArgPath rewrites path argument n (relative to dirfd) for a
// read-only access; it is translateArgPathMode with prootext.PathRead.
func (t *tracer) translateArgPath(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int) error {
	return t.translateArgPathMode(st, regs, n, dirfd, prootext.PathRead)
}
// translateArgPathMode rewrites path argument n (relative to dirfd) for the
// given access mode and discards the resulting guest path.
func (t *tracer) translateArgPathMode(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int, mode prootext.PathMode) error {
	if _, err := t.translateArgPathReturnGuestMode(st, regs, n, dirfd, mode); err != nil {
		return err
	}
	return nil
}
// translateArgPathReturnGuest rewrites path argument n (relative to dirfd)
// for a read-only access and returns the guest path that was resolved.
func (t *tracer) translateArgPathReturnGuest(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int) (string, error) {
	guest, err := t.translateArgPathReturnGuestMode(st, regs, n, dirfd, prootext.PathRead)
	return guest, err
}
// translateArgPathReturnGuestMode replaces the guest path held in syscall
// argument n by its host equivalent and returns the guest path it resolved.
//
// Relative paths are resolved against the tracee's tracked cwd, or against
// the directory behind dirfd when dirfd is not atFDCWD and can be resolved.
// The host path is written to the tracee's scratch area and the argument
// register is pointed at it. NULL and empty paths are left alone and yield
// ("", nil).
func (t *tracer) translateArgPathReturnGuestMode(st *traceeState, regs *syscall.PtraceRegs, n int, dirfd int, mode prootext.PathMode) (string, error) {
	pathAddr := arg(regs, n)
	if pathAddr == 0 {
		return "", nil
	}
	guestArg, err := readTraceeString(st.pid, uintptr(pathAddr), 4096)
	if err != nil {
		return "", err
	}
	if guestArg == "" {
		return "", nil
	}

	// Pick the directory a relative path is anchored at.
	base := st.cwd
	isRelative := !strings.HasPrefix(guestArg, "/")
	if isRelative && dirfd != atFDCWD {
		if dirGuest, ok := t.fdGuestPath(st.pid, dirfd); ok {
			base = dirGuest
		}
	}

	// Process-local /proc aliases take precedence over the path mapper.
	guestPath, hostPath, isSpecial := t.translateSpecialProcPath(st, base, guestArg)
	if !isSpecial {
		guestPath, hostPath = t.pm.TranslateMode(base, guestArg, mode)
	}
	t.debugf("pid=%d path mode=%d cwd=%q old=%q guest=%q host=%q", st.pid, mode, base, guestArg, guestPath, hostPath)

	hostAddr, err := writeTraceeString(st, uintptr(pathAddr), guestArg, hostPath)
	if err != nil {
		return guestPath, err
	}
	setArg(regs, n, uint64(hostAddr))
	return guestPath, setRegs(st.pid, regs)
}
// pathModeFromOpenFlags derives the path access mode from open(2) flags:
// PathRead unless the access mode is O_WRONLY or O_RDWR; for those,
// PathCreate when O_CREATE or O_TRUNC is set and PathWrite otherwise.
func pathModeFromOpenFlags(flags uint64) prootext.PathMode {
	const (
		accessMask  = uint64(os.O_RDONLY | os.O_WRONLY | os.O_RDWR)
		replaceMask = uint64(os.O_CREATE | os.O_TRUNC)
	)
	access := flags & accessMask
	if access != uint64(os.O_WRONLY) && access != uint64(os.O_RDWR) {
		return prootext.PathRead
	}
	if flags&replaceMask == 0 {
		return prootext.PathWrite
	}
	return prootext.PathCreate
}
// pathModeFromOpenHow reads the 64-bit flags word stored at addr in the
// tracee (the third argument of openat2) and maps it with
// pathModeFromOpenFlags. When the word cannot be read it assumes PathWrite.
func (t *tracer) pathModeFromOpenHow(st *traceeState, addr uint64) prootext.PathMode {
	if flags, err := readTraceeUint64(st.pid, uintptr(addr)); err == nil {
		return pathModeFromOpenFlags(flags)
	}
	return prootext.PathWrite
}
// fdGuestPath resolves descriptor fd of process pid through /proc/<pid>/fd
// and returns the guest view of the path it refers to. The boolean is false
// for negative descriptors and when the /proc link cannot be read.
func (t *tracer) fdGuestPath(pid int, fd int) (string, bool) {
	if fd >= 0 {
		if hostTarget, err := os.Readlink(fmt.Sprintf("/proc/%d/fd/%d", pid, fd)); err == nil {
			return t.pm.HostToGuest(hostTarget), true
		}
	}
	return "", false
}
// translateSpecialProcPath maps the process-local procfs aliases of the
// tracee onto the host procfs. It returns ok=false (and empty paths) for
// every path that is not one of those aliases.
//
// /proc is intentionally not bound by default, because walking the host
// procfs from commands such as `find /` races with disappearing tasks. Some
// programs nevertheless need their own entries, notably apt/dpkg probing
// /proc/self/fd, so only these are provided:
//
//   - /proc/self[/...]        -> /proc/<pid>[/...]
//   - /proc/thread-self[/...] -> /proc/<pid>/task/<pid>[/...]
//   - /proc/<pid> itself plus its fd, fdinfo, cwd, exe, root and status
//     entries, for programs that use the real PID returned by getpid().
//
// Paths below any other /proc/<pid> are deliberately not mapped; that would
// bring back the host procfs traversal problem described above.
func (t *tracer) translateSpecialProcPath(st *traceeState, cwd, p string) (guest, host string, ok bool) {
	resolved := joinGuest(cwd, p)
	pidStr := strconv.Itoa(st.pid)

	// below reports whether resolved is prefix itself or lies underneath it,
	// and returns the remainder after prefix.
	below := func(prefix string) (string, bool) {
		if resolved == prefix || strings.HasPrefix(resolved, prefix+"/") {
			return strings.TrimPrefix(resolved, prefix), true
		}
		return "", false
	}

	if rest, match := below("/proc/self"); match {
		return resolved, filepath.Clean(filepath.Join("/proc", pidStr, rest)), true
	}
	if rest, match := below("/proc/thread-self"); match {
		return resolved, filepath.Clean(filepath.Join("/proc", pidStr, "task", pidStr, rest)), true
	}
	if rest, match := below("/proc/" + pidStr); match {
		var allowed bool
		switch rest {
		case "", "/fd", "/fdinfo", "/cwd", "/exe", "/root", "/status":
			allowed = true
		default:
			allowed = strings.HasPrefix(rest, "/fd/") || strings.HasPrefix(rest, "/fdinfo/")
		}
		if allowed {
			return resolved, filepath.Clean(filepath.Join("/proc", pidStr, rest)), true
		}
	}
	return "", "", false
}
// readTraceeString reads a NUL-terminated string from address addr of the
// traced process pid. At most max bytes are returned; a string longer than
// that is silently truncated. A zero addr or a non-positive max yields
// ("", nil).
//
// The string is fetched in chunks of up to 256 bytes. A chunk may extend past
// the end of the mapping that holds the string (for example for strings close
// to the top of the stack); PtracePeekData then fails but still reports how
// many bytes it copied before the failure. Those bytes are scanned for the
// terminator first, so the read error is returned only when the string
// itself is not fully readable.
func readTraceeString(pid int, addr uintptr, max int) (string, error) {
	if addr == 0 || max <= 0 {
		return "", nil
	}
	var (
		out   []byte
		chunk [256]byte
	)
	for len(out) < max {
		// Never request more than what is still allowed by max.
		want := max - len(out)
		if want > len(chunk) {
			want = len(chunk)
		}
		n, err := syscall.PtracePeekData(pid, addr+uintptr(len(out)), chunk[:want])
		for _, b := range chunk[:n] {
			if b == 0 {
				return string(out), nil
			}
			out = append(out, b)
		}
		if err != nil {
			return "", err
		}
	}
	return string(out), nil
}
// writeTraceeString stores new, NUL-terminated, in the tracee's scratch area
// and returns the address it was written to. The original string is never
// overwritten in place: oldAddr and old are accepted but not used, and the
// work is the same as writeScratchString.
func writeTraceeString(st *traceeState, oldAddr uintptr, old, new string) (uintptr, error) {
	return writeScratchString(st, new)
}
// translateExecve rewrites the program path of execve/execveat (argument
// pathArg, resolved relative to dirfd) from a guest path to a host path.
// argvArg is the index of the argv pointer argument.
//
// After the usual guest-to-host translation the path mapper's resolveExec is
// consulted with the host path, guest path and the tracee's argv. When it
// reports a change, both the executable path and a freshly written argv
// vector are substituted; otherwise only the path argument is replaced. A
// NULL or empty path is left untouched.
func (t *tracer) translateExecve(st *traceeState, regs *syscall.PtraceRegs, pathArg, argvArg int, dirfd int) error {
pathAddr := arg(regs, pathArg)
if pathAddr == 0 {
return nil
}
old, err := readTraceeString(st.pid, uintptr(pathAddr), 4096)
if err != nil || old == "" {
return err
}
// Relative paths are anchored at the tracked cwd, or at the directory
// behind dirfd when one was given and can be resolved.
cwd := st.cwd
if dirfd != atFDCWD && !strings.HasPrefix(old, "/") {
if guest, ok := t.fdGuestPath(st.pid, dirfd); ok {
cwd = guest
}
}
guest, host, special := t.translateSpecialProcPath(st, cwd, old)
if !special {
guest, host = t.pm.Translate(cwd, old)
}
t.debugf("pid=%d exec cwd=%q old=%q guest=%q host=%q", st.pid, cwd, old, guest, host)
// A read error is ignored here: whatever part of argv was read is used,
// and an empty result falls back to the program path alone.
argv, _ := readTraceeStringVector(st.pid, uintptr(arg(regs, argvArg)), 4096)
if len(argv) == 0 {
argv = []string{old}
}
if rw, changed := t.pm.resolveExec(host, guest, argv); changed {
t.debugf("pid=%d exec rewrite execPath=%q argv=%q", st.pid, rw.ExecPath, strings.Join(rw.Argv, " "))
newPath, err := writeScratchString(st, rw.ExecPath)
if err != nil {
return err
}
argvPtr, err := writeTraceeStringVector(st, rw.Argv)
if err != nil {
return err
}
setArg(regs, pathArg, uint64(newPath))
setArg(regs, argvArg, uint64(argvPtr))
return setRegs(st.pid, regs)
}
newAddr, err := writeTraceeString(st, uintptr(pathAddr), old, host)
if err != nil {
return err
}
setArg(regs, pathArg, uint64(newAddr))
return setRegs(st.pid, regs)
}
// writeTraceeUint32 stores v, in the tracee's byte order, at address addr of
// process pid. A zero addr is treated as "no destination" and is not an
// error.
func writeTraceeUint32(pid int, addr uintptr, v uint32) error {
	if addr != 0 {
		var word [4]byte
		putTraceeUint32(word[:], v)
		if _, err := syscall.PtracePokeData(pid, addr, word[:]); err != nil {
			return err
		}
	}
	return nil
}
// readTraceeUint32 loads the 32-bit value stored at address addr of process
// pid, decoding it from the tracee's byte order.
func readTraceeUint32(pid int, addr uintptr) (uint32, error) {
	var word [4]byte
	_, err := syscall.PtracePeekData(pid, addr, word[:])
	if err != nil {
		return 0, err
	}
	return traceeUint32(word[:]), nil
}
// readTraceeUint64 loads the 64-bit value stored at address addr of process
// pid, decoding it according to nativeLittleEndian.
func readTraceeUint64(pid int, addr uintptr) (uint64, error) {
	var raw [8]byte
	if _, err := syscall.PtracePeekData(pid, addr, raw[:]); err != nil {
		return 0, err
	}
	var v uint64
	for i, b := range raw {
		if nativeLittleEndian {
			// Byte i is the i-th least significant byte.
			v |= uint64(b) << (8 * uint(i))
		} else {
			// Most significant byte comes first.
			v = v<<8 | uint64(b)
		}
	}
	return v, nil
}
// readTraceeUint32Array reads count consecutive 32-bit values starting at
// address addr of process pid.
//
// A zero count yields (nil, nil) and a NULL addr with a non-zero count yields
// EFAULT. count comes straight from a guest register, so it is bounded before
// it is used as a slice capacity: an absurd value would otherwise panic the
// tracer ("makeslice: cap out of range") or make it allocate gigabytes. The
// limit is far above Linux's NGROUPS_MAX (65536), the largest vector a
// setgroups caller can legitimately pass; larger counts yield EINVAL.
func readTraceeUint32Array(pid int, addr uintptr, count uint64) ([]uint32, error) {
	const maxCount = 1 << 20

	if count == 0 {
		return nil, nil
	}
	if addr == 0 {
		return nil, syscall.EFAULT
	}
	if count > maxCount {
		return nil, syscall.EINVAL
	}
	out := make([]uint32, 0, count)
	for i := uint64(0); i < count; i++ {
		v, err := readTraceeUint32(pid, addr+uintptr(i*4))
		if err != nil {
			return nil, err
		}
		out = append(out, v)
	}
	return out, nil
}
func readTraceeStringVector(pid int, addr uintptr, max int) ([]string, error) {
if addr == 0 {
return nil, nil
}
out := []string{}
for i := 0; i < max; i++ {
p, err := readTraceePtr(pid, addr+uintptr(i*ptrSize))
if err != nil {
return out, err
}
if p == 0 {
return out, nil
}
s, err := readTraceeString(pid, uintptr(p), 4096)
if err != nil {
return out, err
}
out = append(out, s)
}
return out, nil
}
// readTraceePtr loads one pointer-sized word (ptrSize bytes) from address
// addr of process pid and decodes it from the tracee's byte order.
func readTraceePtr(pid int, addr uintptr) (uint64, error) {
	word := make([]byte, ptrSize)
	if _, err := syscall.PtracePeekData(pid, addr, word); err != nil {
		return 0, err
	}
	return traceePtr(word), nil
}
// writeScratchString copies s plus a terminating NUL into the tracee's
// scratch area and returns the address of the copy. The scratch cursor
// st.scratch grows downwards: it is lowered by the string size plus 16 spare
// bytes, rounded down to a 16-byte boundary, and stays lowered even when the
// write fails.
func writeScratchString(st *traceeState, s string) (uintptr, error) {
	payload := make([]byte, len(s)+1) // trailing byte stays 0: the terminator
	copy(payload, s)

	dest := uintptr(st.scratch-uint64(len(payload)+16)) &^ 15
	st.scratch = uint64(dest)

	if _, err := syscall.PtracePokeData(st.pid, dest, payload); err != nil {
		return dest, err
	}
	return dest, nil
}
// writeTraceeStringVector writes values into the tracee's scratch area as a
// NULL-terminated pointer vector (argv layout): first every string, then the
// table of pointers to them. It returns the address of the table, or 0 and
// the error when one of the strings cannot be written.
func writeTraceeStringVector(st *traceeState, values []string) (uintptr, error) {
	// One slot per string plus the terminating NULL slot.
	table := make([]byte, (len(values)+1)*ptrSize)
	for i, s := range values {
		strAddr, err := writeScratchString(st, s)
		if err != nil {
			return 0, err
		}
		putTraceePtr(table[i*ptrSize:(i+1)*ptrSize], uint64(strAddr))
	}
	putTraceePtr(table[len(values)*ptrSize:], 0)

	// Same downward, 16-byte aligned placement as for the strings.
	dest := uintptr(st.scratch-uint64(len(table)+16)) &^ 15
	st.scratch = uint64(dest)
	_, err := syscall.PtracePokeData(st.pid, dest, table)
	return dest, err
}
// traceeUint32 decodes the first four bytes of buf as a 32-bit value,
// little-endian when nativeLittleEndian is set and big-endian otherwise.
func traceeUint32(buf []byte) uint32 {
	b0, b1, b2, b3 := uint32(buf[0]), uint32(buf[1]), uint32(buf[2]), uint32(buf[3])
	if !nativeLittleEndian {
		// Big-endian: the first byte is the most significant one.
		b0, b1, b2, b3 = b3, b2, b1, b0
	}
	return b0 | b1<<8 | b2<<16 | b3<<24
}
// putTraceeUint32 encodes v into the first four bytes of buf, little-endian
// when nativeLittleEndian is set and big-endian otherwise.
func putTraceeUint32(buf []byte, v uint32) {
	for i := 0; i < 4; i++ {
		shift := uint(8 * i)
		if !nativeLittleEndian {
			// Big-endian: the most significant byte goes first.
			shift = uint(8 * (3 - i))
		}
		buf[i] = byte(v >> shift)
	}
}
func traceePtr(buf []byte) uint64 {
var v uint64
if nativeLittleEndian {
for i := 0; i < ptrSize; i++ {
v |= uint64(buf[i]) << (8 * uint(i))
}
return v
}
for i := 0; i < ptrSize; i++ {
v = (v << 8) | uint64(buf[i])
}
return v
}
func putTraceePtr(buf []byte, v uint64) {
if nativeLittleEndian {
for i := 0; i < ptrSize; i++ {
buf[i] = byte(v >> (8 * uint(i)))
}
return
}
for i := 0; i < ptrSize; i++ {
buf[i] = byte(v >> (8 * uint(ptrSize-1-i)))
}
}