From 3996d6d032b4fd1eb4579c677dfb6ae18ce2235d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 14 Jun 2026 20:52:42 +0000 Subject: [PATCH] fix(cleanup): kill Unix step process group on cancel to avoid hang (#1025) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cancelling a job on a Linux/macOS host runner can leave the spawned process tree running and hang the runner — the same failure mode fixed for Windows in #1011, just on the other platforms. Steps are launched as process-group leaders (`Setpgid`, or `Setsid` for the PTY path), but the default `exec.CommandContext` cancellation only kills the **direct child**. When a step launches a shell that starts a child which in turn spawns further background processes, cancelling the job leaves the descendants running. Because those orphans inherited the step's stdout/stderr pipe, the read end never hits EOF and `cmd.Wait()` blocks forever. Because the step executor never returns: - the orphaned processes keep running (the cancelled work is not actually stopped), and - end-of-job cleanup is never reached, so the runner appears to go offline / stop picking up jobs. ## Fix Apply the same tree-kill approach as Windows, using the Unix counterpart of a Job Object: the **process group**. - Add a Unix `processKiller` (`process_unix.go`) that captures the step's PGID (== PID, since the step is launched as a group leader) and sends `SIGKILL` to the whole group on cancellation. This also closes the inherited pipe handles so `cmd.Wait()` can return. `ESRCH` (group already gone) is not treated as an error. - Restrict the previous no-op stub (`process_other.go`) to `plan9` and have it fall back to a single-process kill, preserving plan9's prior behaviour. - Wire `cmd.Cancel` (tree kill) and `cmd.WaitDelay` (10s) **unconditionally** in `exec()` instead of Windows-only. `WaitDelay` also covers a step that backgrounds a process holding the pipe open after the main process exits. Reviewed-on: https://gitea.com/gitea/runner/pulls/1025 Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com> --- act/container/host_environment.go | 59 +++++++++-------- act/container/process_other.go | 24 +++++-- act/container/process_unix.go | 56 ++++++++++++++++ act/container/process_unix_test.go | 100 +++++++++++++++++++++++++++++ 4 files changed, 202 insertions(+), 37 deletions(-) create mode 100644 act/container/process_unix.go create mode 100644 act/container/process_unix_test.go diff --git a/act/container/host_environment.go b/act/container/host_environment.go index 23385e7a..5c69eb13 100644 --- a/act/container/host_environment.go +++ b/act/container/host_environment.go @@ -323,28 +323,28 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st cmd.Dir = wd cmd.SysProcAttr = getSysProcAttr(cmdline, false) - // On Windows a step often launches a process tree (a shell that starts a - // child which spawns further GUI or background processes). The default - // context cancellation only kills the direct child, leaving the rest of the - // tree running; and because the orphans inherit cmd's stdout/stderr pipe, - // cmd.Wait() would block forever, hanging the runner. Kill the whole tree - // via a Job Object on cancellation, and bound the wait so a leftover pipe - // writer can never hang Wait indefinitely. + // A step often launches a process tree (a shell that starts a child which + // spawns further background or GUI processes). The default context + // cancellation only kills the direct child, leaving the rest of the tree + // running; and because the orphans inherit cmd's stdout/stderr pipe, + // cmd.Wait() would block forever, hanging the runner. Kill the whole tree on + // cancellation — via a Job Object on Windows and the process group on Unix + // (see processKiller) — and bound the wait so a leftover pipe writer can + // never hang Wait indefinitely. var killer atomic.Pointer[processKiller] - if runtime.GOOS == "windows" { - cmd.Cancel = func() error { - if k := killer.Load(); k != nil { - return k.Kill() - } - if cmd.Process != nil { - return cmd.Process.Kill() - } - return nil + cmd.Cancel = func() error { + if k := killer.Load(); k != nil { + return k.Kill() } - // Once the step process has exited, give its I/O pipes at most this long - // to drain before Wait force-closes them and returns (Go's WaitDelay). - cmd.WaitDelay = 10 * time.Second + if cmd.Process != nil { + return cmd.Process.Kill() + } + return nil } + // Once the step process has exited, give its I/O pipes at most this long to + // drain before Wait force-closes them and returns (Go's WaitDelay). This + // also covers a step that backgrounds a process holding the pipe open. + cmd.WaitDelay = 10 * time.Second var ppty *os.File var tty *os.File @@ -375,17 +375,16 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st if err := cmd.Start(); err != nil { return err } - if runtime.GOOS == "windows" { - // Assign the started process to a Job Object so cmd.Cancel can kill the - // whole descendant tree. Children spawned afterwards are auto-included. - // On failure (e.g. nested-job restrictions) we fall back to the default - // single-process kill; WaitDelay + end-of-job cleanup still apply. - if k, kerr := newProcessKiller(cmd.Process); kerr != nil { - common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr) - } else { - killer.Store(k) - defer k.Close() - } + // Capture the started process for tree-kill on cancellation: a Job Object on + // Windows (children spawned afterwards are auto-included) and the process + // group on Unix. On failure (e.g. Windows nested-job restrictions) we fall + // back to the default single-process kill; WaitDelay + end-of-job cleanup + // still apply. + if k, kerr := newProcessKiller(cmd.Process); kerr != nil { + common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr) + } else { + killer.Store(k) + defer k.Close() } err = cmd.Wait() if err != nil { diff --git a/act/container/process_other.go b/act/container/process_other.go index a08e60b0..ffb1d14a 100644 --- a/act/container/process_other.go +++ b/act/container/process_other.go @@ -1,19 +1,29 @@ // Copyright 2026 The Gitea Authors. All rights reserved. // SPDX-License-Identifier: MIT -//go:build !windows +//go:build plan9 package container import "os" -// processKiller is a no-op on non-Windows platforms. The Job Object based -// tree-kill is only wired in on Windows (see exec()); elsewhere the default -// exec.CommandContext cancellation and Setpgid handling apply. -type processKiller struct{} +// processKiller falls back to single-process termination on platforms without +// a process-group / Job Object tree-kill. The Job Object (Windows) and process +// group (Unix) based tree-kills live in process_windows.go / process_unix.go; +// here we just kill the direct child, matching the previous default behaviour. +type processKiller struct { + p *os.Process +} -func newProcessKiller(_ *os.Process) (*processKiller, error) { return &processKiller{}, nil } +func newProcessKiller(p *os.Process) (*processKiller, error) { + return &processKiller{p: p}, nil +} -func (k *processKiller) Kill() error { return nil } +func (k *processKiller) Kill() error { + if k == nil || k.p == nil { + return nil + } + return k.p.Kill() +} func (k *processKiller) Close() error { return nil } diff --git a/act/container/process_unix.go b/act/container/process_unix.go new file mode 100644 index 00000000..49b43525 --- /dev/null +++ b/act/container/process_unix.go @@ -0,0 +1,56 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows && !plan9 + +package container + +import ( + "errors" + "os" + "syscall" +) + +// processKiller terminates a step process together with its whole process +// group, which is the Unix counterpart of the Windows Job Object tree-kill. +// +// Background: a step often launches a process tree (a shell that starts a child +// which in turn spawns further background processes). The default +// exec.CommandContext cancellation only kills the direct child, so cancelling a +// job left the rest of the tree running. Because those orphans inherited the +// step's stdout/stderr pipe, cmd.Wait() also blocked forever and the runner +// hung. +// +// Steps are started with Setpgid (or Setsid for the PTY path, see +// getSysProcAttr), which makes the step process the leader of a new process +// group whose ID equals its PID. Signalling the negative PID delivers to every +// process still in that group, so we can tear down the whole tree atomically on +// cancellation, which also closes the inherited pipe handles so cmd.Wait() can +// return. +type processKiller struct { + pgid int +} + +// newProcessKiller captures the process group of p (an already-started +// process). Because the step is launched with Setpgid/Setsid, p is a group +// leader and its PGID equals its PID; children spawned afterwards stay in the +// same group unless they explicitly create their own. +func newProcessKiller(p *os.Process) (*processKiller, error) { + return &processKiller{pgid: p.Pid}, nil +} + +// Kill sends SIGKILL to the entire process group (the step process and every +// descendant that stayed in the group). A missing group (ESRCH) means the +// processes already exited and is not treated as an error. +func (k *processKiller) Kill() error { + if k == nil || k.pgid <= 0 { + return nil + } + if err := syscall.Kill(-k.pgid, syscall.SIGKILL); err != nil && !errors.Is(err, syscall.ESRCH) { + return err + } + return nil +} + +// Close is a no-op on Unix; there is no job handle to release. +func (k *processKiller) Close() error { return nil } diff --git a/act/container/process_unix_test.go b/act/container/process_unix_test.go new file mode 100644 index 00000000..02315592 --- /dev/null +++ b/act/container/process_unix_test.go @@ -0,0 +1,100 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows && !plan9 + +package container + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// processAlive reports whether pid refers to a still-running process. Signal 0 +// performs error checking without delivering a signal: a nil error (or EPERM) +// means the process exists, ESRCH means it is gone. +// +// On Linux, zombie processes (state Z in /proc//stat) appear alive to +// kill(0) but have already terminated — their corpse lingers until the parent +// calls wait(). In a Docker container the child may be reparented to a PID 1 +// that does not reap promptly, so we treat zombies as not alive. +func processAlive(pid int) bool { + err := syscall.Kill(pid, 0) + if err != nil { + return false + } + // On Linux /proc is available; check whether the process is a zombie. + if b, readErr := os.ReadFile(fmt.Sprintf("/proc/%d/stat", pid)); readErr == nil { + // Format: "pid (comm) state ..." — state follows the closing ')' of the + // command name (which may itself contain spaces and parens). + rest := string(b) + if idx := strings.LastIndex(rest, ") "); idx >= 0 { + fields := strings.Fields(rest[idx+2:]) + if len(fields) > 0 && fields[0] == "Z" { + return false // zombie: terminated but not yet reaped + } + } + } + return true +} + +// TestProcessKillerKillsTree verifies that a process group captured by the +// killer is terminated together with a child the step spawns afterwards. This +// mirrors a step that launches a child which spawns further processes, where +// cancelling the job must take down the whole tree, not just the direct child. +func TestProcessKillerKillsTree(t *testing.T) { + dir := t.TempDir() + pidFile := filepath.Join(dir, "child.pid") + + // Parent shell backgrounds a long-lived child (writing its PID to a file) + // and then sleeps. With job control off (non-interactive sh) the backgrounded + // child stays in the parent's process group, so the group kill must reach it. + script := fmt.Sprintf(`sleep 600 & echo $! > %q; sleep 600`, pidFile) + cmd := exec.Command("/bin/sh", "-c", script) + // Launch as its own process-group leader, exactly like a real step does (see + // getSysProcAttr), so the killer's PGID == the process PID. + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + require.NoError(t, cmd.Start()) + t.Cleanup(func() { + _ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + _ = cmd.Wait() + }) + + killer, err := newProcessKiller(cmd.Process) + require.NoError(t, err) + defer killer.Close() + + // Wait for the backgrounded child PID to be reported. + var childPID int + require.Eventually(t, func() bool { + b, e := os.ReadFile(pidFile) + if e != nil { + return false + } + s := strings.TrimSpace(string(b)) + if s == "" { + return false + } + childPID, _ = strconv.Atoi(s) + return childPID > 0 && processAlive(childPID) + }, 20*time.Second, 100*time.Millisecond, "child process should start") + + // Killing the group must terminate both the parent and the backgrounded child. + require.NoError(t, killer.Kill()) + // Reap the parent so it does not linger as a zombie (which would still report + // as alive); SIGKILL makes Wait return promptly. + _ = cmd.Wait() + + require.Eventually(t, func() bool { + return !processAlive(childPID) + }, 20*time.Second, 100*time.Millisecond, "backgrounded child should be terminated") +}