3 Commits

Author SHA1 Message Date
silverwind
594c9ade7c Align step failure log output with GitHub Actions (#927)
Fixes #926.

Before:

<img src="/attachments/a5ae9221-eee2-410a-964e-6103ce126df4" alt="image.png" width="400">

After:

<img width="400" alt="image.png" src="attachments/2f2d67c4-6080-4ec3-9ae5-df33e6479920">

Also gets rid of a bunch of emojis in the logging and the obsolete link to `nektos/act`, and aligns some other error messages.

---
This PR was written with the help of Claude Opus 4.7

---------

Co-authored-by: Nicolas <bircni@icloud.com>
Reviewed-on: https://gitea.com/gitea/runner/pulls/927
Reviewed-by: Nicolas <bircni@icloud.com>
Co-authored-by: silverwind <me@silverwind.io>
Co-committed-by: silverwind <me@silverwind.io>
2026-05-05 20:17:32 +00:00
Nicolas
2a4d56c650 feat: add startup janitor for stale bind-workdir task workspaces (#870)
- Add idle-time cleanup for stale bind-workdir task directories instead of cleaning them on the task execution path.
- Make cleanup behavior configurable with `runner.workdir_cleanup_age` as the stale-age threshold (default: `24h`) and `runner.idle_cleanup_interval` as the idle cleanup cadence (default: `10m`).
- Restrict cleanup scope to numeric task directory names only, to avoid touching operator-managed folders.
- Document the cleanup settings in `config.example.yaml` and `README.md`.
- Add tests for stale-directory cleanup, idle cleanup throttling, and config default/override parsing.

## Why

When a runner or host crashes, normal per-task cleanup may not run, leaving stale task directories under the bind-workdir root. Running this cleanup only while the runner is idle recovers that disk space without adding overhead to active job execution.

If you want, I can also tighten the wording around `startup_cleanup_age`, since the key name now reads a bit misleadingly relative to the actual behavior.

---------

Co-authored-by: silverwind <me@silverwind.io>
Reviewed-on: https://gitea.com/gitea/runner/pulls/870
Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
2026-05-05 20:11:44 +00:00
Nicolas
a22119cf88 fix(host): correct host workspace cleanup on Windows (#883)
## Summary
- Fix host-mode cleanup to remove the job **workspace** directory after a run (instead of leaving checkouts behind).
- On Windows, track step process PIDs and terminate remaining process trees during teardown before attempting workspace deletion (prevents file-lock failures).
- Skip workspace deletion when `bind_workdir` is enabled to avoid conflicting with runner-level task directory cleanup.

## Implementation details
- `HostEnvironment` now records PIDs for started commands and best-effort terminates them on Windows during `Remove()`.
- Workspace removal uses a small retry loop on Windows to handle transient locks.
- `BindWorkdir` is propagated into `HostEnvironment` so cleanup behavior matches runner configuration.

---------

Co-authored-by: silverwind <me@silverwind.io>
Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com>
Reviewed-on: https://gitea.com/gitea/runner/pulls/883
Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
2026-05-05 18:28:12 +00:00
16 changed files with 835 additions and 40 deletions

View File

@@ -132,6 +132,12 @@ Besides `GITEA_INSTANCE_URL` and `GITEA_RUNNER_REGISTRATION_TOKEN`, the image en
For a fuller container-oriented walkthrough, see [examples/docker](examples/docker/README.md).
When `container.bind_workdir` is enabled, stale task workspace directories can be cleaned while the runner is idle:
- directories older than `runner.workdir_cleanup_age` are removed (default: `24h`; set `0` to disable)
- cleanup runs every `runner.idle_cleanup_interval` (default: `10m`; set `0` to disable)
- only purely numeric subdirectories under `container.workdir_parent` are treated as task workspaces and may be removed
- cleanup assumes `container.workdir_parent` is not shared across multiple runners
### Example Deployments
Check out the [examples](examples) directory for sample deployment types.

View File

@@ -6,6 +6,7 @@ package container
import (
"context"
"fmt"
"io"
"gitea.com/gitea/runner/act/common"
@@ -13,6 +14,13 @@ import (
"github.com/docker/go-connections/nat"
)
// ExitCodeError reports a non-zero process exit code from a container command.
type ExitCodeError int
func (e ExitCodeError) Error() string {
return fmt.Sprintf("Process completed with exit code %d.", int(e))
}
// NewContainerInput the input for the New function
type NewContainerInput struct {
Image string

View File

@@ -633,14 +633,10 @@ func (cr *containerReference) exec(cmd []string, env map[string]string, user, wo
return fmt.Errorf("failed to inspect exec: %w", err)
}
switch inspectResp.ExitCode {
case 0:
if inspectResp.ExitCode == 0 {
return nil
case 127:
return fmt.Errorf("exitcode '%d': command not found, please refer to https://github.com/nektos/act/issues/107 for more information", inspectResp.ExitCode)
default:
return fmt.Errorf("exitcode '%d': failure", inspectResp.ExitCode)
}
return ExitCodeError(inspectResp.ExitCode)
}
}
@@ -930,7 +926,7 @@ func (cr *containerReference) wait() common.Executor {
return nil
}
return fmt.Errorf("exit with `FAILURE`: %v", statusCode)
return ExitCodeError(statusCode)
}
}

View File

@@ -23,6 +23,7 @@ import (
"github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)
func TestDocker(t *testing.T) {
@@ -85,6 +86,11 @@ func (m *mockDockerClient) ContainerExecInspect(ctx context.Context, execID stri
return args.Get(0).(types.ContainerExecInspect), args.Error(1)
}
func (m *mockDockerClient) ContainerWait(ctx context.Context, containerID string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) {
args := m.Called(ctx, containerID, condition)
return args.Get(0).(<-chan container.WaitResponse), args.Get(1).(<-chan error)
}
func (m *mockDockerClient) CopyToContainer(ctx context.Context, id, path string, content io.Reader, options types.CopyToContainerOptions) error {
args := m.Called(ctx, id, path, content, options)
return args.Error(0)
@@ -174,12 +180,43 @@ func TestDockerExecFailure(t *testing.T) {
}
err := cr.exec([]string{""}, map[string]string{}, "user", "workdir")(ctx)
assert.Error(t, err, "exit with `FAILURE`: 1") //nolint:testifylint // pre-existing issue from nektos/act
var exitErr ExitCodeError
require.ErrorAs(t, err, &exitErr)
assert.Equal(t, ExitCodeError(1), exitErr)
assert.Equal(t, "Process completed with exit code 1.", err.Error())
conn.AssertExpectations(t)
client.AssertExpectations(t)
}
func TestDockerWaitFailure(t *testing.T) {
ctx := context.Background()
statusCh := make(chan container.WaitResponse, 1)
statusCh <- container.WaitResponse{StatusCode: 2}
errCh := make(chan error, 1)
client := &mockDockerClient{}
client.On("ContainerWait", ctx, "123", container.WaitConditionNotRunning).
Return((<-chan container.WaitResponse)(statusCh), (<-chan error)(errCh))
cr := &containerReference{
id: "123",
cli: client,
input: &NewContainerInput{
Image: "image",
},
}
err := cr.wait()(ctx)
var exitErr ExitCodeError
require.ErrorAs(t, err, &exitErr)
assert.Equal(t, ExitCodeError(2), exitErr)
assert.Equal(t, "Process completed with exit code 2.", err.Error())
client.AssertExpectations(t)
}
func TestDockerCopyTarStream(t *testing.T) {
ctx := context.Background()

View File

@@ -16,7 +16,9 @@ import (
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"time"
"gitea.com/gitea/runner/act/common"
@@ -34,9 +36,15 @@ type HostEnvironment struct {
TmpDir string
ToolCache string
Workdir string
ActPath string
CleanUp func()
StdOut io.Writer
// BindWorkdir is true when the app runner mounts the workspace on the host and
// deletes the task directory after the job; host teardown must not remove Workdir.
BindWorkdir bool
ActPath string
CleanUp func()
StdOut io.Writer
mu sync.Mutex
runningPIDs map[int]struct{}
}
func (e *HostEnvironment) Create(_, _ []string) common.Executor {
@@ -344,8 +352,30 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
if ppty != nil {
go writeKeepAlive(ppty)
}
err = cmd.Run()
// Split Start/Wait so the PID can be registered before the process can exit;
// cmd.Run() would block until exit, by which time the PID may have been reused.
if err := cmd.Start(); err != nil {
return err
}
if cmd.Process != nil {
e.mu.Lock()
if e.runningPIDs == nil {
e.runningPIDs = map[int]struct{}{}
}
e.runningPIDs[cmd.Process.Pid] = struct{}{}
e.mu.Unlock()
defer func(pid int) {
e.mu.Lock()
delete(e.runningPIDs, pid)
e.mu.Unlock()
}(cmd.Process.Pid)
}
err = cmd.Wait()
if err != nil {
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
return ExitCodeError(exitErr.ExitCode())
}
return err
}
if tty != nil {
@@ -385,12 +415,83 @@ func (e *HostEnvironment) UpdateFromEnv(srcPath string, env *map[string]string)
return parseEnvFile(e, srcPath, env)
}
func removePathWithRetry(ctx context.Context, path string) error {
if path == "" {
return nil
}
attempts := 1
delay := time.Duration(0)
if runtime.GOOS == "windows" {
attempts = 5
delay = 200 * time.Millisecond
}
var lastErr error
for i := 0; i < attempts; i++ {
if i > 0 {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(delay):
}
}
lastErr = os.RemoveAll(path)
if lastErr == nil {
return nil
}
}
return lastErr
}
func (e *HostEnvironment) terminateRunningProcesses(ctx context.Context) {
if runtime.GOOS != "windows" {
return
}
e.mu.Lock()
pids := make([]int, 0, len(e.runningPIDs))
for pid := range e.runningPIDs {
pids = append(pids, pid)
}
e.mu.Unlock()
if len(pids) == 0 {
return
}
logger := common.Logger(ctx)
for _, pid := range pids {
// Best-effort: forcibly terminate process tree to release file handles
// so that workspace cleanup can succeed on Windows.
cmd := exec.CommandContext(ctx, "taskkill", "/PID", strconv.Itoa(pid), "/T", "/F")
out, err := cmd.CombinedOutput()
if err != nil {
logger.Debugf("taskkill failed for pid=%d: %v output=%s", pid, err, strings.TrimSpace(string(out)))
}
}
}
func (e *HostEnvironment) Remove() common.Executor {
return func(ctx context.Context) error {
// Ensure any lingering child processes are ended before attempting
// to remove the workspace (Windows file locks otherwise prevent cleanup).
e.terminateRunningProcesses(ctx)
// Only removes per-job misc state. Must not remove the cache/toolcache root.
if e.CleanUp != nil {
e.CleanUp()
}
return os.RemoveAll(e.Path)
logger := common.Logger(ctx)
var errs []error
if err := removePathWithRetry(ctx, e.Path); err != nil {
logger.Warnf("failed to remove host misc state %s: %v", e.Path, err)
errs = append(errs, err)
}
if !e.BindWorkdir && e.Workdir != "" {
if err := removePathWithRetry(ctx, e.Workdir); err != nil {
logger.Warnf("failed to remove host workspace %s: %v", e.Workdir, err)
errs = append(errs, err)
}
}
return errors.Join(errs...)
}
}

View File

@@ -11,9 +11,14 @@ import (
"os"
"path"
"path/filepath"
"runtime"
"testing"
"gitea.com/gitea/runner/act/common"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// Type assert HostEnvironment implements ExecutionsEnvironment
@@ -69,3 +74,76 @@ func TestGetContainerArchive(t *testing.T) {
_, err = reader.Next()
assert.ErrorIs(t, err, io.EOF)
}
func TestHostEnvironmentExecExitCode(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("uses POSIX shell")
}
dir := t.TempDir()
ctx := context.Background()
e := &HostEnvironment{
Path: filepath.Join(dir, "path"),
TmpDir: filepath.Join(dir, "tmp"),
ToolCache: filepath.Join(dir, "tool_cache"),
ActPath: filepath.Join(dir, "act_path"),
StdOut: io.Discard,
Workdir: filepath.Join(dir, "path"),
}
for _, p := range []string{e.Path, e.TmpDir, e.ToolCache, e.ActPath} {
assert.NoError(t, os.MkdirAll(p, 0o700)) //nolint:testifylint // test setup
}
err := e.Exec([]string{"sh", "-c", "exit 3"}, map[string]string{"PATH": os.Getenv("PATH")}, "", "")(ctx)
var exitErr ExitCodeError
require.ErrorAs(t, err, &exitErr)
assert.Equal(t, ExitCodeError(3), exitErr)
assert.Equal(t, "Process completed with exit code 3.", err.Error())
}
func TestHostEnvironmentRemoveCleansWorkdir(t *testing.T) {
logger := logrus.New()
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
base := t.TempDir()
miscRoot := filepath.Join(base, "misc")
path := filepath.Join(miscRoot, "hostexecutor")
require.NoError(t, os.MkdirAll(path, 0o700))
workdir := filepath.Join(base, "workspace", "owner", "repo")
require.NoError(t, os.MkdirAll(workdir, 0o700))
e := &HostEnvironment{
Path: path,
Workdir: workdir,
BindWorkdir: false,
CleanUp: func() {
_ = os.RemoveAll(miscRoot)
},
StdOut: os.Stdout,
}
require.NoError(t, e.Remove()(ctx))
_, err := os.Stat(workdir)
assert.ErrorIs(t, err, os.ErrNotExist)
}
func TestHostEnvironmentRemoveSkipsWorkdirWhenBindWorkdir(t *testing.T) {
logger := logrus.New()
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
base := t.TempDir()
miscRoot := filepath.Join(base, "misc")
path := filepath.Join(miscRoot, "hostexecutor")
require.NoError(t, os.MkdirAll(path, 0o700))
workdir := filepath.Join(base, "workspace", "123", "owner", "repo")
require.NoError(t, os.MkdirAll(workdir, 0o700))
e := &HostEnvironment{
Path: path,
Workdir: workdir,
BindWorkdir: true,
CleanUp: func() {
_ = os.RemoveAll(miscRoot)
},
StdOut: os.Stdout,
}
require.NoError(t, e.Remove()(ctx))
_, err := os.Stat(workdir)
require.NoError(t, err)
}

View File

@@ -24,6 +24,13 @@ type jobInfo interface {
result(result string)
}
// reportStepError emits the GitHub Actions ##[error] annotation and records
// the error against the job so the job is reported as failed.
func reportStepError(ctx context.Context, err error) {
common.Logger(ctx).Errorf("##[error]%v", err)
common.SetJobError(ctx, err)
}
func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executor {
steps := make([]common.Executor, 0)
preSteps := make([]common.Executor, 0)
@@ -32,7 +39,7 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
steps = append(steps, func(ctx context.Context) error {
logger := common.Logger(ctx)
if len(info.matrix()) > 0 {
logger.Infof("\U0001F9EA Matrix: %v", info.matrix())
logger.Infof("Matrix: %v", info.matrix())
}
return nil
})
@@ -75,33 +82,36 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
preExec := step.pre()
preSteps = append(preSteps, useStepLogger(rc, stepModel, stepStagePre, func(ctx context.Context) error {
logger := common.Logger(ctx)
preErr := preExec(ctx)
if preErr != nil {
logger.Errorf("%v", preErr)
common.SetJobError(ctx, preErr)
reportStepError(ctx, preErr)
} else if ctx.Err() != nil {
logger.Errorf("%v", ctx.Err())
common.SetJobError(ctx, ctx.Err())
reportStepError(ctx, ctx.Err())
}
return preErr
}))
stepExec := step.main()
steps = append(steps, useStepLogger(rc, stepModel, stepStageMain, func(ctx context.Context) error {
logger := common.Logger(ctx)
err := stepExec(ctx)
if err != nil {
logger.Errorf("%v", err)
common.SetJobError(ctx, err)
reportStepError(ctx, err)
} else if ctx.Err() != nil {
logger.Errorf("%v", ctx.Err())
common.SetJobError(ctx, ctx.Err())
reportStepError(ctx, ctx.Err())
}
return nil
}))
postExec := useStepLogger(rc, stepModel, stepStagePost, step.post())
postFn := step.post()
postExec := useStepLogger(rc, stepModel, stepStagePost, func(ctx context.Context) error {
err := postFn(ctx)
if err != nil {
reportStepError(ctx, err)
} else if ctx.Err() != nil {
reportStepError(ctx, ctx.Err())
}
return err
})
if postExecutor != nil {
// run the post executor in reverse order
postExecutor = postExec.Finally(postExecutor)
@@ -196,7 +206,7 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo
jobResultMessage = "failed"
}
logger.WithField("jobResult", jobResult).Infof("\U0001F3C1 Job %s", jobResultMessage)
logger.WithField("jobResult", jobResult).Infof("Job %s", jobResultMessage)
}
func setJobOutputs(ctx context.Context, rc *RunContext) {

View File

@@ -220,11 +220,12 @@ func (rc *RunContext) startHostEnvironment() common.Executor {
}
toolCache := filepath.Join(cacheDir, "tool_cache")
rc.JobContainer = &container.HostEnvironment{
Path: path,
TmpDir: runnerTmp,
ToolCache: toolCache,
Workdir: rc.Config.Workdir,
ActPath: actPath,
Path: path,
TmpDir: runnerTmp,
ToolCache: toolCache,
Workdir: rc.Config.Workdir,
BindWorkdir: rc.Config.BindWorkdir,
ActPath: actPath,
CleanUp: func() {
os.RemoveAll(miscpath)
},
@@ -729,7 +730,7 @@ func (rc *RunContext) isEnabled(ctx context.Context) (bool, error) {
jobType, jobTypeErr := job.Type()
if runJobErr != nil {
return false, fmt.Errorf(" \u274C Error in if-expression: \"if: %s\" (%s)", job.If.Value, runJobErr)
return false, fmt.Errorf("if-expression %q evaluation failed: %s", job.If.Value, runJobErr)
}
if jobType == model.JobTypeInvalid {

View File

@@ -107,7 +107,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
if strings.Contains(stepString, "::add-mask::") {
stepString = "add-mask command"
}
logger.Infof("\u2B50 Run %s %s", stage, stepString)
logger.Infof("Run %s %s", stage, stepString)
// Prepare and clean Runner File Commands
actPath := rc.JobContainer.GetActPath()
@@ -158,7 +158,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
err = executor(timeoutctx)
if err == nil {
logger.WithField("stepResult", stepResult.Outcome).Infof(" \u2705 Success - %s %s", stage, stepString)
logger.WithField("stepResult", stepResult.Outcome).Infof("Success - %s %s", stage, stepString)
} else {
stepResult.Outcome = model.StepStatusFailure
@@ -169,6 +169,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
}
if continueOnError {
logger.Errorf("##[error]%v", err)
logger.Infof("Failed but continue next step")
err = nil
stepResult.Conclusion = model.StepStatusSuccess
@@ -176,7 +177,9 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
stepResult.Conclusion = model.StepStatusFailure
}
logger.WithField("stepResult", stepResult.Outcome).Errorf(" \u274C Failure - %s %s", stage, stepString)
// Infof: Errorf entries are promoted to the user log by the reporter,
// which would duplicate the ##[error] annotation emitted elsewhere.
logger.WithField("stepResult", stepResult.Outcome).Infof("Failure - %s %s", stage, stepString)
}
// Process Runner File Commands
orgerr := err
@@ -268,7 +271,7 @@ func isStepEnabled(ctx context.Context, expr string, step step, stage stepStage)
runStep, err := EvalBool(ctx, rc.NewStepExpressionEvaluator(ctx, step), expr, defaultStatusCheck)
if err != nil {
return false, fmt.Errorf(" \u274C Error in if-expression: \"if: %s\" (%s)", expr, err)
return false, fmt.Errorf("if-expression %q evaluation failed: %s", expr, err)
}
return runStep, nil
@@ -284,7 +287,7 @@ func isContinueOnError(ctx context.Context, expr string, step step, _ stepStage)
continueOnError, err := EvalBool(ctx, rc.NewStepExpressionEvaluator(ctx, step), expr, exprparser.DefaultStatusCheckNone)
if err != nil {
return false, fmt.Errorf(" \u274C Error in continue-on-error-expression: \"continue-on-error: %s\" (%s)", expr, err)
return false, fmt.Errorf("continue-on-error expression %q evaluation failed: %s", expr, err)
}
return continueOnError, nil

View File

@@ -27,6 +27,11 @@ type TaskRunner interface {
Run(ctx context.Context, task *runnerv1.Task) error
}
// IdleRunner can run maintenance while the poller is idle.
type IdleRunner interface {
OnIdle(ctx context.Context)
}
type Poller struct {
client client.Client
runner TaskRunner
@@ -95,6 +100,7 @@ func (p *Poller) Poll() {
task, ok := p.fetchTask(p.pollingCtx, s)
if !ok {
p.runIdleMaintenance()
<-sem
if !p.waitBackoff(s) {
return
@@ -119,6 +125,7 @@ func (p *Poller) PollOnce() {
for {
task, ok := p.fetchTask(p.pollingCtx, s)
if !ok {
p.runIdleMaintenance()
if !p.waitBackoff(s) {
return
}
@@ -130,6 +137,12 @@ func (p *Poller) PollOnce() {
}
}
func (p *Poller) runIdleMaintenance() {
if idleRunner, ok := p.runner.(IdleRunner); ok {
idleRunner.OnIdle(p.jobsCtx)
}
}
func (p *Poller) Shutdown(ctx context.Context) error {
p.shutdownPolling()

View File

@@ -125,6 +125,11 @@ type mockRunner struct {
totalCompleted atomic.Int64
}
type idleAwareRunner struct {
mockRunner
idleCalls atomic.Int64
}
func (m *mockRunner) Run(ctx context.Context, _ *runnerv1.Task) error {
atomicMax(&m.maxConcurrent, m.running.Add(1))
select {
@@ -136,6 +141,78 @@ func (m *mockRunner) Run(ctx context.Context, _ *runnerv1.Task) error {
return nil
}
func TestPollerRunIdleMaintenance(t *testing.T) {
runner := &idleAwareRunner{}
p := &Poller{runner: runner, jobsCtx: context.Background()}
p.runIdleMaintenance()
assert.Equal(t, int64(1), runner.idleCalls.Load())
}
func (m *idleAwareRunner) OnIdle(_ context.Context) {
m.idleCalls.Add(1)
}
func TestPollerPollCallsOnIdle(t *testing.T) {
cli := mocks.NewClient(t)
cli.On("FetchTask", mock.Anything, mock.Anything).Return(
func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) {
return connect_go.NewResponse(&runnerv1.FetchTaskResponse{}), nil
},
)
cfg, err := config.LoadDefault("")
require.NoError(t, err)
cfg.Runner.Capacity = 1
cfg.Runner.FetchInterval = 10 * time.Millisecond
cfg.Runner.FetchIntervalMax = 10 * time.Millisecond
runner := &idleAwareRunner{}
poller := New(cfg, cli, runner)
var wg sync.WaitGroup
wg.Go(poller.Poll)
require.Eventually(t, func() bool {
return runner.idleCalls.Load() > 0
}, time.Second, 10*time.Millisecond)
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
require.NoError(t, poller.Shutdown(ctx))
wg.Wait()
}
func TestPollerPollOnceCallsOnIdle(t *testing.T) {
cli := mocks.NewClient(t)
cli.On("FetchTask", mock.Anything, mock.Anything).Return(
func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) {
return connect_go.NewResponse(&runnerv1.FetchTaskResponse{}), nil
},
)
cfg, err := config.LoadDefault("")
require.NoError(t, err)
cfg.Runner.FetchInterval = 10 * time.Millisecond
cfg.Runner.FetchIntervalMax = 10 * time.Millisecond
runner := &idleAwareRunner{}
poller := New(cfg, cli, runner)
var wg sync.WaitGroup
wg.Go(poller.PollOnce)
require.Eventually(t, func() bool {
return runner.idleCalls.Load() > 0
}, time.Second, 10*time.Millisecond)
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
require.NoError(t, poller.Shutdown(ctx))
wg.Wait()
}
// TestPoller_ConcurrencyLimitedByCapacity verifies that with capacity=3 and
// 6 available tasks, at most 3 tasks run concurrently, and FetchTask is
// never called concurrently (single poller).

View File

@@ -7,11 +7,14 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"maps"
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
@@ -45,8 +48,10 @@ type Runner struct {
envs map[string]string
cacheHandler *artifactcache.Handler
runningTasks sync.Map
runningCount atomic.Int64
runningTasks sync.Map
runningCount atomic.Int64
lastIdleCleanupUnixNano atomic.Int64
now func() time.Time
}
func NewRunner(cfg *config.Config, reg *config.Registration, cli client.Client) *Runner {
@@ -89,13 +94,94 @@ func NewRunner(cfg *config.Config, reg *config.Registration, cli client.Client)
envs["GITEA_ACTIONS"] = "true"
envs["GITEA_ACTIONS_RUNNER_VERSION"] = ver.Version()
return &Runner{
runner := &Runner{
name: reg.Name,
cfg: cfg,
client: cli,
labels: ls,
envs: envs,
cacheHandler: cacheHandler,
now: time.Now,
}
return runner
}
// OnIdle performs lightweight maintenance during polling idle windows.
// It runs synchronously on the poller goroutine; shouldRunIdleCleanup
// throttles invocations to runner.idle_cleanup_interval so the impact on
// poll cadence is bounded even when the workdir root is large.
func (r *Runner) OnIdle(ctx context.Context) {
if !r.shouldRunIdleCleanup() {
return
}
workdirParent := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/")
workdirRoot := filepath.FromSlash("/" + workdirParent)
r.cleanupStaleTaskDirs(ctx, workdirRoot)
}
func (r *Runner) shouldRunIdleCleanup() bool {
if !r.cfg.Container.BindWorkdir {
return false
}
if r.cfg.Runner.WorkdirCleanupAge <= 0 || r.cfg.Runner.IdleCleanupInterval <= 0 {
return false
}
if r.RunningCount() != 0 {
return false
}
now := r.now()
interval := r.cfg.Runner.IdleCleanupInterval
for {
last := r.lastIdleCleanupUnixNano.Load()
if last != 0 && now.Sub(time.Unix(0, last)) < interval {
return false
}
if r.lastIdleCleanupUnixNano.CompareAndSwap(last, now.UnixNano()) {
return true
}
}
}
func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) {
entries, err := os.ReadDir(workdirRoot)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
return
}
log.Warnf("failed to list task workspace root %s for stale cleanup: %v", workdirRoot, err)
return
}
// A task may begin between shouldRunIdleCleanup's running-count check and
// the loop below. That is safe because new task dirs are created with the
// current mtime and therefore fall on the keep side of cutoff.
cutoff := r.now().Add(-r.cfg.Runner.WorkdirCleanupAge)
for _, entry := range entries {
if err := ctx.Err(); err != nil {
return
}
if !entry.IsDir() {
continue
}
// Task workspaces are indexed by numeric task IDs; skip any other
// directories to avoid deleting operator-managed data under workdir_root.
if _, err := strconv.ParseUint(entry.Name(), 10, 64); err != nil {
continue
}
info, err := entry.Info()
if err != nil {
log.Warnf("failed to stat task workspace %s: %v", filepath.Join(workdirRoot, entry.Name()), err)
continue
}
if info.ModTime().After(cutoff) {
continue
}
taskDir := filepath.Join(workdirRoot, entry.Name())
if err := os.RemoveAll(taskDir); err != nil {
log.Warnf("failed to clean stale task workspace %s: %v", taskDir, err)
continue
}
log.Infof("cleaned stale task workspace %s", taskDir)
}
}
@@ -238,6 +324,13 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report.
workdirParent = fmt.Sprintf("%s/%d", workdirParent, task.Id)
}
workdir := filepath.FromSlash(fmt.Sprintf("/%s/%s", workdirParent, preset.Repository))
if runtime.GOOS == "windows" {
if abs, err := filepath.Abs(workdir); err == nil {
workdir = abs
}
}
// Without bind_workdir, the workspace path omits the task id; concurrent host-mode jobs
// for the same repository would share this directory and can race with per-job cleanup.
runnerConfig := &runner.Config{
// On Linux, Workdir will be like "/<parent_directory>/<owner>/<repo>"

View File

@@ -0,0 +1,247 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package run
import (
"context"
"os"
"path/filepath"
"strconv"
"testing"
"time"
"gitea.com/gitea/runner/internal/pkg/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRunnerCleanupStaleTaskDirs(t *testing.T) {
now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
workdirRoot := filepath.Join(t.TempDir(), "workspace")
require.NoError(t, os.MkdirAll(workdirRoot, 0o700))
oldTask := filepath.Join(workdirRoot, "1001")
freshTask := filepath.Join(workdirRoot, "1002")
nonTask := filepath.Join(workdirRoot, "shared")
alphaNumericTask := filepath.Join(workdirRoot, "123abc")
for _, path := range []string{oldTask, freshTask, nonTask, alphaNumericTask} {
require.NoError(t, os.MkdirAll(path, 0o700))
}
require.NoError(t, os.Chtimes(oldTask, now.Add(-3*time.Hour), now.Add(-3*time.Hour)))
require.NoError(t, os.Chtimes(freshTask, now.Add(-30*time.Minute), now.Add(-30*time.Minute)))
require.NoError(t, os.Chtimes(nonTask, now.Add(-5*time.Hour), now.Add(-5*time.Hour)))
require.NoError(t, os.Chtimes(alphaNumericTask, now.Add(-5*time.Hour), now.Add(-5*time.Hour)))
r := &Runner{
cfg: &config.Config{
Runner: config.Runner{
WorkdirCleanupAge: 2 * time.Hour,
},
},
now: func() time.Time { return now },
}
r.cleanupStaleTaskDirs(context.Background(), workdirRoot)
assert.NoDirExists(t, oldTask)
assert.DirExists(t, freshTask)
assert.DirExists(t, nonTask)
assert.DirExists(t, alphaNumericTask)
}
func TestRunnerCleanupStaleTaskDirsMissingRoot(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Runner: config.Runner{WorkdirCleanupAge: time.Hour},
},
now: time.Now,
}
// Must be a silent no-op rather than a warning or panic when the root
// has not yet been created (e.g. the runner has never executed a task).
r.cleanupStaleTaskDirs(context.Background(), filepath.Join(t.TempDir(), "missing"))
}
func TestRunnerCleanupStaleTaskDirsHonorsContext(t *testing.T) {
now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
workdirRoot := filepath.Join(t.TempDir(), "workspace")
require.NoError(t, os.MkdirAll(workdirRoot, 0o700))
for i := 1001; i <= 1003; i++ {
dir := filepath.Join(workdirRoot, strconv.Itoa(i))
require.NoError(t, os.MkdirAll(dir, 0o700))
require.NoError(t, os.Chtimes(dir, now.Add(-3*time.Hour), now.Add(-3*time.Hour)))
}
ctx, cancel := context.WithCancel(context.Background())
cancel()
r := &Runner{
cfg: &config.Config{
Runner: config.Runner{WorkdirCleanupAge: time.Hour},
},
now: func() time.Time { return now },
}
r.cleanupStaleTaskDirs(ctx, workdirRoot)
for i := 1001; i <= 1003; i++ {
assert.DirExists(t, filepath.Join(workdirRoot, strconv.Itoa(i)))
}
}
func TestRunnerShouldRunIdleCleanupThrottles(t *testing.T) {
now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
r := &Runner{
cfg: &config.Config{
Container: config.Container{
BindWorkdir: true,
},
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: time.Hour,
},
},
now: func() time.Time { return now },
}
assert.True(t, r.shouldRunIdleCleanup())
now = now.Add(30 * time.Minute)
assert.False(t, r.shouldRunIdleCleanup())
now = now.Add(31 * time.Minute)
assert.True(t, r.shouldRunIdleCleanup())
}
func TestRunnerShouldRunIdleCleanupSkipsWhenJobRunning(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Container: config.Container{
BindWorkdir: true,
},
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: time.Minute,
},
},
now: time.Now,
}
r.runningCount.Store(1)
assert.False(t, r.shouldRunIdleCleanup())
}
func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: time.Minute,
},
},
now: time.Now,
}
assert.False(t, r.shouldRunIdleCleanup())
}
func TestRunnerShouldRunIdleCleanupSkipsWhenDisabled(t *testing.T) {
now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
t.Run("cleanup age disabled", func(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Container: config.Container{
BindWorkdir: true,
},
Runner: config.Runner{
WorkdirCleanupAge: -1,
IdleCleanupInterval: time.Minute,
},
},
now: func() time.Time { return now },
}
assert.False(t, r.shouldRunIdleCleanup())
})
t.Run("idle interval disabled", func(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Container: config.Container{
BindWorkdir: true,
},
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: -1,
},
},
now: func() time.Time { return now },
}
assert.False(t, r.shouldRunIdleCleanup())
})
}
// TestRunnerOnIdleIntegratesCleanup wires the full OnIdle entry point and
// confirms it walks workdir_parent (after the leading-slash trim that
// matches the production path construction) and removes stale numeric dirs.
func TestRunnerOnIdleIntegratesCleanup(t *testing.T) {
now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
root := t.TempDir()
stale := filepath.Join(root, "1234")
require.NoError(t, os.MkdirAll(stale, 0o700))
require.NoError(t, os.Chtimes(stale, now.Add(-48*time.Hour), now.Add(-48*time.Hour)))
r := &Runner{
cfg: &config.Config{
Container: config.Container{
BindWorkdir: true,
WorkdirParent: root, // leading slash absent, OnIdle reattaches it
},
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: time.Minute,
},
},
now: func() time.Time { return now },
}
r.OnIdle(context.Background())
assert.NoDirExists(t, stale)
}
// TestRunnerOnIdleSkipsWhenAlreadyCancelled verifies a pre-cancelled ctx
// short-circuits cleanup before any directory entry is touched.
func TestRunnerOnIdleSkipsWhenAlreadyCancelled(t *testing.T) {
	fixed := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
	parent := t.TempDir()

	// Seed a stale numeric directory that would be removed by a normal pass.
	staleDir := filepath.Join(parent, "1234")
	require.NoError(t, os.MkdirAll(staleDir, 0o700))
	aged := fixed.Add(-48 * time.Hour)
	require.NoError(t, os.Chtimes(staleDir, aged, aged))

	r := &Runner{
		cfg: &config.Config{
			Container: config.Container{
				BindWorkdir:   true,
				WorkdirParent: parent,
			},
			Runner: config.Runner{
				WorkdirCleanupAge:   24 * time.Hour,
				IdleCleanupInterval: time.Minute,
			},
		},
		now: func() time.Time { return fixed },
	}

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // the context is already done before OnIdle runs

	r.OnIdle(ctx)
	// Cleanup must have bailed out, leaving the stale directory in place.
	assert.DirExists(t, staleDir)
}

View File

@@ -40,6 +40,12 @@ runner:
# The runner uses exponential backoff when idle, increasing the interval up to this maximum.
# Set to 0 or same as fetch_interval to disable backoff.
fetch_interval_max: 5s
# While idle, remove stale bind-workdir task directories older than this duration.
# Setting either workdir_cleanup_age or idle_cleanup_interval to 0 (or any
# non-positive value) disables workdir cleanup entirely.
workdir_cleanup_age: 24h
# How often the runner performs the stale bind-workdir cleanup pass while it is idle.
idle_cleanup_interval: 10m
# The base interval for periodic log flush to the Gitea instance.
# Logs may be sent earlier if the buffer reaches log_report_batch_size
# or if log_report_max_latency expires after the first buffered row.
@@ -107,6 +113,7 @@ container:
# If the path starts with '/', the '/' will be trimmed.
# For example, if the parent directory is /path/to/my/dir, workdir_parent should be path/to/my/dir
# If it's empty, /workspace will be used.
# Purely numeric subdirectories under this path are reserved for task workspaces and may be removed by idle cleanup.
workdir_parent:
# Volumes (including bind mounts) can be mounted to containers. Glob syntax is supported, see https://github.com/gobwas/glob
# You can specify multiple volumes. If the sequence is empty, no volumes can be mounted.

View File

@@ -33,6 +33,8 @@ type Runner struct {
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle.
WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories older than this duration during idle cleanup.
IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale bind-workdir cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence.
LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush.
LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size.
@@ -92,6 +94,7 @@ type Config struct {
// If file is not empty, it will be used to load the configuration.
func LoadDefault(file string) (*Config, error) {
cfg := &Config{}
definedRunnerKeys := map[string]bool{}
if file != "" {
content, err := os.ReadFile(file)
if err != nil {
@@ -100,6 +103,10 @@ func LoadDefault(file string) (*Config, error) {
if err := yaml.Unmarshal(content, cfg); err != nil {
return nil, fmt.Errorf("parse config file %q: %w", file, err)
}
definedRunnerKeys, err = definedRunnerConfigKeys(content)
if err != nil {
return nil, fmt.Errorf("parse config file %q for defaults metadata: %w", file, err)
}
}
compatibleWithOldEnvs(file != "", cfg)
@@ -157,6 +164,12 @@ func LoadDefault(file string) (*Config, error) {
if cfg.Runner.FetchIntervalMax <= 0 {
cfg.Runner.FetchIntervalMax = 5 * time.Second
}
if cfg.Runner.WorkdirCleanupAge == 0 && !definedRunnerKeys["workdir_cleanup_age"] {
cfg.Runner.WorkdirCleanupAge = 24 * time.Hour
}
if cfg.Runner.IdleCleanupInterval == 0 && !definedRunnerKeys["idle_cleanup_interval"] {
cfg.Runner.IdleCleanupInterval = 10 * time.Minute
}
if cfg.Runner.LogReportInterval <= 0 {
cfg.Runner.LogReportInterval = 5 * time.Second
}
@@ -199,3 +212,30 @@ func LoadDefault(file string) (*Config, error) {
return cfg, nil
}
// definedRunnerConfigKeys reports which keys appear explicitly under the
// top-level "runner" mapping of the YAML config content. LoadDefault uses
// this to distinguish "key absent" (apply the default) from "key set to a
// zero value" (user explicitly disabled the feature), which plain struct
// unmarshalling cannot express for zero-valued durations.
func definedRunnerConfigKeys(content []byte) (map[string]bool, error) {
	var root yaml.Node
	if err := yaml.Unmarshal(content, &root); err != nil {
		return nil, err
	}
	defined := map[string]bool{}
	// An empty document has no content nodes at all.
	if len(root.Content) == 0 {
		return defined, nil
	}
	doc := root.Content[0]
	// Guard against non-mapping document roots (e.g. a top-level sequence
	// or scalar): their Content is not laid out as key/value pairs, so
	// iterating it in steps of two would misread items as keys.
	if doc.Kind != yaml.MappingNode {
		return defined, nil
	}
	// Mapping nodes store keys and values as alternating Content entries.
	for i := 0; i+1 < len(doc.Content); i += 2 {
		key := doc.Content[i]
		value := doc.Content[i+1]
		if key.Value != "runner" || value.Kind != yaml.MappingNode {
			continue
		}
		for j := 0; j+1 < len(value.Content); j += 2 {
			defined[value.Content[j].Value] = true
		}
		break
	}
	return defined, nil
}

View File

@@ -7,6 +7,7 @@ import (
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -39,3 +40,80 @@ cache:
_, err := LoadDefault(path)
require.NoError(t, err)
}
// TestLoadDefault_DefaultsWorkdirCleanupAge checks that loading with no
// config file applies the documented cleanup defaults (24h age, 10m cadence).
func TestLoadDefault_DefaultsWorkdirCleanupAge(t *testing.T) {
	loaded, err := LoadDefault("")
	require.NoError(t, err)
	assert.Equal(t, 24*time.Hour, loaded.Runner.WorkdirCleanupAge)
	assert.Equal(t, 10*time.Minute, loaded.Runner.IdleCleanupInterval)
}
// TestLoadDefault_UsesConfiguredWorkdirCleanupAge checks that an explicit
// non-zero workdir_cleanup_age wins over the 24h default.
func TestLoadDefault_UsesConfiguredWorkdirCleanupAge(t *testing.T) {
	cfgPath := filepath.Join(t.TempDir(), "config.yaml")
	require.NoError(t, os.WriteFile(cfgPath, []byte(`
runner:
  workdir_cleanup_age: 2h30m
`), 0o600))

	loaded, err := LoadDefault(cfgPath)
	require.NoError(t, err)
	assert.Equal(t, 150*time.Minute, loaded.Runner.WorkdirCleanupAge)
}
// TestLoadDefault_UsesConfiguredIdleCleanupInterval checks that an explicit
// non-zero idle_cleanup_interval wins over the 10m default.
func TestLoadDefault_UsesConfiguredIdleCleanupInterval(t *testing.T) {
	cfgPath := filepath.Join(t.TempDir(), "config.yaml")
	require.NoError(t, os.WriteFile(cfgPath, []byte(`
runner:
  idle_cleanup_interval: 45m
`), 0o600))

	loaded, err := LoadDefault(cfgPath)
	require.NoError(t, err)
	assert.Equal(t, 45*time.Minute, loaded.Runner.IdleCleanupInterval)
}
// TestLoadDefault_AllowsDisablingWorkdirCleanup checks that explicit zero
// values survive loading instead of being replaced by the defaults —
// i.e. a user can turn cleanup off with 0s.
func TestLoadDefault_AllowsDisablingWorkdirCleanup(t *testing.T) {
	cfgPath := filepath.Join(t.TempDir(), "config.yaml")
	require.NoError(t, os.WriteFile(cfgPath, []byte(`
runner:
  workdir_cleanup_age: 0s
  idle_cleanup_interval: 0s
`), 0o600))

	loaded, err := LoadDefault(cfgPath)
	require.NoError(t, err)
	assert.Equal(t, time.Duration(0), loaded.Runner.WorkdirCleanupAge)
	assert.Equal(t, time.Duration(0), loaded.Runner.IdleCleanupInterval)
}
// TestLoadDefault_AllowsNegativeWorkdirCleanupValues checks that negative
// durations pass through untouched; any non-positive value disables cleanup.
func TestLoadDefault_AllowsNegativeWorkdirCleanupValues(t *testing.T) {
	cfgPath := filepath.Join(t.TempDir(), "config.yaml")
	require.NoError(t, os.WriteFile(cfgPath, []byte(`
runner:
  workdir_cleanup_age: -1s
  idle_cleanup_interval: -1s
`), 0o600))

	loaded, err := LoadDefault(cfgPath)
	require.NoError(t, err)
	assert.Equal(t, -1*time.Second, loaded.Runner.WorkdirCleanupAge)
	assert.Equal(t, -1*time.Second, loaded.Runner.IdleCleanupInterval)
}
// TestLoadDefault_MalformedYAMLReturnsParseError pins the error surfaced for
// invalid YAML to the canonical "parse config file" message rather than the
// "for defaults metadata" variant — i.e. the main yaml.Unmarshal runs first.
func TestLoadDefault_MalformedYAMLReturnsParseError(t *testing.T) {
	cfgPath := filepath.Join(t.TempDir(), "config.yaml")
	broken := []byte("runner:\n capacity: [unterminated\n")
	require.NoError(t, os.WriteFile(cfgPath, broken, 0o600))

	_, err := LoadDefault(cfgPath)
	require.Error(t, err)
	assert.Contains(t, err.Error(), "parse config file")
	assert.NotContains(t, err.Error(), "defaults metadata")
}