diff --git a/act/runner/cancellation_test.go b/act/runner/cancellation_test.go new file mode 100644 index 00000000..ec2d7afe --- /dev/null +++ b/act/runner/cancellation_test.go @@ -0,0 +1,285 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package runner + +import ( + "context" + "testing" + "time" + + "gitea.com/gitea/runner/act/common" + "gitea.com/gitea/runner/act/exprparser" + "gitea.com/gitea/runner/act/model" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.yaml.in/yaml/v4" +) + +// TestCancelledJobStatusEnablesAlwaysAndCancelledSteps verifies that once a job is +// cancelled, getJobContext reports the "cancelled" status so the step `if` functions +// evaluate the way GitHub Actions does: cancelled()/always() are true, success()/failure() +// are false. A step that defaults to success() is therefore skipped while an always() step +// still runs. Before the fix the status could only ever be success/failure, so cancelled() +// was structurally impossible and cancel-only cleanup steps never ran. +func TestCancelledJobStatusEnablesAlwaysAndCancelledSteps(t *testing.T) { + rc := createIfTestRunContext(map[string]*model.Job{ + "job1": createJob(t, `runs-on: ubuntu-latest`, ""), + }) + rc.markCancelled() + + // The core fix: the job status context now reports "cancelled" instead of being + // pinned to success/failure. + jobCtx := rc.getJobContext() + require.Equal(t, "cancelled", jobCtx.Status) + + // Feed that status through the step-context expression functions, which is what a + // step `if` evaluates. On a cancelled job only always()/cancelled() are true. + interp := exprparser.NewInterpeter( + &exprparser.EvaluationEnvironment{Job: jobCtx}, + exprparser.Config{Context: "step"}, + ) + for expr, want := range map[string]bool{ + "cancelled()": true, + "always()": true, + "success()": false, + "failure()": false, + "!cancelled()": false, + } { + got, err := interp.Evaluate(expr, exprparser.DefaultStatusCheckNone) + require.NoErrorf(t, err, "Evaluate(%q)", expr) + assert.Equalf(t, want, got, "Evaluate(%q) on a cancelled job", expr) + } + + // A step without an `if` defaults to success() and must be skipped on cancel, + // while an `if: always()` step must still run. + disabled, err := interp.Evaluate("", exprparser.DefaultStatusCheckSuccess) + require.NoError(t, err) + assert.Equal(t, false, disabled, "default-success step must be skipped on a cancelled job") + + enabled, err := interp.Evaluate("always()", exprparser.DefaultStatusCheckSuccess) + require.NoError(t, err) + assert.Equal(t, true, enabled, "`if: always()` step must run on a cancelled job") +} + +// TestMainStepsExecutorRunsAlwaysStepsAfterCancel verifies that newMainStepsExecutor does +// not abandon the remaining steps when the run is cancelled mid-pipeline. The later step +// still runs (so a main-stage always() step is reached), it runs under a fresh, +// non-cancelled context, and the job is marked cancelled. The interrupt error is still +// propagated so callers up the chain see the cancellation. +func TestMainStepsExecutorRunsAlwaysStepsAfterCancel(t *testing.T) { + rc := createIfTestRunContext(map[string]*model.Job{ + "job1": createJob(t, `runs-on: ubuntu-latest`, ""), + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var ran []string + var laterStepCtxErr error + steps := []common.Executor{ + func(_ context.Context) error { + ran = append(ran, "step1") + cancel() // server cancellation lands while step1 runs + return nil + }, + func(c context.Context) error { + ran = append(ran, "always-step") + laterStepCtxErr = c.Err() + return nil + }, + } + + err := newMainStepsExecutor(rc, steps)(ctx) + + require.ErrorIs(t, err, context.Canceled, "interrupt error is propagated") + assert.Equal(t, []string{"step1", "always-step"}, ran, "the always() step still runs after cancel") + require.NoError(t, laterStepCtxErr, "remaining steps run under a fresh, non-cancelled context") + assert.True(t, rc.jobCancelled, "the job is marked cancelled") +} + +// TestMainStepsExecutorMarksFailedOnTimeoutBetweenSteps guards the timeout path's symmetry with the cancel path. +// When the job deadline (timeout-minutes) lands in the gap between two steps, the job must be marked as failed (not cancelled), +// so always()/failure() cleanup steps run while default success() steps skip, and so the timed-out job is not reported as success. +func TestMainStepsExecutorMarksFailedOnTimeoutBetweenSteps(t *testing.T) { + rc := createIfTestRunContext(map[string]*model.Job{ + "job1": createJob(t, `runs-on: ubuntu-latest`, ""), + }) + + // A short deadline that we let elapse between steps, so no step records the error itself. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) + defer cancel() + + var ran []string + var laterStepCtxErr error + steps := []common.Executor{ + func(c context.Context) error { + ran = append(ran, "step1") + // Block until the job deadline elapses, then return cleanly: the interrupt lands in the loop's between-steps check, not inside a step. + <-c.Done() + return nil + }, + func(c context.Context) error { + ran = append(ran, "always-step") + laterStepCtxErr = c.Err() + return nil + }, + } + + err := newMainStepsExecutor(rc, steps)(ctx) + + require.ErrorIs(t, err, context.DeadlineExceeded, "the timeout error is propagated") + assert.Equal(t, []string{"step1", "always-step"}, ran, "the always() step still runs after a timeout") + require.NoError(t, laterStepCtxErr, "remaining steps run under a fresh, non-expired context") + assert.True(t, rc.jobFailed, "a job timeout marks the job failed") + assert.False(t, rc.jobCancelled, "a timeout is not a cancellation") + + // The status the real main-step `if` evaluation sees: "failure", so default success() steps skip while always()/failure() steps run. + assert.Equal(t, "failure", rc.getJobContext().Status) +} + +// TestStepsExecutorRunsMainStepsAfterPreCancel verifies that a cancellation landing during the +// pre phase does not abandon the main steps: newStepsExecutor still runs the main-steps executor, +// so a main-stage always()/cancelled() step is reached (under a fresh, non-cancelled context), +// the job is marked cancelled, and the cancellation is propagated. Before the fix the `.Then(...)` +// short-circuit skipped the main steps entirely when a pre step was cancelled. +func TestStepsExecutorRunsMainStepsAfterPreCancel(t *testing.T) { + rc := createIfTestRunContext(map[string]*model.Job{ + "job1": createJob(t, `runs-on: ubuntu-latest`, ""), + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var ran []string + var mainStepCtxErr error + preSteps := []common.Executor{ + func(_ context.Context) error { + ran = append(ran, "pre1") + cancel() // server cancellation lands during the pre phase + return nil + }, + } + steps := []common.Executor{ + func(c context.Context) error { + ran = append(ran, "always-step") + mainStepCtxErr = c.Err() + return nil + }, + } + + err := newStepsExecutor(rc, preSteps, steps)(ctx) + + require.ErrorIs(t, err, context.Canceled, "the cancellation is propagated") + assert.Equal(t, []string{"pre1", "always-step"}, ran, "the main always() step runs after a pre-phase cancel") + require.NoError(t, mainStepCtxErr, "the main step runs under a fresh, non-cancelled context") + assert.True(t, rc.jobCancelled, "the job is marked cancelled") +} + +// TestStepsExecutorRunsMainStepsAfterPreFailure verifies that a failing pre step does not abandon +// the main steps: they still run (so a main-stage always()/failure() step is reached), and the +// pre-step error is propagated so the job is reported as failed. The main steps' own `if` +// evaluation is what skips success()-default steps, so running them here is safe. +func TestStepsExecutorRunsMainStepsAfterPreFailure(t *testing.T) { + rc := createIfTestRunContext(map[string]*model.Job{ + "job1": createJob(t, `runs-on: ubuntu-latest`, ""), + }) + + var ran []string + preSteps := []common.Executor{ + func(_ context.Context) error { + ran = append(ran, "pre1") + return assert.AnError + }, + } + steps := []common.Executor{ + func(_ context.Context) error { + ran = append(ran, "always-step") + return nil + }, + } + + err := newStepsExecutor(rc, preSteps, steps)(context.Background()) + + require.ErrorIs(t, err, assert.AnError, "the pre-step error is propagated") + assert.Equal(t, []string{"pre1", "always-step"}, ran, "the main always() step runs after a pre-step failure") + assert.False(t, rc.jobCancelled, "a pre-step failure is not a cancellation") +} + +// TestPreStepFailureAffectsMainStepIfStatus verifies the status path used by real +// main-step `if` evaluation. A pre-step failure is not present in StepResults, so +// recording only the context job error is not enough: getJobContext must also report +// failure so success()-default main steps skip and failure() steps run. +func TestPreStepFailureAffectsMainStepIfStatus(t *testing.T) { + rc := createIfTestRunContext(map[string]*model.Job{ + "job1": createJob(t, `runs-on: ubuntu-latest`, ""), + }) + ctx := common.WithJobErrorContainer(context.Background()) + + reportStepError(ctx, rc, assert.AnError) + + assert.Equal(t, "failure", rc.getJobContext().Status) + require.ErrorIs(t, common.JobError(ctx), assert.AnError) + + defaultStep := &stepRun{ + RunContext: rc, + Step: &model.Step{ID: "default-step"}, + env: map[string]string{}, + } + defaultEnabled, err := isStepEnabled(ctx, defaultStep.getIfExpression(ctx, stepStageMain), defaultStep, stepStageMain) + require.NoError(t, err) + assert.False(t, defaultEnabled, "default success() main step must skip after a pre-step failure") + + failureStep := &stepRun{ + RunContext: rc, + Step: &model.Step{ + ID: "failure-step", + If: yaml.Node{Value: "failure()"}, + }, + env: map[string]string{}, + } + failureEnabled, err := isStepEnabled(ctx, failureStep.getIfExpression(ctx, stepStageMain), failureStep, stepStageMain) + require.NoError(t, err) + assert.True(t, failureEnabled, "failure() main step must run after a pre-step failure") +} + +// TestPostStepsContextCancelledIsUsableForFailingStep guards against a panic: post/cleanup +// steps run on a context derived from the cancelled job context, and a failing post step +// records its error via common.SetJobError. If that derived context lacks a job-error container, +// SetJobError dereferences a nil map and panics. The post context must therefore be detached +// from cancellation (so the steps run) yet still carry a usable error container. +func TestPostStepsContextCancelledIsUsableForFailingStep(t *testing.T) { + cancelled, cancel := context.WithCancel(common.WithJobErrorContainer(context.Background())) + cancel() + require.ErrorIs(t, cancelled.Err(), context.Canceled) + + postCtx, done := postStepsContext(cancelled) + defer done() + + // Detached from cancellation, so the post steps actually run. + require.NoError(t, postCtx.Err(), "post context must not be cancelled") + + // A failing post step records its error instead of panicking. + require.NotPanics(t, func() { + common.SetJobError(postCtx, assert.AnError) + }, "a failing post step must not panic on the cancel path") + assert.ErrorIs(t, common.JobError(postCtx), assert.AnError) +} + +// TestPostStepsContextDeadlinePreservesJobError verifies the job-timeout path keeps the original +// job-error container (via context.WithoutCancel), so the timeout failure and any post-step error +// survive into the post phase and the job is still reported as failed. +func TestPostStepsContextDeadlinePreservesJobError(t *testing.T) { + base := common.WithJobErrorContainer(context.Background()) + common.SetJobError(base, assert.AnError) + expired, cancel := context.WithDeadline(base, time.Now().Add(-time.Hour)) + defer cancel() + require.ErrorIs(t, expired.Err(), context.DeadlineExceeded) + + postCtx, done := postStepsContext(expired) + defer done() + + require.NoError(t, postCtx.Err(), "post context must not carry the expired deadline") + assert.ErrorIs(t, common.JobError(postCtx), assert.AnError, "the timeout job error must be preserved") +} diff --git a/act/runner/job_executor.go b/act/runner/job_executor.go index 752f74a7..abd92346 100644 --- a/act/runner/job_executor.go +++ b/act/runner/job_executor.go @@ -58,9 +58,10 @@ type jobInfo interface { // reportStepError emits the GitHub Actions ##[error] annotation and records // the error against the job so the job is reported as failed. -func reportStepError(ctx context.Context, err error) { +func reportStepError(ctx context.Context, rc *RunContext, err error) { common.Logger(ctx).Errorf("##[error]%v", err) common.SetJobError(ctx, err) + rc.markFailed() } func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executor { @@ -118,9 +119,9 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo rc.CurrentStepIndex = stepIdx preErr := preExec(ctx) if preErr != nil { - reportStepError(ctx, preErr) + reportStepError(ctx, rc, preErr) } else if ctx.Err() != nil { - reportStepError(ctx, ctx.Err()) + reportStepError(ctx, rc, ctx.Err()) } return preErr })) @@ -130,9 +131,9 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo rc.CurrentStepIndex = stepIdx err := stepExec(ctx) if err != nil { - reportStepError(ctx, err) + reportStepError(ctx, rc, err) } else if ctx.Err() != nil { - reportStepError(ctx, ctx.Err()) + reportStepError(ctx, rc, ctx.Err()) } return nil })) @@ -142,9 +143,9 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo rc.CurrentStepIndex = stepIdx err := postFn(ctx) if err != nil { - reportStepError(ctx, err) + reportStepError(ctx, rc, err) } else if ctx.Err() != nil { - reportStepError(ctx, ctx.Err()) + reportStepError(ctx, rc, ctx.Err()) } return err }) @@ -159,7 +160,12 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo postExecutor = postExecutor.Finally(func(ctx context.Context) error { jobError := common.JobError(ctx) var err error - if rc.Config.AutoRemove || jobError == nil { + // jobError == nil keeps a failed job's container alive for post-mortem debugging when + // AutoRemove is off (the act-CLI --rm behavior; the shipped runner always sets + // AutoRemove). A cancelled run is not a failure to inspect, and the cancel-path post + // context now carries its own error container so a failing post step makes jobError + // non-nil — OR in rc.jobCancelled so cancellation still always tears the container down. + if rc.Config.AutoRemove || jobError == nil || rc.jobCancelled { // always allow 1 min for stopping and removing the runner, even if we were cancelled ctx, cancel := context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), time.Minute) defer cancel() @@ -198,35 +204,107 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo return err }) - pipeline := make([]common.Executor, 0) - pipeline = append(pipeline, preSteps...) - pipeline = append(pipeline, steps...) + stepsExecutor := newStepsExecutor(rc, preSteps, steps) - return common.NewPipelineExecutor(info.startContainer(), common.NewPipelineExecutor(pipeline...). + return common.NewPipelineExecutor(info.startContainer(), stepsExecutor. Finally(func(ctx context.Context) error { - var cancel context.CancelFunc - switch ctx.Err() { - case context.Canceled: - // in case of an aborted run, we still should execute the - // post steps to allow cleanup. - ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), 5*time.Minute) - defer cancel() - case context.DeadlineExceeded: - // The job hit its timeout-minutes. Without a fresh context the post - // steps would run against the already-expired context and be skipped, - // so cleanup post-hooks (e.g. actions/checkout post, cache save) would - // not run. Derive the context with WithoutCancel so the new deadline - // applies but the job error state is preserved: the job is still - // reported as failed and container teardown matches a normal failure. - ctx, cancel = context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute) - defer cancel() - } - return postExecutor(ctx) + // Record an interrupt (backstop for interrupts that land outside the main + // step loop) so the post steps observe the cancelled/failed job status. + rc.markInterrupted(ctx.Err()) + postCtx, cancel := postStepsContext(ctx) + defer cancel() + return postExecutor(postCtx) }). Finally(info.interpolateOutputs()). Finally(info.closeContainer())) } +// postStepsContext derives the context used to run the job's post/cleanup steps from the +// finished main-pipeline context. Cleanup has to run even when the run was interrupted, so the +// returned context always carries a fresh bounded deadline and is never itself cancelled. +// +// - context.Canceled (server cancel): detach from the cancelled context via a fresh root so +// the post steps can run. +// - context.DeadlineExceeded (job timeout): detach the deadline with WithoutCancel, which +// keeps the original values — including the job-error container — so the timeout failure and +// any post-step error are preserved and the job is still reported as failed. +// - otherwise: run on the live context unchanged. +func postStepsContext(ctx context.Context) (context.Context, context.CancelFunc) { + switch ctx.Err() { + case context.Canceled: + // The cancelled context is abandoned for a fresh root, which drops the job-error + // container installed at the job root. Re-attach a fresh one so a failing post step + // records its error via SetJobError instead of panicking on a nil container. + return context.WithTimeout(common.WithJobErrorContainer(common.WithLogger(context.Background(), common.Logger(ctx))), 5*time.Minute) + case context.DeadlineExceeded: + return context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute) + default: + return ctx, func() {} + } +} + +// newStepsExecutor sequences the job's pre steps and main steps. +// +// The pre steps run as a normal pipeline that short-circuits on the first failure or +// cancellation. The main-steps executor then runs unconditionally — even if a pre step failed +// or the job was interrupted — so always()/cancelled()/failure() main steps still run, mirroring +// GitHub Actions. This is safe because each main step re-evaluates its own `if` (a pre-step +// failure flips the expression job status to failure, so success()-default steps skip) and +// newMainStepsExecutor detaches from an interrupted context before running the remaining steps. +// +// A pre-step failure or interrupt is still propagated so the job is reported with the correct +// conclusion; the pre error takes precedence since it happened first. +func newStepsExecutor(rc *RunContext, preSteps, steps []common.Executor) common.Executor { + preExecutor := common.NewPipelineExecutor(preSteps...) + mainExecutor := newMainStepsExecutor(rc, steps) + return func(ctx context.Context) error { + preErr := preExecutor(ctx) + mainErr := mainExecutor(ctx) + if preErr != nil { + return preErr + } + return mainErr + } +} + +// newMainStepsExecutor runs the job's main-stage step executors in order. Unlike a plain +// pipeline, an interruption (context.Canceled from a server cancel, or context.DeadlineExceeded +// from the job timeout) does not abandon the remaining steps: it marks the job cancelled when +// appropriate and keeps iterating under a fresh, bounded context so steps whose `if` still +// evaluates true — always() and cancelled() — run for cleanup, mirroring GitHub Actions. Steps +// that default to success() skip themselves because success() is false once the job is no longer +// successful. The main-step wrappers report their own errors and return nil, so the loop drives +// step ordering off the context, not return values. +func newMainStepsExecutor(rc *RunContext, steps []common.Executor) common.Executor { + return func(ctx context.Context) error { + for i, step := range steps { + if ctx.Err() != nil { + return runMainStepsAfterInterrupt(ctx, rc, steps[i:]) + } + _ = step(ctx) + } + // An interrupt can land during the final step, after the loop's last context + // check; record it so the post steps still observe the cancelled/failed status. + rc.markInterrupted(ctx.Err()) + return nil + } +} + +// runMainStepsAfterInterrupt runs the remaining main steps after the job context was cancelled or +// timed out. It detaches from the interrupted context (keeping its values: logger and job error) +// and applies a fresh deadline so always()/cancelled() steps run to completion. The original +// interrupt error is returned so callers up the chain still see the job as cancelled/timed out. +func runMainStepsAfterInterrupt(ctx context.Context, rc *RunContext, steps []common.Executor) error { + interruptErr := ctx.Err() + rc.markInterrupted(interruptErr) + freshCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute) + defer cancel() + for _, step := range steps { + _ = step(freshCtx) + } + return interruptErr +} + func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success bool) { logger := common.Logger(ctx) diff --git a/act/runner/run_context.go b/act/runner/run_context.go index 3630c329..68998801 100644 --- a/act/runner/run_context.go +++ b/act/runner/run_context.go @@ -73,6 +73,39 @@ type RunContext struct { // captured before execution so each matrix combo interpolates from the originals rather // than from a sibling's already-resolved values written into the shared Job.Outputs. outputTemplate map[string]string + // jobCancelled records that this job's run was cancelled (context.Canceled). It makes + // getJobContext report the "cancelled" status so cancelled()/always() evaluate the way + // GitHub Actions does, letting cleanup and always() steps run while normal steps skip. + jobCancelled bool + // jobFailed records failures outside normal main-step results, such as action pre-step + // failures. Those failures must still make success() false and failure() true for later + // main-step if evaluation. + jobFailed bool +} + +// markCancelled flags the job as cancelled so subsequent step `if` evaluations and the +// job status context observe the "cancelled" state. +func (rc *RunContext) markCancelled() { + rc.jobCancelled = true +} + +// markFailed flags the job as failed so subsequent step `if` evaluations observe +// failure even when the error happened outside a main step result. +func (rc *RunContext) markFailed() { + rc.jobFailed = true +} + +// markInterrupted records the job's interruption status from a context error so later step `if` evaluations and the job result observe it, +// keeping the timeout path symmetric with the cancel path: +// - context.Canceled (server cancel) marks the job cancelled, matching GitHub's "only always()/cancelled() run on cancel". +// - context.DeadlineExceeded (job timeout-minutes) marks the job failed, matching the "Timeout -> FAILURE" reporting semantics. +func (rc *RunContext) markInterrupted(err error) { + switch { + case errors.Is(err, context.Canceled): + rc.markCancelled() + case errors.Is(err, context.DeadlineExceeded): + rc.markFailed() + } } func (rc *RunContext) AddMask(mask string) { @@ -904,12 +937,21 @@ func trimToLen(s string, l int) string { func (rc *RunContext) getJobContext() *model.JobContext { jobStatus := "success" + if rc.jobFailed { + jobStatus = "failure" + } for _, stepStatus := range rc.StepResults { if stepStatus.Conclusion == model.StepStatusFailure { jobStatus = "failure" break } } + // A cancelled run takes precedence over success/failure so cancelled() is true and + // success()/failure() are false, matching GitHub Actions: on cancellation only + // always() and cancelled() steps run. + if rc.jobCancelled { + jobStatus = "cancelled" + } return &model.JobContext{ Status: jobStatus, }