mirror of
https://gitea.com/gitea/act_runner.git
synced 2026-03-19 19:36:33 +08:00
fix: race condition in reporter between RunDaemon and Close (#796)
## Summary - Fix data race on `r.closed` between `RunDaemon()` and `Close()` by protecting it with the existing `stateMu` — `closed` is part of the reporter state. `RunDaemon()` reads it under `stateMu.RLock()`, `Close()` sets it inside the existing `stateMu.Lock()` block - `ReportState` now has a parameter to not report results from runDaemon even if set, from now on `Close` reports the result - `Close` waits for `RunDaemon()` to signal exit via a closed channel `daemon` before reporting the final logs and state with result, unless something really wrong happens it does not time out - Add `TestReporter_EphemeralRunnerDeletion` which reproduces the exact scenario from #793: RunDaemon's `ReportState` racing with `Close`, causing the ephemeral runner to be deleted before final logs are sent - Add `TestReporter_RunDaemonClose_Race` which exercises `RunDaemon()` and `Close()` concurrently to verify no data race on `r.closed` under `go test -race` - Enable `-race` flag in `make test` so CI catches data races going forward Based on #794, with fixes for the remaining unprotected `r.closed` reads that the race detector catches. Fixes #793 --------- Co-authored-by: Christopher Homberger <christopher.homberger@web.de> Co-authored-by: ChristopherHX <christopher.homberger@web.de> Co-authored-by: rmawatson <rmawatson@hotmail.com> Reviewed-on: https://gitea.com/gitea/act_runner/pulls/796 Reviewed-by: ChristopherHX <christopherhx@noreply.gitea.com> Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com> Co-authored-by: silverwind <me@silverwind.io> Co-committed-by: silverwind <me@silverwind.io>
This commit is contained in:
committed by
ChristopherHX
parent
f4418eff18
commit
ae6e5dfcf7
@@ -5,6 +5,7 @@ package report
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
@@ -37,6 +38,7 @@ type Reporter struct {
|
||||
state *runnerv1.TaskState
|
||||
stateMu sync.RWMutex
|
||||
outputs sync.Map
|
||||
daemon chan struct{}
|
||||
|
||||
debugOutputEnabled bool
|
||||
stopCommandEndToken string
|
||||
@@ -63,6 +65,7 @@ func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.C
|
||||
state: &runnerv1.TaskState{
|
||||
Id: task.Id,
|
||||
},
|
||||
daemon: make(chan struct{}),
|
||||
}
|
||||
|
||||
if task.Secrets["ACTIONS_STEP_DEBUG"] == "true" {
|
||||
@@ -109,6 +112,7 @@ func (r *Reporter) Fire(entry *log.Entry) error {
|
||||
if stage != "Main" {
|
||||
if v, ok := entry.Data["jobResult"]; ok {
|
||||
if jobResult, ok := r.parseResult(v); ok {
|
||||
// We need to ensure log upload before this upload
|
||||
r.state.Result = jobResult
|
||||
r.state.StoppedAt = timestamppb.New(timestamp)
|
||||
for _, s := range r.state.Steps {
|
||||
@@ -176,15 +180,17 @@ func (r *Reporter) Fire(entry *log.Entry) error {
|
||||
}
|
||||
|
||||
func (r *Reporter) RunDaemon() {
|
||||
if r.closed {
|
||||
return
|
||||
}
|
||||
if r.ctx.Err() != nil {
|
||||
r.stateMu.RLock()
|
||||
closed := r.closed
|
||||
r.stateMu.RUnlock()
|
||||
if closed || r.ctx.Err() != nil {
|
||||
// Acknowledge close
|
||||
close(r.daemon)
|
||||
return
|
||||
}
|
||||
|
||||
_ = r.ReportLog(false)
|
||||
_ = r.ReportState()
|
||||
_ = r.ReportState(false)
|
||||
|
||||
time.AfterFunc(time.Second, r.RunDaemon)
|
||||
}
|
||||
@@ -226,9 +232,8 @@ func (r *Reporter) SetOutputs(outputs map[string]string) {
|
||||
}
|
||||
|
||||
func (r *Reporter) Close(lastWords string) error {
|
||||
r.closed = true
|
||||
|
||||
r.stateMu.Lock()
|
||||
r.closed = true
|
||||
if r.state.Result == runnerv1.Result_RESULT_UNSPECIFIED {
|
||||
if lastWords == "" {
|
||||
lastWords = "Early termination"
|
||||
@@ -251,13 +256,23 @@ func (r *Reporter) Close(lastWords string) error {
|
||||
})
|
||||
}
|
||||
r.stateMu.Unlock()
|
||||
// Wait for Acknowledge
|
||||
select {
|
||||
case <-r.daemon:
|
||||
case <-time.After(60 * time.Second):
|
||||
close(r.daemon)
|
||||
log.Error("No Response from RunDaemon for 60s, continue best effort")
|
||||
}
|
||||
|
||||
return retry.Do(func() error {
|
||||
if err := r.ReportLog(true); err != nil {
|
||||
return err
|
||||
}
|
||||
return r.ReportState()
|
||||
}, retry.Context(r.ctx))
|
||||
// Report the job outcome even when all log upload retry attempts have been exhausted
|
||||
return errors.Join(
|
||||
retry.Do(func() error {
|
||||
return r.ReportLog(true)
|
||||
}, retry.Context(r.ctx)),
|
||||
retry.Do(func() error {
|
||||
return r.ReportState(true)
|
||||
}, retry.Context(r.ctx)),
|
||||
)
|
||||
}
|
||||
|
||||
func (r *Reporter) ReportLog(noMore bool) error {
|
||||
@@ -285,17 +300,20 @@ func (r *Reporter) ReportLog(noMore bool) error {
|
||||
|
||||
r.stateMu.Lock()
|
||||
r.logRows = r.logRows[ack-r.logOffset:]
|
||||
submitted := r.logOffset + len(rows)
|
||||
r.logOffset = ack
|
||||
r.stateMu.Unlock()
|
||||
|
||||
if noMore && ack < r.logOffset+len(rows) {
|
||||
if noMore && ack < submitted {
|
||||
return fmt.Errorf("not all logs are submitted")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Reporter) ReportState() error {
|
||||
// ReportState only reports the job result if reportResult is true
|
||||
// RunDaemon never reports results even if result is set
|
||||
func (r *Reporter) ReportState(reportResult bool) error {
|
||||
r.clientM.Lock()
|
||||
defer r.clientM.Unlock()
|
||||
|
||||
@@ -303,6 +321,11 @@ func (r *Reporter) ReportState() error {
|
||||
state := proto.Clone(r.state).(*runnerv1.TaskState)
|
||||
r.stateMu.RUnlock()
|
||||
|
||||
// Only report result from Close to reliable sent logs
|
||||
if !reportResult {
|
||||
state.Result = runnerv1.Result_RESULT_UNSPECIFIED
|
||||
}
|
||||
|
||||
outputs := make(map[string]string)
|
||||
r.outputs.Range(func(k, v interface{}) bool {
|
||||
if val, ok := v.(string); ok {
|
||||
|
||||
Reference in New Issue
Block a user