chore(deps): upgrade golangci-lint from v2.10.1 to v2.11.4 (#821)

## Summary

- Bump golangci-lint from v2.10.1 to v2.11.4
- Remove unused `//nolint:revive` directive on metrics package declaration (detected by stricter nolintlint in new version)

## Changes between v2.10.1 and v2.11.4

- **v2.11.0** — Multiple linter dependency upgrades, Go 1.26 support
- **v2.11.2** — Bug fix for `fmt` with path
- **v2.11.3** — gosec update
- **v2.11.4** — Dependency updates (sqlclosecheck, noctx, etc.)

No breaking changes.

Reviewed-on: https://gitea.com/gitea/act_runner/pulls/821
Co-authored-by: Bo-Yi Wu <appleboy.tw@gmail.com>
Co-committed-by: Bo-Yi Wu <appleboy.tw@gmail.com>
feat: add Prometheus metrics endpoint for runner observability (#820)
2026-04-24 12:50:31 +08:00 · 2026-04-15 03:56:34 +00:00 · 2026-04-15 01:27:34 +00:00 · 2026-04-14 11:29:25 +00:00 · 2026-03-28 16:18:47 +00:00 · 2026-03-26 20:07:22 +00:00
15 changed files with 1145 additions and 130 deletions
--- a/.gitea/workflows/release-tag.yml
+++ b/.gitea/workflows/release-tag.yml
@@ -39,6 +39,15 @@ jobs:
          GPG_FINGERPRINT: ${{ steps.import_gpg.outputs.fingerprint }}
  release-image:
    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        variant:
+          - target: basic
+            tag_suffix: ""
+          - target: dind
+            tag_suffix: "-dind"
+          - target: dind-rootless
+            tag_suffix: "-dind-rootless"
    container:
      image: catthehacker/ubuntu:act-latest
    env:
@@ -62,50 +71,33 @@ jobs:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

-      - name: Get Meta
-        id: meta
+      - name: Repo Meta
+        id: repo_meta
        run: |
          echo REPO_NAME=$(echo ${GITHUB_REPOSITORY} | awk -F"/" '{print $2}') >> $GITHUB_OUTPUT
-          echo REPO_VERSION=${GITHUB_REF_NAME#v} >> $GITHUB_OUTPUT
+
+      - name: "Docker meta"
+        id: docker_meta
+        uses: https://github.com/docker/metadata-action@v5
+        with:
+          images: |
+            ${{ env.DOCKER_ORG }}/${{ steps.repo_meta.outputs.REPO_NAME }}
+          tags: |
+            type=semver,pattern={{major}}.{{minor}}.{{patch}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=semver,pattern={{major}}
+          flavor: |
+            latest=true
+            suffix=${{ matrix.variant.tag_suffix }},onlatest=true

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
-          target: basic
+          target: ${{ matrix.variant.target }}
          platforms: |
            linux/amd64
            linux/arm64
          push: true
-          tags: |
-            ${{ env.DOCKER_ORG }}/${{ steps.meta.outputs.REPO_NAME }}:${{ steps.meta.outputs.REPO_VERSION }}
-            ${{ env.DOCKER_ORG }}/${{ steps.meta.outputs.REPO_NAME }}:${{ env.DOCKER_LATEST }}
-
-      - name: Build and push dind
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: ./Dockerfile
-          target: dind
-          platforms: |
-            linux/amd64
-            linux/arm64
-          push: true
-          tags: |
-            ${{ env.DOCKER_ORG }}/${{ steps.meta.outputs.REPO_NAME }}:${{ steps.meta.outputs.REPO_VERSION }}-dind
-            ${{ env.DOCKER_ORG }}/${{ steps.meta.outputs.REPO_NAME }}:${{ env.DOCKER_LATEST }}-dind
-
-      - name: Build and push dind-rootless
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: ./Dockerfile
-          target: dind-rootless
-          platforms: |
-            linux/amd64
-            linux/arm64
-          push: true
-          tags: |
-            ${{ env.DOCKER_ORG }}/${{ steps.meta.outputs.REPO_NAME }}:${{ steps.meta.outputs.REPO_VERSION }}-dind-rootless
-            ${{ env.DOCKER_ORG }}/${{ steps.meta.outputs.REPO_NAME }}:${{ env.DOCKER_LATEST }}-dind-rootless
+          tags: ${{ steps.docker_meta.outputs.tags }}
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ DOCKER_TAG ?= nightly
 DOCKER_REF := $(DOCKER_IMAGE):$(DOCKER_TAG)
 DOCKER_ROOTLESS_REF := $(DOCKER_IMAGE):$(DOCKER_TAG)-dind-rootless

-GOLANGCI_LINT_PACKAGE ?= github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.10.1
+GOLANGCI_LINT_PACKAGE ?= github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.11.4
 GOVULNCHECK_PACKAGE ?= golang.org/x/vuln/cmd/govulncheck@v1

 ifneq ($(shell uname), Darwin)
--- a/go.mod
+++ b/go.mod
@@ -14,14 +14,15 @@ require (
 	github.com/sirupsen/logrus v1.9.4
 	github.com/spf13/cobra v1.10.2
 	github.com/stretchr/testify v1.11.1
+	go.yaml.in/yaml/v4 v4.0.0-rc.3
 	golang.org/x/term v0.40.0
-	golang.org/x/time v0.14.0
+	golang.org/x/time v0.14.0 // indirect
 	google.golang.org/protobuf v1.36.11
 	gopkg.in/yaml.v3 v3.0.1
 	gotest.tools/v3 v3.5.2
 )

-require go.yaml.in/yaml/v4 v4.0.0-rc.3
+require github.com/prometheus/client_golang v1.23.2

 require (
 	cyphar.com/go-pathrs v0.2.3 // indirect
@@ -30,6 +31,7 @@ require (
 	github.com/Masterminds/semver v1.5.0 // indirect
 	github.com/Microsoft/go-winio v0.6.2 // indirect
 	github.com/ProtonMail/go-crypto v1.3.0 // indirect
+	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/bmatcuk/doublestar/v4 v4.10.0 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/clipperhouse/uax29/v2 v2.7.0 // indirect
@@ -76,12 +78,16 @@ require (
 	github.com/moby/sys/user v0.4.0 // indirect
 	github.com/moby/sys/userns v0.1.0 // indirect
 	github.com/moby/term v0.5.2 // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.1.1 // indirect
 	github.com/opencontainers/selinux v1.13.1 // indirect
 	github.com/pjbgf/sha1cd v0.5.0 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/prometheus/client_model v0.6.2 // indirect
+	github.com/prometheus/common v0.66.1 // indirect
+	github.com/prometheus/procfs v0.16.1 // indirect
 	github.com/rhysd/actionlint v1.7.11 // indirect
 	github.com/robfig/cron/v3 v3.0.1 // indirect
 	github.com/sergi/go-diff v1.4.0 // indirect
@@ -99,6 +105,7 @@ require (
 	go.opentelemetry.io/otel v1.40.0 // indirect
 	go.opentelemetry.io/otel/metric v1.40.0 // indirect
 	go.opentelemetry.io/otel/trace v1.40.0 // indirect
+	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	golang.org/x/crypto v0.48.0 // indirect
 	golang.org/x/net v0.50.0 // indirect
 	golang.org/x/sync v0.19.0 // indirect
@@ -110,7 +117,7 @@ require (
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 )

-replace github.com/nektos/act => gitea.com/gitea/act v0.261.8
+replace github.com/nektos/act => gitea.com/gitea/act v0.261.10

 // Remove after github.com/docker/distribution is updated to support distribution/reference v0.6.0
 // (pulled in via moby/buildkit, breaks on undefined: reference.SplitHostname)
--- a/go.sum
+++ b/go.sum
@@ -8,8 +8,8 @@ cyphar.com/go-pathrs v0.2.3 h1:0pH8gep37wB0BgaXrEaN1OtZhUMeS7VvaejSr6i822o=
 cyphar.com/go-pathrs v0.2.3/go.mod h1:y8f1EMG7r+hCuFf/rXsKqMJrJAUoADZGNh5/vZPKcGc=
 dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
 dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
-gitea.com/gitea/act v0.261.8 h1:rUWB5GOZOubfe2VteKb7XP3HRIbcW3UUmfh7bVAgQcA=
-gitea.com/gitea/act v0.261.8/go.mod h1:lTp4136rwbZiZS3ZVQeHCvd4qRAZ7LYeiRBqOSdMY/4=
+gitea.com/gitea/act v0.261.10 h1:ndwbtuMXXz1dpYF2iwY1/PkgKNETo4jmPXfinTZt8cs=
+gitea.com/gitea/act v0.261.10/go.mod h1:oIkqQHvU0lfuIWwcpqa4FmU+t3prA89tgkuHUTsrI2c=
 github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk=
 github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8=
 github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg=
@@ -29,6 +29,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
 github.com/avast/retry-go/v4 v4.7.0 h1:yjDs35SlGvKwRNSykujfjdMxMhMQQM0TnIjJaHB+Zio=
 github.com/avast/retry-go/v4 v4.7.0/go.mod h1:ZMPDa3sY2bKgpLtap9JRUgk2yTAba7cgiFhqxY2Sg6Q=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bmatcuk/doublestar/v4 v4.10.0 h1:zU9WiOla1YA122oLM6i4EXvGW62DvKZVxIe6TYWexEs=
 github.com/bmatcuk/doublestar/v4 v4.10.0/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
 github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM=
@@ -129,6 +131,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
 github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
 github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
@@ -153,6 +157,8 @@ github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
 github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
 github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
 github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k=
 github.com/onsi/gomega v1.34.1/go.mod h1:kU1QgUvBDLXBJq618Xvm2LUX6rSAfRaFRTcdOeDLwwY=
 github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
@@ -167,6 +173,14 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
+github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
+github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
+github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
+github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
+github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
+github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
+github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
 github.com/rhysd/actionlint v1.7.11 h1:m+aSuCpCIClS8X02xMG4Z8s87fCHPsAtYkAoWGQZgEE=
 github.com/rhysd/actionlint v1.7.11/go.mod h1:8n50YougV9+50niD7oxgDTZ1KbN/ZnKiQ2xpLFeVhsI=
 github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
@@ -237,6 +251,10 @@ go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZY
 go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA=
 go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I=
 go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM=
+go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
+go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
+go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
+go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
 go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
 go.yaml.in/yaml/v4 v4.0.0-rc.3 h1:3h1fjsh1CTAPjW7q/EMe+C8shx5d8ctzZTrLcs/j8Go=
 go.yaml.in/yaml/v4 v4.0.0-rc.3/go.mod h1:aZqd9kCMsGL7AuUv/m/PvWLdg5sjJsZ4oHDEnfPPfY0=
--- a/internal/app/cmd/daemon.go
+++ b/internal/app/cmd/daemon.go
@@ -27,6 +27,7 @@ import (
 	"gitea.com/gitea/act_runner/internal/pkg/config"
 	"gitea.com/gitea/act_runner/internal/pkg/envcheck"
 	"gitea.com/gitea/act_runner/internal/pkg/labels"
+	"gitea.com/gitea/act_runner/internal/pkg/metrics"
 	"gitea.com/gitea/act_runner/internal/pkg/ver"
 )

@@ -149,6 +150,15 @@ func runDaemon(ctx context.Context, daemArgs *daemonArgs, configFile *string) fu
 				resp.Msg.Runner.Name, resp.Msg.Runner.Version, resp.Msg.Runner.Labels)
 		}

+		if cfg.Metrics.Enabled {
+			metrics.Init()
+			metrics.RunnerInfo.WithLabelValues(ver.Version(), resp.Msg.Runner.Name).Set(1)
+			metrics.RunnerCapacity.Set(float64(cfg.Runner.Capacity))
+			metrics.RegisterUptimeFunc(time.Now())
+			metrics.RegisterRunningJobsFunc(runner.RunningCount, cfg.Runner.Capacity)
+			metrics.StartServer(ctx, cfg.Metrics.Addr)
+		}
+
 		poller := poll.New(cfg, cli, runner)

 		if daemArgs.Once || reg.Ephemeral {
--- a/internal/app/poll/poller.go
+++ b/internal/app/poll/poller.go
@@ -7,17 +7,19 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"math/rand/v2"
 	"sync"
 	"sync/atomic"
+	"time"

 	runnerv1 "code.gitea.io/actions-proto-go/runner/v1"
 	"connectrpc.com/connect"
 	log "github.com/sirupsen/logrus"
-	"golang.org/x/time/rate"

 	"gitea.com/gitea/act_runner/internal/app/run"
 	"gitea.com/gitea/act_runner/internal/pkg/client"
 	"gitea.com/gitea/act_runner/internal/pkg/config"
+	"gitea.com/gitea/act_runner/internal/pkg/metrics"
 )

 type Poller struct {
@@ -35,6 +37,19 @@ type Poller struct {
 	done chan struct{}
 }

+// workerState holds per-goroutine polling state. Backoff counters are
+// per-worker so that with Capacity > 1, N workers each seeing one empty
+// response don't combine into a "consecutive N empty" reading on a shared
+// counter and trigger an unnecessarily long backoff.
+type workerState struct {
+	// consecutiveEmpty and consecutiveErrors count, respectively, fetches in a
+	// row that returned no task (long-poll timeouts included) and fetches in a
+	// row that failed with an error. calculateInterval backs off on the larger
+	// of the two; both are reset to zero once a task is received.
+	consecutiveEmpty  int64
+	consecutiveErrors int64
+	// lastBackoff is the last interval reported to the PollBackoffSeconds gauge
+	// from this worker; used to suppress redundant no-op Set calls when the
+	// backoff plateaus (e.g. at FetchIntervalMax).
+	lastBackoff time.Duration
+}
+
 func New(cfg *config.Config, client client.Client, runner *run.Runner) *Poller {
 	pollingCtx, shutdownPolling := context.WithCancel(context.Background())

@@ -58,11 +73,10 @@ func New(cfg *config.Config, client client.Client, runner *run.Runner) *Poller {
 }

 func (p *Poller) Poll() {
-	limiter := rate.NewLimiter(rate.Every(p.cfg.Runner.FetchInterval), 1)
 	wg := &sync.WaitGroup{}
 	for i := 0; i < p.cfg.Runner.Capacity; i++ {
 		wg.Add(1)
-		go p.poll(wg, limiter)
+		go p.poll(wg)
 	}
 	wg.Wait()

@@ -71,9 +85,7 @@ func (p *Poller) Poll() {
 }

 func (p *Poller) PollOnce() {
-	limiter := rate.NewLimiter(rate.Every(p.cfg.Runner.FetchInterval), 1)
-
-	p.pollOnce(limiter)
+	p.pollOnce(&workerState{})

 	// signal that we're done
 	close(p.done)
@@ -108,10 +120,11 @@ func (p *Poller) Shutdown(ctx context.Context) error {
 	}
 }

-func (p *Poller) poll(wg *sync.WaitGroup, limiter *rate.Limiter) {
+func (p *Poller) poll(wg *sync.WaitGroup) {
 	defer wg.Done()
+	s := &workerState{}
 	for {
-		p.pollOnce(limiter)
+		p.pollOnce(s)

 		select {
 		case <-p.pollingCtx.Done():
@@ -122,19 +135,61 @@ func (p *Poller) poll(wg *sync.WaitGroup, limiter *rate.Limiter) {
 	}
 }

-func (p *Poller) pollOnce(limiter *rate.Limiter) {
+// calculateInterval returns the polling interval with exponential backoff based on
+// consecutive empty or error responses. The interval starts at FetchInterval and
+// doubles with each consecutive empty/error, capped at FetchIntervalMax. The result
+// is never below FetchInterval: FetchIntervalMax <= FetchInterval (incl. 0) disables backoff.
+func (p *Poller) calculateInterval(s *workerState) time.Duration {
+	base := p.cfg.Runner.FetchInterval
+	maxInterval := p.cfg.Runner.FetchIntervalMax
+
+	n := max(s.consecutiveEmpty, s.consecutiveErrors)
+	if n <= 1 || maxInterval <= base {
+		return base
+	}
+
+	// Capped exponential backoff: base * 2^(n-1), max shift=5 so multiplier <= 32
+	shift := min(n-1, 5)
+	return min(base*time.Duration(int64(1)<<shift), maxInterval)
+}
+
+// addJitter spreads polls out over time so that many workers do not hit the
+// server in lockstep: it returns d shifted by a random offset of roughly
+// +/- 20%. Non-positive durations, and durations too small to jitter, are
+// returned unchanged.
+func addJitter(d time.Duration) time.Duration {
+	// span is the full width of the jitter window: 40% of d, i.e. +/- 20%.
+	span := int64(d) * 2 / 5
+	if d <= 0 || span <= 0 {
+		return d
+	}
+	offset := rand.Int64N(span) - span/2
+	return d + time.Duration(offset)
+}
+
+func (p *Poller) pollOnce(s *workerState) {
 	for {
-		if err := limiter.Wait(p.pollingCtx); err != nil {
-			if p.pollingCtx.Err() != nil {
-				log.WithError(err).Debug("limiter wait failed")
-			}
-			return
-		}
-		task, ok := p.fetchTask(p.pollingCtx)
+		task, ok := p.fetchTask(p.pollingCtx, s)
 		if !ok {
+			base := p.calculateInterval(s)
+			if base != s.lastBackoff {
+				metrics.PollBackoffSeconds.Set(base.Seconds())
+				s.lastBackoff = base
+			}
+			timer := time.NewTimer(addJitter(base))
+			select {
+			case <-timer.C:
+			case <-p.pollingCtx.Done():
+				timer.Stop()
+				return
+			}
 			continue
 		}

+		// Got a task — reset backoff counters for fast subsequent polling.
+		s.consecutiveEmpty = 0
+		s.consecutiveErrors = 0
+
 		p.runTaskWithRecover(p.jobsCtx, task)
 		return
 	}
@@ -153,24 +208,42 @@ func (p *Poller) runTaskWithRecover(ctx context.Context, task *runnerv1.Task) {
 	}
 }

-func (p *Poller) fetchTask(ctx context.Context) (*runnerv1.Task, bool) {
+func (p *Poller) fetchTask(ctx context.Context, s *workerState) (*runnerv1.Task, bool) {
 	reqCtx, cancel := context.WithTimeout(ctx, p.cfg.Runner.FetchTimeout)
 	defer cancel()

 	// Load the version value that was in the cache when the request was sent.
 	v := p.tasksVersion.Load()
+	start := time.Now()
 	resp, err := p.client.FetchTask(reqCtx, connect.NewRequest(&runnerv1.FetchTaskRequest{
 		TasksVersion: v,
 	}))
+
+	// DeadlineExceeded is the designed idle path for a long-poll: the server
+	// found no work within FetchTimeout. Treat it as an empty response and do
+	// not record the duration — the timeout value would swamp the histogram.
 	if errors.Is(err, context.DeadlineExceeded) {
-		err = nil
+		s.consecutiveEmpty++
+		s.consecutiveErrors = 0 // timeout is a healthy idle response
+		metrics.PollFetchTotal.WithLabelValues(metrics.LabelResultEmpty).Inc()
+		return nil, false
 	}
+	metrics.PollFetchDuration.Observe(time.Since(start).Seconds())
+
 	if err != nil {
 		log.WithError(err).Error("failed to fetch task")
+		s.consecutiveErrors++
+		metrics.PollFetchTotal.WithLabelValues(metrics.LabelResultError).Inc()
+		metrics.ClientErrors.WithLabelValues(metrics.LabelMethodFetchTask).Inc()
 		return nil, false
 	}

+	// Successful response — reset error counter.
+	s.consecutiveErrors = 0
+
 	if resp == nil || resp.Msg == nil {
+		s.consecutiveEmpty++
+		metrics.PollFetchTotal.WithLabelValues(metrics.LabelResultEmpty).Inc()
 		return nil, false
 	}

@@ -179,11 +252,14 @@ func (p *Poller) fetchTask(ctx context.Context) (*runnerv1.Task, bool) {
 	}

 	if resp.Msg.Task == nil {
+		s.consecutiveEmpty++
+		metrics.PollFetchTotal.WithLabelValues(metrics.LabelResultEmpty).Inc()
 		return nil, false
 	}

-	// got a task, set `tasksVersion` to zero to focre query db in next request.
+	// got a task, set `tasksVersion` to zero to force query db in next request.
 	p.tasksVersion.CompareAndSwap(resp.Msg.TasksVersion, 0)

+	metrics.PollFetchTotal.WithLabelValues(metrics.LabelResultTask).Inc()
 	return resp.Msg.Task, true
 }
--- a/internal/app/poll/poller_test.go
+++ b/internal/app/poll/poller_test.go
@@ -0,0 +1,108 @@
+// Copyright 2026 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package poll
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	runnerv1 "code.gitea.io/actions-proto-go/runner/v1"
+	connect_go "connectrpc.com/connect"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+	"github.com/stretchr/testify/require"
+
+	"gitea.com/gitea/act_runner/internal/pkg/client/mocks"
+	"gitea.com/gitea/act_runner/internal/pkg/config"
+)
+
+// TestPoller_PerWorkerCounters verifies that each worker maintains its own
+// backoff counters. With a shared counter, N workers each seeing one empty
+// response would inflate the counter to N and trigger an unnecessarily long
+// backoff. With per-worker state, each worker only sees its own count.
+func TestPoller_PerWorkerCounters(t *testing.T) {
+	client := mocks.NewClient(t)
+	client.On("FetchTask", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) {
+			// Always return an empty response: its Task is nil, which fetchTask
+			// counts as an empty poll.
+			return connect_go.NewResponse(&runnerv1.FetchTaskResponse{}), nil
+		},
+	)
+
+	// fetchTask reads cfg.Runner.FetchTimeout, so the poller needs a populated config.
+	cfg, err := config.LoadDefault("")
+	require.NoError(t, err)
+	// Only client and cfg are set; every other Poller field keeps its zero value.
+	p := &Poller{client: client, cfg: cfg}
+
+	ctx := context.Background()
+	// Two independent states stand in for two polling goroutines sharing one Poller.
+	s1 := &workerState{}
+	s2 := &workerState{}
+
+	// Each worker independently observes one empty response.
+	_, ok := p.fetchTask(ctx, s1)
+	require.False(t, ok)
+	_, ok = p.fetchTask(ctx, s2)
+	require.False(t, ok)
+
+	assert.Equal(t, int64(1), s1.consecutiveEmpty, "worker 1 should only count its own empty response")
+	assert.Equal(t, int64(1), s2.consecutiveEmpty, "worker 2 should only count its own empty response")
+
+	// Worker 1 sees a second empty; worker 2 stays at 1.
+	_, ok = p.fetchTask(ctx, s1)
+	require.False(t, ok)
+	assert.Equal(t, int64(2), s1.consecutiveEmpty)
+	assert.Equal(t, int64(1), s2.consecutiveEmpty, "worker 2's counter must not be affected by worker 1's empty fetches")
+}
+
+// TestPoller_FetchErrorIncrementsErrorsOnly verifies that a fetch error
+// increments only the per-worker error counter, not the empty counter.
+func TestPoller_FetchErrorIncrementsErrorsOnly(t *testing.T) {
+	client := mocks.NewClient(t)
+	client.On("FetchTask", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) {
+			// A plain error (not context.DeadlineExceeded) takes fetchTask's
+			// error path rather than its idle-timeout path.
+			return nil, errors.New("network unreachable")
+		},
+	)
+
+	// fetchTask reads cfg.Runner.FetchTimeout, so the poller needs a populated config.
+	cfg, err := config.LoadDefault("")
+	require.NoError(t, err)
+	p := &Poller{client: client, cfg: cfg}
+
+	s := &workerState{}
+	_, ok := p.fetchTask(context.Background(), s)
+	require.False(t, ok)
+	assert.Equal(t, int64(1), s.consecutiveErrors)
+	assert.Equal(t, int64(0), s.consecutiveEmpty)
+}
+
+// TestPoller_CalculateInterval verifies the per-worker exponential backoff
+// math is correctly driven by the worker's own counters.
+func TestPoller_CalculateInterval(t *testing.T) {
+	cfg, err := config.LoadDefault("")
+	require.NoError(t, err)
+	cfg.Runner.FetchInterval = 2 * time.Second
+	cfg.Runner.FetchIntervalMax = 60 * time.Second
+	p := &Poller{cfg: cfg}
+
+	// Expected values follow base * 2^(n-1) with n = max(empty, errs), where the
+	// exponent is capped at 5 and the result is capped at FetchIntervalMax:
+	// n=5 gives 2s*16 = 32s (still below the 60s cap), while n=20 gives
+	// 2s*32 = 64s, which is clamped to 60s.
+	cases := []struct {
+		name         string
+		empty, errs  int64
+		wantInterval time.Duration
+	}{
+		{"first poll, no backoff", 0, 0, 2 * time.Second},
+		{"single empty, still base", 1, 0, 2 * time.Second},
+		{"two empties, doubled", 2, 0, 4 * time.Second},
+		{"five empties, capped path", 5, 0, 32 * time.Second},
+		{"many empties, capped at max", 20, 0, 60 * time.Second},
+		{"errors drive backoff too", 0, 3, 8 * time.Second},
+		{"max(empty, errors) wins", 2, 4, 16 * time.Second},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// The counters are set directly; no fetch is performed in this test.
+			s := &workerState{consecutiveEmpty: tc.empty, consecutiveErrors: tc.errs}
+			assert.Equal(t, tc.wantInterval, p.calculateInterval(s))
+		})
+	}
+}
--- a/internal/app/run/runner.go
+++ b/internal/app/run/runner.go
@@ -8,9 +8,11 @@ import (
 	"encoding/json"
 	"fmt"
 	"maps"
+	"os"
 	"path/filepath"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"

 	runnerv1 "code.gitea.io/actions-proto-go/runner/v1"
@@ -25,6 +27,7 @@ import (
 	"gitea.com/gitea/act_runner/internal/pkg/client"
 	"gitea.com/gitea/act_runner/internal/pkg/config"
 	"gitea.com/gitea/act_runner/internal/pkg/labels"
+	"gitea.com/gitea/act_runner/internal/pkg/metrics"
 	"gitea.com/gitea/act_runner/internal/pkg/report"
 	"gitea.com/gitea/act_runner/internal/pkg/ver"
 )
@@ -40,6 +43,7 @@ type Runner struct {
 	envs   map[string]string

 	runningTasks sync.Map
+	runningCount atomic.Int64
 }

 func NewRunner(cfg *config.Config, reg *config.Registration, cli client.Client) *Runner {
@@ -95,16 +99,25 @@ func (r *Runner) Run(ctx context.Context, task *runnerv1.Task) error {
 	r.runningTasks.Store(task.Id, struct{}{})
 	defer r.runningTasks.Delete(task.Id)

+	r.runningCount.Add(1)
+
+	start := time.Now()
+
 	ctx, cancel := context.WithTimeout(ctx, r.cfg.Runner.Timeout)
 	defer cancel()
-	reporter := report.NewReporter(ctx, cancel, r.client, task)
+	reporter := report.NewReporter(ctx, cancel, r.client, task, r.cfg)
 	var runErr error
 	defer func() {
+		r.runningCount.Add(-1)
+
 		lastWords := ""
 		if runErr != nil {
 			lastWords = runErr.Error()
 		}
 		_ = reporter.Close(lastWords)
+
+		metrics.JobDuration.Observe(time.Since(start).Seconds())
+		metrics.JobsTotal.WithLabelValues(metrics.ResultToStatusLabel(reporter.Result())).Inc()
 	}()
 	reporter.RunDaemon()
 	runErr = r.run(ctx, task, reporter)
@@ -154,6 +167,7 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report.
 		Event:           taskContext["event"].GetStructValue().AsMap(),
 		RunID:           taskContext["run_id"].GetStringValue(),
 		RunNumber:       taskContext["run_number"].GetStringValue(),
+		RunAttempt:      taskContext["run_attempt"].GetStringValue(),
 		Actor:           taskContext["actor"].GetStringValue(),
 		Repository:      taskContext["repository"].GetStringValue(),
 		EventName:       taskContext["event_name"].GetStringValue(),
@@ -196,11 +210,18 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report.
 		maxLifetime = time.Until(deadline)
 	}

+	workdirParent := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/")
+	if r.cfg.Container.BindWorkdir {
+		// Append the task ID to isolate concurrent jobs from the same repo.
+		workdirParent = fmt.Sprintf("%s/%d", workdirParent, task.Id)
+	}
+	workdir := filepath.FromSlash(fmt.Sprintf("/%s/%s", workdirParent, preset.Repository))
+
 	runnerConfig := &runner.Config{
 		// On Linux, Workdir will be like "/<parent_directory>/<owner>/<repo>"
 		// On Windows, Workdir will be like "\<parent_directory>\<owner>\<repo>"
-		Workdir:        filepath.FromSlash(fmt.Sprintf("/%s/%s", strings.TrimLeft(r.cfg.Container.WorkdirParent, "/"), preset.Repository)),
-		BindWorkdir:    false,
+		Workdir:        workdir,
+		BindWorkdir:    r.cfg.Container.BindWorkdir,
 		ActionCacheDir: filepath.FromSlash(r.cfg.Host.WorkdirParent),

 		ReuseContainers:       false,
@@ -245,9 +266,22 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report.

 	execErr := executor(ctx)
 	reporter.SetOutputs(job.Outputs)
+
+	if r.cfg.Container.BindWorkdir {
+		// Remove the entire task-specific directory (e.g. /workspace/<task_id>).
+		taskDir := filepath.FromSlash("/" + workdirParent)
+		if err := os.RemoveAll(taskDir); err != nil {
+			log.Warnf("failed to clean up workspace %s: %v", taskDir, err)
+		}
+	}
+
 	return execErr
 }

+// RunningCount returns the number of tasks this runner is currently executing.
+// The underlying counter is incremented when Run starts a task and decremented
+// in Run's deferred cleanup.
+func (r *Runner) RunningCount() int64 {
+	return r.runningCount.Load()
+}
+
 func (r *Runner) Declare(ctx context.Context, labels []string) (*connect.Response[runnerv1.DeclareResponse], error) {
 	return r.client.Declare(ctx, connect.NewRequest(&runnerv1.DeclareRequest{
 		Version: ver.Version(),
--- a/internal/pkg/client/http.go
+++ b/internal/pkg/client/http.go
@@ -8,6 +8,7 @@ import (
 	"crypto/tls"
 	"net/http"
 	"strings"
+	"time"

 	"code.gitea.io/actions-proto-go/ping/v1/pingv1connect"
 	"code.gitea.io/actions-proto-go/runner/v1/runnerv1connect"
@@ -15,16 +16,17 @@ import (
 )

 func getHTTPClient(endpoint string, insecure bool) *http.Client {
+	// Start from a copy of http.DefaultTransport so the client keeps honoring
+	// HTTP_PROXY/HTTPS_PROXY/NO_PROXY and retains the default dial and TLS
+	// handshake timeouts, as http.DefaultClient did before this change. A bare
+	// &http.Transport{} has a nil Proxy and would silently bypass the proxy.
+	// The literal below is only a fallback for the case where DefaultTransport
+	// has been replaced by something that is not an *http.Transport.
+	transport := &http.Transport{Proxy: http.ProxyFromEnvironment}
+	if dt, ok := http.DefaultTransport.(*http.Transport); ok {
+		transport = dt.Clone()
+	}
+	transport.MaxIdleConns = 10
+	transport.MaxIdleConnsPerHost = 10 // All requests go to one host; default is 2 which causes frequent reconnects.
+	transport.IdleConnTimeout = 90 * time.Second
 	if strings.HasPrefix(endpoint, "https://") && insecure {
-		return &http.Client{
-			Transport: &http.Transport{
-				TLSClientConfig: &tls.Config{
-					InsecureSkipVerify: true,
-				},
-			},
+		// Certificate verification is skipped only when explicitly requested
+		// for an https endpoint.
+		transport.TLSClientConfig = &tls.Config{
+			InsecureSkipVerify: true,
 		}
 	}
-	return http.DefaultClient
+	return &http.Client{Transport: transport}
 }

 // New returns a new runner client.
@@ -47,14 +49,15 @@ func New(endpoint string, insecure bool, uuid, token, version string, opts ...co
 		}
 	})))

+	httpClient := getHTTPClient(endpoint, insecure)
 	return &HTTPClient{
 		PingServiceClient: pingv1connect.NewPingServiceClient(
-			getHTTPClient(endpoint, insecure),
+			httpClient,
 			baseURL,
 			opts...,
 		),
 		RunnerServiceClient: runnerv1connect.NewRunnerServiceClient(
-			getHTTPClient(endpoint, insecure),
+			httpClient,
 			baseURL,
 			opts...,
 		),
--- a/internal/pkg/config/config.example.yaml
+++ b/internal/pkg/config/config.example.yaml
@@ -32,6 +32,24 @@ runner:
  fetch_timeout: 5s
  # The interval for fetching the job from the Gitea instance.
  fetch_interval: 2s
+  # The maximum interval for fetching the job from the Gitea instance.
+  # The runner uses exponential backoff when idle, increasing the interval up to this maximum.
+  # Set to 0 or same as fetch_interval to disable backoff.
+  fetch_interval_max: 60s
+  # The base interval for periodic log flush to the Gitea instance.
+  # Logs may be sent earlier if the buffer reaches log_report_batch_size
+  # or if log_report_max_latency expires after the first buffered row.
+  log_report_interval: 5s
+  # The maximum time a log row can wait before being sent.
+  # This ensures even a single log line appears on the frontend within this duration.
+  # Must be less than log_report_interval to have any effect.
+  log_report_max_latency: 3s
+  # Flush logs immediately when the buffer reaches this many rows.
+  # This ensures bursty output (e.g., npm install) is delivered promptly.
+  log_report_batch_size: 100
+  # The interval for reporting task state (step status, timing) to the Gitea instance.
+  # State is also reported immediately on step transitions (start/stop).
+  state_report_interval: 5s
  # The github_mirror of a runner is used to specify the mirror address of the github that pulls the action repository.
  # It works when something like `uses: actions/checkout@v4` is used and DEFAULT_ACTIONS_URL is set to github,
  # and github_mirror is not empty. In this case,
@@ -103,8 +121,23 @@ container:
  require_docker: false
  # Timeout to wait for the docker daemon to be reachable, if docker is required by require_docker or act_runner
  docker_timeout: 0s
+  # Bind the workspace to the host filesystem instead of using Docker volumes.
+  # This is required for Docker-in-Docker (DinD) setups when jobs use docker compose
+  # with bind mounts (e.g., ".:/app"), as volume-based workspaces are not accessible
+  # from the DinD daemon's filesystem. When enabled, ensure the workspace parent
+  # directory is also mounted into the runner container and listed in valid_volumes.
+  bind_workdir: false

 host:
  # The parent directory of a job's working directory.
  # If it's empty, $HOME/.cache/act/ will be used.
  workdir_parent:
+
+metrics:
+  # Enable the Prometheus metrics endpoint.
+  # When enabled, metrics are served at http://<addr>/metrics and a liveness check at /healthz.
+  enabled: false
+  # The address for the metrics HTTP server to listen on.
+  # Defaults to localhost only. Set to ":9101" to allow external access,
+  # but ensure the port is firewall-protected as there is no authentication.
+  addr: "127.0.0.1:9101"
--- a/internal/pkg/config/config.go
+++ b/internal/pkg/config/config.go
@@ -22,17 +22,22 @@ type Log struct {

 // Runner represents the configuration for the runner.
 type Runner struct {
-	File            string            `yaml:"file"`             // File specifies the file path for the runner.
-	Capacity        int               `yaml:"capacity"`         // Capacity specifies the capacity of the runner.
-	Envs            map[string]string `yaml:"envs"`             // Envs stores environment variables for the runner.
-	EnvFile         string            `yaml:"env_file"`         // EnvFile specifies the path to the file containing environment variables for the runner.
-	Timeout         time.Duration     `yaml:"timeout"`          // Timeout specifies the duration for runner timeout.
-	ShutdownTimeout time.Duration     `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner.
-	Insecure        bool              `yaml:"insecure"`         // Insecure indicates whether the runner operates in an insecure mode.
-	FetchTimeout    time.Duration     `yaml:"fetch_timeout"`    // FetchTimeout specifies the timeout duration for fetching resources.
-	FetchInterval   time.Duration     `yaml:"fetch_interval"`   // FetchInterval specifies the interval duration for fetching resources.
-	Labels          []string          `yaml:"labels"`           // Labels specify the labels of the runner. Labels are declared on each startup
-	GithubMirror    string            `yaml:"github_mirror"`    // GithubMirror defines what mirrors should be used when using github
+	File                string            `yaml:"file"`                   // File specifies the file path for the runner.
+	Capacity            int               `yaml:"capacity"`               // Capacity specifies the capacity of the runner.
+	Envs                map[string]string `yaml:"envs"`                   // Envs stores environment variables for the runner.
+	EnvFile             string            `yaml:"env_file"`               // EnvFile specifies the path to the file containing environment variables for the runner.
+	Timeout             time.Duration     `yaml:"timeout"`                // Timeout specifies the duration for runner timeout.
+	ShutdownTimeout     time.Duration     `yaml:"shutdown_timeout"`       // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner.
+	Insecure            bool              `yaml:"insecure"`               // Insecure indicates whether the runner operates in an insecure mode.
+	FetchTimeout        time.Duration     `yaml:"fetch_timeout"`          // FetchTimeout specifies the timeout duration for fetching resources.
+	FetchInterval       time.Duration     `yaml:"fetch_interval"`         // FetchInterval specifies the interval duration for fetching resources.
+	FetchIntervalMax    time.Duration     `yaml:"fetch_interval_max"`     // FetchIntervalMax specifies the maximum backoff interval when idle.
+	LogReportInterval   time.Duration     `yaml:"log_report_interval"`    // LogReportInterval specifies the base interval for periodic log flush.
+	LogReportMaxLatency time.Duration     `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
+	LogReportBatchSize  int               `yaml:"log_report_batch_size"`  // LogReportBatchSize triggers immediate log flush when buffer reaches this size.
+	StateReportInterval time.Duration     `yaml:"state_report_interval"`  // StateReportInterval specifies the interval for state reporting.
+	Labels              []string          `yaml:"labels"`                 // Labels specify the labels of the runner. Labels are declared on each startup
+	GithubMirror        string            `yaml:"github_mirror"`          // GithubMirror defines what mirrors should be used when using github
 }

 // Cache represents the configuration for caching.
@@ -57,6 +62,7 @@ type Container struct {
 	ForceRebuild  bool          `yaml:"force_rebuild"`  // Rebuild docker image(s) even if already present
 	RequireDocker bool          `yaml:"require_docker"` // Always require a reachable docker daemon, even if not required by act_runner
 	DockerTimeout time.Duration `yaml:"docker_timeout"` // Timeout to wait for the docker daemon to be reachable, if docker is required by require_docker or act_runner
+	BindWorkdir   bool          `yaml:"bind_workdir"`   // BindWorkdir binds the workspace to the host filesystem instead of using Docker volumes. Required for DinD when jobs use docker compose with bind mounts.
 }

 // Host represents the configuration for the host.
@@ -64,6 +70,12 @@ type Host struct {
 	WorkdirParent string `yaml:"workdir_parent"` // WorkdirParent specifies the parent directory for the host's working directory.
 }

+// Metrics represents the configuration for the Prometheus metrics endpoint.
+type Metrics struct {
+	Enabled bool   `yaml:"enabled"` // Enabled indicates whether the metrics endpoint is exposed.
+	Addr    string `yaml:"addr"`    // Addr specifies the listen address for the metrics HTTP server (e.g., ":9101").
+}
+
 // Config represents the overall configuration.
 type Config struct {
 	Log       Log       `yaml:"log"`       // Log represents the configuration for logging.
@@ -71,6 +83,7 @@ type Config struct {
 	Cache     Cache     `yaml:"cache"`     // Cache represents the configuration for caching.
 	Container Container `yaml:"container"` // Container represents the configuration for the container.
 	Host      Host      `yaml:"host"`      // Host represents the configuration for the host.
+	Metrics   Metrics   `yaml:"metrics"`   // Metrics represents the configuration for the Prometheus metrics endpoint.
 }

 // LoadDefault returns the default configuration.
@@ -136,6 +149,35 @@ func LoadDefault(file string) (*Config, error) {
 	if cfg.Runner.FetchInterval <= 0 {
 		cfg.Runner.FetchInterval = 2 * time.Second
 	}
+	if cfg.Runner.FetchIntervalMax <= 0 {
+		cfg.Runner.FetchIntervalMax = 60 * time.Second
+	}
+	if cfg.Runner.LogReportInterval <= 0 {
+		cfg.Runner.LogReportInterval = 5 * time.Second
+	}
+	if cfg.Runner.LogReportMaxLatency <= 0 {
+		cfg.Runner.LogReportMaxLatency = 3 * time.Second
+	}
+	if cfg.Runner.LogReportBatchSize <= 0 {
+		cfg.Runner.LogReportBatchSize = 100
+	}
+	if cfg.Runner.StateReportInterval <= 0 {
+		cfg.Runner.StateReportInterval = 5 * time.Second
+	}
+	if cfg.Metrics.Addr == "" {
+		cfg.Metrics.Addr = "127.0.0.1:9101"
+	}
+
+	// Validate and fix invalid config combinations to prevent confusing behavior.
+	if cfg.Runner.FetchIntervalMax < cfg.Runner.FetchInterval {
+		log.Warnf("fetch_interval_max (%v) is less than fetch_interval (%v), setting fetch_interval_max to fetch_interval",
+			cfg.Runner.FetchIntervalMax, cfg.Runner.FetchInterval)
+		cfg.Runner.FetchIntervalMax = cfg.Runner.FetchInterval
+	}
+	if cfg.Runner.LogReportMaxLatency >= cfg.Runner.LogReportInterval {
+		log.Warnf("log_report_max_latency (%v) >= log_report_interval (%v), the max-latency timer will never fire before the periodic ticker; consider lowering log_report_max_latency",
+			cfg.Runner.LogReportMaxLatency, cfg.Runner.LogReportInterval)
+	}

 	// although `container.network_mode` will be deprecated, but we have to be compatible with it for now.
 	if cfg.Container.NetworkMode != "" && cfg.Container.Network == "" {
--- a/internal/pkg/metrics/metrics.go
+++ b/internal/pkg/metrics/metrics.go
@@ -0,0 +1,216 @@
+// Copyright 2026 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package metrics
+
+import (
+	"sync"
+	"time"
+
+	runnerv1 "code.gitea.io/actions-proto-go/runner/v1"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/collectors"
+)
+
+// Namespace is the Prometheus namespace for all act_runner metrics.
+const Namespace = "act_runner"
+
+// Label value constants for Prometheus metrics.
+// Using constants prevents typos from silently creating new time-series.
+//
+// LabelResult* values are used on metrics with label key "result" (RPC outcomes).
+// LabelStatus* values are used on metrics with label key "status" (job outcomes).
+const (
+	LabelResultTask    = "task"
+	LabelResultEmpty   = "empty"
+	LabelResultError   = "error"
+	LabelResultSuccess = "success"
+
+	LabelMethodFetchTask  = "FetchTask"
+	LabelMethodUpdateLog  = "UpdateLog"
+	LabelMethodUpdateTask = "UpdateTask"
+
+	LabelStatusSuccess   = "success"
+	LabelStatusFailure   = "failure"
+	LabelStatusCancelled = "cancelled"
+	LabelStatusSkipped   = "skipped"
+	LabelStatusUnknown   = "unknown"
+)
+
+// rpcDurationBuckets covers the expected latency range for short-running
+// UpdateLog / UpdateTask RPCs. FetchTask uses its own buckets (it has a 10s tail).
+var rpcDurationBuckets = []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5}
+
+// ResultToStatusLabel maps a runnerv1.Result to the "status" label value used on job metrics.
+func ResultToStatusLabel(r runnerv1.Result) string {
+	switch r {
+	case runnerv1.Result_RESULT_SUCCESS:
+		return LabelStatusSuccess
+	case runnerv1.Result_RESULT_FAILURE:
+		return LabelStatusFailure
+	case runnerv1.Result_RESULT_CANCELLED:
+		return LabelStatusCancelled
+	case runnerv1.Result_RESULT_SKIPPED:
+		return LabelStatusSkipped
+	default:
+		return LabelStatusUnknown
+	}
+}
+
+var (
+	RunnerInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: Namespace,
+		Name:      "info",
+		Help:      "Runner metadata. Always 1. Labels carry version and name.",
+	}, []string{"version", "name"})
+
+	RunnerCapacity = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: Namespace,
+		Name:      "capacity",
+		Help:      "Configured maximum concurrent jobs.",
+	})
+
+	PollFetchTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: Namespace,
+		Subsystem: "poll",
+		Name:      "fetch_total",
+		Help:      "Total number of FetchTask RPCs by result (task, empty, error).",
+	}, []string{"result"})
+
+	PollFetchDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Namespace: Namespace,
+		Subsystem: "poll",
+		Name:      "fetch_duration_seconds",
+		Help:      "Latency of FetchTask RPCs, excluding expected long-poll timeouts.",
+		Buckets:   []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10},
+	})
+
+	PollBackoffSeconds = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: Namespace,
+		Subsystem: "poll",
+		Name:      "backoff_seconds",
+		Help:      "Last observed polling backoff interval. With Capacity > 1, reflects whichever worker wrote last.",
+	})
+
+	JobsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: Namespace,
+		Subsystem: "job",
+		Name:      "total",
+		Help:      "Total jobs processed by status (success, failure, cancelled, skipped, unknown).",
+	}, []string{"status"})
+
+	JobDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Namespace: Namespace,
+		Subsystem: "job",
+		Name:      "duration_seconds",
+		Help:      "Duration of job execution from start to finish.",
+		Buckets:   prometheus.ExponentialBuckets(1, 2, 14), // 1s to 8192s (~2.3h)
+	})
+
+	ReportLogTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: Namespace,
+		Subsystem: "report",
+		Name:      "log_total",
+		Help:      "Total UpdateLog RPCs by result (success, error).",
+	}, []string{"result"})
+
+	ReportLogDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Namespace: Namespace,
+		Subsystem: "report",
+		Name:      "log_duration_seconds",
+		Help:      "Latency of UpdateLog RPCs.",
+		Buckets:   rpcDurationBuckets,
+	})
+
+	ReportStateTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: Namespace,
+		Subsystem: "report",
+		Name:      "state_total",
+		Help:      "Total UpdateTask (state) RPCs by result (success, error).",
+	}, []string{"result"})
+
+	ReportStateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Namespace: Namespace,
+		Subsystem: "report",
+		Name:      "state_duration_seconds",
+		Help:      "Latency of UpdateTask RPCs.",
+		Buckets:   rpcDurationBuckets,
+	})
+
+	ReportLogBufferRows = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: Namespace,
+		Subsystem: "report",
+		Name:      "log_buffer_rows",
+		Help:      "Current number of buffered log rows awaiting send.",
+	})
+
+	ClientErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: Namespace,
+		Subsystem: "client",
+		Name:      "errors_total",
+		Help:      "Total client RPC errors by method.",
+	}, []string{"method"})
+)
+
+// Registry is the custom Prometheus registry used by the runner.
+var Registry = prometheus.NewRegistry()
+
+var initOnce sync.Once
+
+// Init registers all static metrics and the standard Go/process collectors.
+// Safe to call multiple times; only the first call has effect.
+func Init() {
+	initOnce.Do(func() {
+		Registry.MustRegister(
+			collectors.NewGoCollector(),
+			collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
+			RunnerInfo, RunnerCapacity,
+			PollFetchTotal, PollFetchDuration, PollBackoffSeconds,
+			JobsTotal, JobDuration,
+			ReportLogTotal, ReportLogDuration,
+			ReportStateTotal, ReportStateDuration, ReportLogBufferRows,
+			ClientErrors,
+		)
+	})
+}
+
+// RegisterUptimeFunc registers a GaugeFunc that reports seconds since startTime.
+func RegisterUptimeFunc(startTime time.Time) {
+	Registry.MustRegister(prometheus.NewGaugeFunc(
+		prometheus.GaugeOpts{
+			Namespace: Namespace,
+			Name:      "uptime_seconds",
+			Help:      "Seconds since the runner daemon started.",
+		},
+		func() float64 { return time.Since(startTime).Seconds() },
+	))
+}
+
+// RegisterRunningJobsFunc registers GaugeFuncs for the running job count and
+// capacity utilisation ratio, evaluated lazily at Prometheus scrape time.
+func RegisterRunningJobsFunc(countFn func() int64, capacity int) {
+	capF := float64(capacity)
+	Registry.MustRegister(prometheus.NewGaugeFunc(
+		prometheus.GaugeOpts{
+			Namespace: Namespace,
+			Subsystem: "job",
+			Name:      "running",
+			Help:      "Number of jobs currently executing.",
+		},
+		func() float64 { return float64(countFn()) },
+	))
+	Registry.MustRegister(prometheus.NewGaugeFunc(
+		prometheus.GaugeOpts{
+			Namespace: Namespace,
+			Subsystem: "job",
+			Name:      "capacity_utilization_ratio",
+			Help:      "Ratio of running jobs to configured capacity (0.0-1.0).",
+		},
+		func() float64 {
+			if capF <= 0 {
+				return 0
+			}
+			return float64(countFn()) / capF
+		},
+	))
+}
--- a/internal/pkg/metrics/server.go
+++ b/internal/pkg/metrics/server.go
@@ -0,0 +1,50 @@
+// Copyright 2026 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package metrics
+
+import (
+	"context"
+	"net/http"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	log "github.com/sirupsen/logrus"
+)
+
+// StartServer starts an HTTP server that serves Prometheus metrics on /metrics
+// and a liveness check on /healthz. The server shuts down when ctx is cancelled.
+// Call Init() before StartServer to register metrics with the Registry.
+func StartServer(ctx context.Context, addr string) {
+	mux := http.NewServeMux()
+	mux.Handle("/metrics", promhttp.HandlerFor(Registry, promhttp.HandlerOpts{}))
+	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte("ok"))
+	})
+
+	srv := &http.Server{
+		Addr:              addr,
+		Handler:           mux,
+		ReadHeaderTimeout: 5 * time.Second,
+		ReadTimeout:       10 * time.Second,
+		WriteTimeout:      10 * time.Second,
+		IdleTimeout:       60 * time.Second,
+	}
+
+	go func() {
+		log.Infof("metrics server listening on %s", addr)
+		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			log.WithError(err).Error("metrics server failed")
+		}
+	}()
+
+	go func() {
+		<-ctx.Done()
+		shutCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		if err := srv.Shutdown(shutCtx); err != nil {
+			log.WithError(err).Warn("metrics server shutdown error")
+		}
+	}()
+}
--- a/internal/pkg/report/reporter.go
+++ b/internal/pkg/report/reporter.go
@@ -20,6 +20,8 @@ import (
 	"google.golang.org/protobuf/types/known/timestamppb"

 	"gitea.com/gitea/act_runner/internal/pkg/client"
+	"gitea.com/gitea/act_runner/internal/pkg/config"
+	"gitea.com/gitea/act_runner/internal/pkg/metrics"
 )

 type Reporter struct {
@@ -35,16 +37,32 @@ type Reporter struct {
 	logReplacer *strings.Replacer
 	oldnew      []string

-	state   *runnerv1.TaskState
-	stateMu sync.RWMutex
-	outputs sync.Map
-	daemon  chan struct{}
+	// lastLogBufferRows caches the last value written to the ReportLogBufferRows
+	// gauge so ReportLog can skip no-op Set calls when the buffer size is unchanged.
+	// Guarded by clientM (the same lock held around each ReportLog call).
+	lastLogBufferRows int
+
+	state        *runnerv1.TaskState
+	stateChanged bool
+	stateMu      sync.RWMutex
+	outputs      sync.Map
+	daemon       chan struct{}
+
+	// Adaptive batching control
+	logReportInterval   time.Duration
+	logReportMaxLatency time.Duration
+	logBatchSize        int
+	stateReportInterval time.Duration
+
+	// Event notification channels (non-blocking, buffered 1)
+	logNotify   chan struct{} // signal: new log rows arrived
+	stateNotify chan struct{} // signal: step transition (start/stop)

 	debugOutputEnabled  bool
 	stopCommandEndToken string
 }

-func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.Client, task *runnerv1.Task) *Reporter {
+func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.Client, task *runnerv1.Task, cfg *config.Config) *Reporter {
 	var oldnew []string
 	if v := task.Context.Fields["token"].GetStringValue(); v != "" {
 		oldnew = append(oldnew, v, "***")
@@ -57,11 +75,17 @@ func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.C
 	}

 	rv := &Reporter{
-		ctx:         ctx,
-		cancel:      cancel,
-		client:      client,
-		oldnew:      oldnew,
-		logReplacer: strings.NewReplacer(oldnew...),
+		ctx:                 ctx,
+		cancel:              cancel,
+		client:              client,
+		oldnew:              oldnew,
+		logReplacer:         strings.NewReplacer(oldnew...),
+		logReportInterval:   cfg.Runner.LogReportInterval,
+		logReportMaxLatency: cfg.Runner.LogReportMaxLatency,
+		logBatchSize:        cfg.Runner.LogReportBatchSize,
+		stateReportInterval: cfg.Runner.StateReportInterval,
+		logNotify:           make(chan struct{}, 1),
+		stateNotify:         make(chan struct{}, 1),
 		state: &runnerv1.TaskState{
 			Id: task.Id,
 		},
@@ -75,6 +99,13 @@ func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.C
 	return rv
 }

+// Result returns the final job result. Safe to call after Close() returns.
+func (r *Reporter) Result() runnerv1.Result {
+	r.stateMu.RLock()
+	defer r.stateMu.RUnlock()
+	return r.state.Result
+}
+
 func (r *Reporter) ResetSteps(l int) {
 	r.stateMu.Lock()
 	defer r.stateMu.Unlock()
@@ -108,11 +139,42 @@ func isJobStepEntry(entry *log.Entry) bool {
 	return true
 }

-func (r *Reporter) Fire(entry *log.Entry) error {
-	r.stateMu.Lock()
-	defer r.stateMu.Unlock()
+// notifyLog sends a non-blocking signal that new log rows are available.
+func (r *Reporter) notifyLog() {
+	select {
+	case r.logNotify <- struct{}{}:
+	default:
+	}
+}

-	log.WithFields(entry.Data).Trace(entry.Message)
+// notifyState sends a non-blocking signal that a UX-critical state change occurred (step start/stop, job result).
+func (r *Reporter) notifyState() {
+	select {
+	case r.stateNotify <- struct{}{}:
+	default:
+	}
+}
+
+// unlockAndNotify releases stateMu and sends channel notifications.
+// Must be called with stateMu held.
+func (r *Reporter) unlockAndNotify(urgentState bool) {
+	r.stateMu.Unlock()
+	r.notifyLog()
+	if urgentState {
+		r.notifyState()
+	}
+}
+
+func (r *Reporter) Fire(entry *log.Entry) error {
+	urgentState := false
+
+	r.stateMu.Lock()
+
+	r.stateChanged = true
+
+	if log.IsLevelEnabled(log.TraceLevel) {
+		log.WithFields(entry.Data).Trace(entry.Message)
+	}

 	timestamp := entry.Time
 	if r.state.StartedAt == nil {
@@ -135,11 +197,13 @@ func (r *Reporter) Fire(entry *log.Entry) error {
 						}
 					}
 				}
+				urgentState = true
 			}
 		}
 		if !r.duringSteps() {
 			r.logRows = appendIfNotNil(r.logRows, r.parseLogRow(entry))
 		}
+		r.unlockAndNotify(urgentState)
 		return nil
 	}

@@ -153,11 +217,13 @@ func (r *Reporter) Fire(entry *log.Entry) error {
 		if !r.duringSteps() {
 			r.logRows = appendIfNotNil(r.logRows, r.parseLogRow(entry))
 		}
+		r.unlockAndNotify(false)
 		return nil
 	}

 	if step.StartedAt == nil {
 		step.StartedAt = timestamppb.New(timestamp)
+		urgentState = true
 	}

 	// Force reporting log errors as raw output to prevent silent failures
@@ -185,26 +251,91 @@ func (r *Reporter) Fire(entry *log.Entry) error {
 			}
 			step.Result = stepResult
 			step.StoppedAt = timestamppb.New(timestamp)
+			urgentState = true
 		}
 	}

+	r.unlockAndNotify(urgentState)
 	return nil
 }

 func (r *Reporter) RunDaemon() {
-	r.stateMu.RLock()
-	closed := r.closed
-	r.stateMu.RUnlock()
-	if closed || r.ctx.Err() != nil {
-		// Acknowledge close
-		close(r.daemon)
-		return
+	go r.runDaemonLoop()
+}
+
+func (r *Reporter) stopLatencyTimer(active *bool, timer *time.Timer) {
+	if *active {
+		if !timer.Stop() {
+			select {
+			case <-timer.C:
+			default:
+			}
+		}
+		*active = false
 	}
+}

-	_ = r.ReportLog(false)
-	_ = r.ReportState(false)
+func (r *Reporter) runDaemonLoop() {
+	logTicker := time.NewTicker(r.logReportInterval)
+	stateTicker := time.NewTicker(r.stateReportInterval)

-	time.AfterFunc(time.Second, r.RunDaemon)
+	// maxLatencyTimer ensures the first buffered log row is sent within logReportMaxLatency.
+	// Start inactive — it is armed when the first log row arrives in an empty buffer.
+	maxLatencyTimer := time.NewTimer(0)
+	if !maxLatencyTimer.Stop() {
+		<-maxLatencyTimer.C
+	}
+	maxLatencyActive := false
+
+	defer logTicker.Stop()
+	defer stateTicker.Stop()
+	defer maxLatencyTimer.Stop()
+
+	for {
+		select {
+		case <-logTicker.C:
+			_ = r.ReportLog(false)
+			r.stopLatencyTimer(&maxLatencyActive, maxLatencyTimer)
+
+		case <-stateTicker.C:
+			_ = r.ReportState(false)
+
+		case <-r.logNotify:
+			r.stateMu.RLock()
+			n := len(r.logRows)
+			r.stateMu.RUnlock()
+
+			if n >= r.logBatchSize {
+				_ = r.ReportLog(false)
+				r.stopLatencyTimer(&maxLatencyActive, maxLatencyTimer)
+			} else if !maxLatencyActive && n > 0 {
+				maxLatencyTimer.Reset(r.logReportMaxLatency)
+				maxLatencyActive = true
+			}
+
+		case <-r.stateNotify:
+			// Step transition or job result — flush both immediately for frontend UX.
+			_ = r.ReportLog(false)
+			_ = r.ReportState(false)
+			r.stopLatencyTimer(&maxLatencyActive, maxLatencyTimer)
+
+		case <-maxLatencyTimer.C:
+			maxLatencyActive = false
+			_ = r.ReportLog(false)
+
+		case <-r.ctx.Done():
+			close(r.daemon)
+			return
+		}
+
+		r.stateMu.RLock()
+		closed := r.closed
+		r.stateMu.RUnlock()
+		if closed {
+			close(r.daemon)
+			return
+		}
+	}
 }

 func (r *Reporter) Logf(format string, a ...any) {
@@ -268,6 +399,10 @@ func (r *Reporter) Close(lastWords string) error {
 		})
 	}
 	r.stateMu.Unlock()
+
+	// Wake up the daemon loop so it detects closed promptly.
+	r.notifyLog()
+
 	// Wait for Acknowledge
 	select {
 	case <-r.daemon:
@@ -295,15 +430,24 @@ func (r *Reporter) ReportLog(noMore bool) error {
 	rows := r.logRows
 	r.stateMu.RUnlock()

+	if !noMore && len(rows) == 0 {
+		return nil
+	}
+
+	start := time.Now()
 	resp, err := r.client.UpdateLog(r.ctx, connect.NewRequest(&runnerv1.UpdateLogRequest{
 		TaskId: r.state.Id,
 		Index:  int64(r.logOffset),
 		Rows:   rows,
 		NoMore: noMore,
 	}))
+	metrics.ReportLogDuration.Observe(time.Since(start).Seconds())
 	if err != nil {
+		metrics.ReportLogTotal.WithLabelValues(metrics.LabelResultError).Inc()
+		metrics.ClientErrors.WithLabelValues(metrics.LabelMethodUpdateLog).Inc()
 		return err
 	}
+	metrics.ReportLogTotal.WithLabelValues(metrics.LabelResultSuccess).Inc()

 	ack := int(resp.Msg.AckIndex)
 	if ack < r.logOffset {
@@ -314,7 +458,12 @@ func (r *Reporter) ReportLog(noMore bool) error {
 	r.logRows = r.logRows[ack-r.logOffset:]
 	submitted := r.logOffset + len(rows)
 	r.logOffset = ack
+	remaining := len(r.logRows)
 	r.stateMu.Unlock()
+	if remaining != r.lastLogBufferRows {
+		metrics.ReportLogBufferRows.Set(float64(remaining))
+		r.lastLogBufferRows = remaining
+	}

 	if noMore && ack < submitted {
 		return errors.New("not all logs are submitted")
@@ -329,15 +478,7 @@ func (r *Reporter) ReportState(reportResult bool) error {
 	r.clientM.Lock()
 	defer r.clientM.Unlock()

-	r.stateMu.RLock()
-	state := proto.Clone(r.state).(*runnerv1.TaskState)
-	r.stateMu.RUnlock()
-
-	// Only report result from Close to reliable sent logs
-	if !reportResult {
-		state.Result = runnerv1.Result_RESULT_UNSPECIFIED
-	}
-
+	// Build the outputs map first (single Range pass instead of two).
 	outputs := make(map[string]string)
 	r.outputs.Range(func(k, v any) bool {
 		if val, ok := v.(string); ok {
@@ -346,13 +487,36 @@ func (r *Reporter) ReportState(reportResult bool) error {
 		return true
 	})

+	// Snapshot the state and clear stateChanged under one lock so a concurrent
+	// Fire() is never lost; the flag is set again below if UpdateTask fails.
+	r.stateMu.Lock()
+	if !reportResult && !r.stateChanged && len(outputs) == 0 {
+		r.stateMu.Unlock()
+		return nil
+	}
+	state := proto.Clone(r.state).(*runnerv1.TaskState)
+	r.stateChanged = false
+	r.stateMu.Unlock()
+
+	if !reportResult {
+		state.Result = runnerv1.Result_RESULT_UNSPECIFIED
+	}
+
+	start := time.Now()
 	resp, err := r.client.UpdateTask(r.ctx, connect.NewRequest(&runnerv1.UpdateTaskRequest{
 		State:   state,
 		Outputs: outputs,
 	}))
+	metrics.ReportStateDuration.Observe(time.Since(start).Seconds())
 	if err != nil {
+		metrics.ReportStateTotal.WithLabelValues(metrics.LabelResultError).Inc()
+		metrics.ClientErrors.WithLabelValues(metrics.LabelMethodUpdateTask).Inc()
+		r.stateMu.Lock()
+		r.stateChanged = true
+		r.stateMu.Unlock()
 		return err
 	}
+	metrics.ReportStateTotal.WithLabelValues(metrics.LabelResultSuccess).Inc()

 	for _, k := range resp.Msg.SentOutputs {
 		r.outputs.Store(k, struct{}{})
--- a/internal/pkg/report/reporter_test.go
+++ b/internal/pkg/report/reporter_test.go
@@ -6,8 +6,9 @@ package report
 import (
 	"context"
 	"errors"
+	"fmt"
 	"strings"
-	"sync"
+	"sync/atomic"
 	"testing"
 	"time"

@@ -21,6 +22,7 @@ import (
 	"google.golang.org/protobuf/types/known/timestamppb"

 	"gitea.com/gitea/act_runner/internal/pkg/client/mocks"
+	"gitea.com/gitea/act_runner/internal/pkg/config"
 )

 func TestReporter_parseLogRow(t *testing.T) {
@@ -175,9 +177,10 @@ func TestReporter_Fire(t *testing.T) {
 		ctx, cancel := context.WithCancel(context.Background())
 		taskCtx, err := structpb.NewStruct(map[string]any{})
 		require.NoError(t, err)
+		cfg, _ := config.LoadDefault("")
 		reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{
 			Context: taskCtx,
-		})
+		}, cfg)
 		reporter.RunDaemon()
 		defer func() {
 			require.NoError(t, reporter.Close(""))
@@ -252,7 +255,8 @@ func TestReporter_EphemeralRunnerDeletion(t *testing.T) {
 	defer cancel()
 	taskCtx, err := structpb.NewStruct(map[string]any{})
 	require.NoError(t, err)
-	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx})
+	cfg, _ := config.LoadDefault("")
+	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
 	reporter.ResetSteps(1)

 	// Fire a log entry to create pending data
@@ -315,23 +319,281 @@ func TestReporter_RunDaemonClose_Race(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	taskCtx, err := structpb.NewStruct(map[string]any{})
 	require.NoError(t, err)
+	cfg, _ := config.LoadDefault("")
 	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{
 		Context: taskCtx,
-	})
+	}, cfg)
 	reporter.ResetSteps(1)

-	// Start the daemon loop in a separate goroutine.
-	// RunDaemon reads r.closed and reschedules itself via time.AfterFunc.
-	var wg sync.WaitGroup
-	wg.Go(func() {
-		reporter.RunDaemon()
-	})
+	// Start the daemon loop — RunDaemon spawns a goroutine internally.
+	reporter.RunDaemon()

-	// Close concurrently — this races with RunDaemon on r.closed.
+	// Close concurrently — this races with the daemon goroutine on r.closed.
 	require.NoError(t, reporter.Close(""))

-	// Cancel context so pending AfterFunc callbacks exit quickly.
+	// Cancel context so the daemon goroutine exits cleanly.
 	cancel()
-	wg.Wait()
-	time.Sleep(2 * time.Second)
+}
+
+// TestReporter_MaxLatencyTimer verifies that the maxLatencyTimer flushes a
+// single buffered log row before the periodic logTicker fires.
+//
+// Setup: logReportInterval=10s (effectively never), maxLatency=100ms.
+// Fire one log line, then assert UpdateLog is called within 500ms.
+func TestReporter_MaxLatencyTimer(t *testing.T) {
+	var updateLogCalls atomic.Int64
+
+	client := mocks.NewClient(t)
+	client.On("UpdateLog", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, req *connect_go.Request[runnerv1.UpdateLogRequest]) (*connect_go.Response[runnerv1.UpdateLogResponse], error) {
+			updateLogCalls.Add(1)
+			return connect_go.NewResponse(&runnerv1.UpdateLogResponse{
+				AckIndex: req.Msg.Index + int64(len(req.Msg.Rows)),
+			}), nil
+		},
+	)
+	client.On("UpdateTask", mock.Anything, mock.Anything).Maybe().Return(
+		func(_ context.Context, _ *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
+			return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
+		},
+	)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	taskCtx, err := structpb.NewStruct(map[string]any{})
+	require.NoError(t, err)
+
+	// Custom config: logTicker=10s (won't fire during test), maxLatency=100ms
+	cfg, _ := config.LoadDefault("")
+	cfg.Runner.LogReportInterval = 10 * time.Second
+	cfg.Runner.LogReportMaxLatency = 100 * time.Millisecond
+	cfg.Runner.LogReportBatchSize = 1000 // won't trigger batch flush
+
+	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
+	reporter.ResetSteps(1)
+	reporter.RunDaemon()
+	defer func() {
+		_ = reporter.Close("")
+	}()
+
+	// Fire a single log line — not enough to trigger batch flush
+	require.NoError(t, reporter.Fire(&log.Entry{
+		Message: "single log line",
+		Data:    log.Fields{"stage": "Main", "stepNumber": 0, "raw_output": true},
+	}))
+
+	// maxLatencyTimer should flush within ~100ms. Wait up to 500ms.
+	assert.Eventually(t, func() bool {
+		return updateLogCalls.Load() > 0
+	}, 500*time.Millisecond, 10*time.Millisecond,
+		"maxLatencyTimer should have flushed the log before logTicker (10s)")
+}
+
+// TestReporter_BatchSizeFlush verifies that reaching logBatchSize triggers
+// an immediate log flush without waiting for any timer.
+func TestReporter_BatchSizeFlush(t *testing.T) {
+	var updateLogCalls atomic.Int64
+
+	client := mocks.NewClient(t)
+	client.On("UpdateLog", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, req *connect_go.Request[runnerv1.UpdateLogRequest]) (*connect_go.Response[runnerv1.UpdateLogResponse], error) {
+			updateLogCalls.Add(1)
+			return connect_go.NewResponse(&runnerv1.UpdateLogResponse{
+				AckIndex: req.Msg.Index + int64(len(req.Msg.Rows)),
+			}), nil
+		},
+	)
+	client.On("UpdateTask", mock.Anything, mock.Anything).Maybe().Return(
+		func(_ context.Context, _ *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
+			return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
+		},
+	)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	taskCtx, err := structpb.NewStruct(map[string]any{})
+	require.NoError(t, err)
+
+	// Custom config: large timers, small batch size
+	cfg, _ := config.LoadDefault("")
+	cfg.Runner.LogReportInterval = 10 * time.Second
+	cfg.Runner.LogReportMaxLatency = 10 * time.Second
+	cfg.Runner.LogReportBatchSize = 5
+
+	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
+	reporter.ResetSteps(1)
+	reporter.RunDaemon()
+	defer func() {
+		_ = reporter.Close("")
+	}()
+
+	// Fire exactly batchSize log lines
+	for i := range 5 {
+		require.NoError(t, reporter.Fire(&log.Entry{
+			Message: fmt.Sprintf("log line %d", i),
+			Data:    log.Fields{"stage": "Main", "stepNumber": 0, "raw_output": true},
+		}))
+	}
+
+	// Batch threshold should trigger immediate flush
+	assert.Eventually(t, func() bool {
+		return updateLogCalls.Load() > 0
+	}, 500*time.Millisecond, 10*time.Millisecond,
+		"batch size threshold should have triggered immediate flush")
+}
+
+// TestReporter_StateChangedNotLostDuringReport asserts that a Fire() arriving
+// mid-UpdateTask re-dirties the flag so the change is picked up by the next report.
+func TestReporter_StateChangedNotLostDuringReport(t *testing.T) {
+	var updateTaskCalls atomic.Int64
+	inFlight := make(chan struct{})
+	release := make(chan struct{})
+
+	client := mocks.NewClient(t)
+	client.On("UpdateTask", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, _ *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
+			n := updateTaskCalls.Add(1)
+			if n == 1 {
+				// Signal that the first UpdateTask is in flight, then block until released.
+				close(inFlight)
+				<-release
+			}
+			return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
+		},
+	)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	taskCtx, err := structpb.NewStruct(map[string]any{})
+	require.NoError(t, err)
+	cfg, _ := config.LoadDefault("")
+	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
+	reporter.ResetSteps(2)
+
+	// Mark stateChanged=true so the first ReportState proceeds to UpdateTask.
+	reporter.stateMu.Lock()
+	reporter.stateChanged = true
+	reporter.stateMu.Unlock()
+
+	// Kick off the first ReportState in a goroutine — it will block in UpdateTask.
+	done := make(chan error, 1)
+	go func() {
+		done <- reporter.ReportState(false)
+	}()
+
+	// Wait until UpdateTask is in flight (snapshot taken, flag consumed).
+	<-inFlight
+
+	// Concurrent Fire() modifies state — must re-flip stateChanged so the
+	// change is not lost when the in-flight ReportState finishes.
+	require.NoError(t, reporter.Fire(&log.Entry{
+		Message: "step starts",
+		Data:    log.Fields{"stage": "Main", "stepNumber": 1, "raw_output": true},
+	}))
+
+	// Release the in-flight UpdateTask and wait for it to return.
+	close(release)
+	require.NoError(t, <-done)
+
+	// stateChanged must still be true so the next ReportState picks up the
+	// concurrent Fire()'s change instead of skipping via the early-return path.
+	reporter.stateMu.RLock()
+	changed := reporter.stateChanged
+	reporter.stateMu.RUnlock()
+	assert.True(t, changed, "stateChanged must remain true after a concurrent Fire() during in-flight ReportState")
+
+	// And the next ReportState must actually send a second UpdateTask.
+	require.NoError(t, reporter.ReportState(false))
+	assert.Equal(t, int64(2), updateTaskCalls.Load(), "concurrent Fire() change must trigger a second UpdateTask, not be silently lost")
+}
+
+// TestReporter_StateChangedRestoredOnError verifies that when UpdateTask fails,
+// the dirty flag is restored so the snapshotted change isn't silently lost.
+func TestReporter_StateChangedRestoredOnError(t *testing.T) {
+	var updateTaskCalls atomic.Int64
+
+	client := mocks.NewClient(t)
+	client.On("UpdateTask", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, _ *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
+			n := updateTaskCalls.Add(1)
+			if n == 1 {
+				return nil, errors.New("transient network error")
+			}
+			return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
+		},
+	)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	taskCtx, err := structpb.NewStruct(map[string]any{})
+	require.NoError(t, err)
+	cfg, _ := config.LoadDefault("")
+	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
+	reporter.ResetSteps(1)
+
+	reporter.stateMu.Lock()
+	reporter.stateChanged = true
+	reporter.stateMu.Unlock()
+
+	// First ReportState fails — flag must be restored to true.
+	require.Error(t, reporter.ReportState(false))
+
+	reporter.stateMu.RLock()
+	changed := reporter.stateChanged
+	reporter.stateMu.RUnlock()
+	assert.True(t, changed, "stateChanged must be restored to true after UpdateTask error so the change is retried")
+
+	// The next ReportState should still issue a request because the flag was restored.
+	require.NoError(t, reporter.ReportState(false))
+	assert.Equal(t, int64(2), updateTaskCalls.Load())
+}
+
+// TestReporter_StateNotifyFlush verifies that step transitions trigger
+// an immediate state flush via the stateNotify channel.
+func TestReporter_StateNotifyFlush(t *testing.T) {
+	var updateTaskCalls atomic.Int64
+
+	client := mocks.NewClient(t)
+	client.On("UpdateLog", mock.Anything, mock.Anything).Maybe().Return(
+		func(_ context.Context, req *connect_go.Request[runnerv1.UpdateLogRequest]) (*connect_go.Response[runnerv1.UpdateLogResponse], error) {
+			return connect_go.NewResponse(&runnerv1.UpdateLogResponse{
+				AckIndex: req.Msg.Index + int64(len(req.Msg.Rows)),
+			}), nil
+		},
+	)
+	client.On("UpdateTask", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, _ *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
+			updateTaskCalls.Add(1)
+			return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
+		},
+	)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	taskCtx, err := structpb.NewStruct(map[string]any{})
+	require.NoError(t, err)
+
+	// Custom config: large state interval so only stateNotify can trigger
+	cfg, _ := config.LoadDefault("")
+	cfg.Runner.StateReportInterval = 10 * time.Second
+	cfg.Runner.LogReportInterval = 10 * time.Second
+
+	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
+	reporter.ResetSteps(1)
+	reporter.RunDaemon()
+	defer func() {
+		_ = reporter.Close("")
+	}()
+
+	// Fire a log entry that starts a step — this triggers stateNotify
+	require.NoError(t, reporter.Fire(&log.Entry{
+		Message: "step starting",
+		Data:    log.Fields{"stage": "Main", "stepNumber": 0, "raw_output": true},
+	}))
+
+	// stateNotify should trigger immediate UpdateTask call
+	assert.Eventually(t, func() bool {
+		return updateTaskCalls.Load() > 0
+	}, 500*time.Millisecond, 10*time.Millisecond,
+		"step transition should have triggered immediate state flush via stateNotify")
 }
Author	SHA1	Message	Date
Bo-Yi Wu	40dcee0991	chore(deps): upgrade golangci-lint from v2.10.1 to v2.11.4 (#821 ) ## Summary - Bump golangci-lint from v2.10.1 to v2.11.4 - Remove unused `//nolint:revive` directive on metrics package declaration (detected by stricter nolintlint in new version) ## Changes between v2.10.1 and v2.11.4 - v2.11.0 — Multiple linter dependency upgrades, Go 1.26 support - v2.11.2 — Bug fix for `fmt` with path - v2.11.3 — gosec update - v2.11.4 — Dependency updates (sqlclosecheck, noctx, etc.) No breaking changes. Reviewed-on: https://gitea.com/gitea/act_runner/pulls/821 Co-authored-by: Bo-Yi Wu <appleboy.tw@gmail.com> Co-committed-by: Bo-Yi Wu <appleboy.tw@gmail.com>	2026-04-15 03:56:34 +00:00
Bo-Yi Wu	f33e5a6245	feat: add Prometheus metrics endpoint for runner observability (#820 ) ## What Add an optional Prometheus `/metrics` HTTP endpoint to `act_runner` so operators can observe runner health, polling behavior, job outcomes, and RPC latency without scraping logs. New surface: - `internal/pkg/metrics/metrics.go` — metric definitions, custom `Registry`, static Go/process collectors, label constants, `ResultToStatusLabel` helper. - `internal/pkg/metrics/server.go` — hardened `http.Server` serving `/metrics` and `/healthz` with Slowloris-safe timeouts (`ReadHeaderTimeout` 5s, `ReadTimeout`/`WriteTimeout` 10s, `IdleTimeout` 60s) and a 5s graceful shutdown. - `daemon.go` wires it up behind `cfg.Metrics.Enabled` (disabled by default). - `poller.go` / `reporter.go` / `runner.go` instrument their existing hot paths with counters/histograms/gauges — no behavior change. Metrics exported (namespace `act_runner_`): \| Subsystem \| Metric \| Type \| Labels \| \|---\|---\|---\|---\| \| — \| `info` \| Gauge \| `version`, `name` \| \| — \| `capacity`, `uptime_seconds` \| Gauge \| — \| \| `poll` \| `fetch_total`, `client_errors_total` \| Counter \| `result` / `method` \| \| `poll` \| `fetch_duration_seconds`, `backoff_seconds` \| Histogram / Gauge \| — \| \| `job` \| `total` \| Counter \| `status` \| \| `job` \| `duration_seconds`, `running`, `capacity_utilization_ratio` \| Histogram / GaugeFunc \| — \| \| `report` \| `log_total`, `state_total` \| Counter \| `result` \| \| `report` \| `log_duration_seconds`, `state_duration_seconds` \| Histogram \| — \| \| `report` \| `log_buffer_rows` \| Gauge \| — \| \| — \| `go_`, `process_` \| standard collectors \| — \| All label values are predefined constants — no high-cardinality labels (no task IDs, repo URLs, branches, tokens, or secrets) so scraping is safe and bounded. ## Why Teams self-hosting Gitea + `act_runner` at scale need to answer basic SRE questions that are currently invisible: - How often are RPCs failing? 
Which RPC? (`act_runner_client_errors_total`) - Are runners saturated? (`act_runner_job_capacity_utilization_ratio`, `act_runner_job_running`) - How long do jobs take? (`act_runner_job_duration_seconds`) - Is polling backing off? (`act_runner_poll_backoff_seconds`, `act_runner_poll_fetch_total{result=\"error\"}`) - Are log/state reports slow? (`act_runner_report_{log,state}_duration_seconds`) - Is the log buffer draining? (`act_runner_report_log_buffer_rows`) Today operators have to grep logs. This PR makes all of the above first-class metrics so they can feed dashboards and alerts (`rate(act_runner_client_errors_total[5m]) > 0.1`, capacity saturation alerts, etc.). The endpoint is disabled by default and binds to `127.0.0.1:9101` when enabled, so it's opt-in and safe for existing deployments. ## How ### Config ```yaml metrics: enabled: false # opt-in addr: 127.0.0.1:9101 # change to 0.0.0.0:9101 only behind a reverse proxy ``` `config.example.yaml` documents both fields plus a security note about binding externally without auth. ### Wiring 1. `daemon.go` calls `metrics.Init()` (guarded by `sync.Once`), sets `act_runner_info`, `act_runner_capacity`, registers uptime + running-jobs GaugeFuncs, then starts the server goroutine with the daemon context — it shuts down cleanly on `ctx.Done()`. 2. `poller.fetchTask` observes RPC latency / result / error counters. `DeadlineExceeded` (long-poll idle) is treated as an empty result and not observed into the histogram so the 5s timeout doesn't swamp the buckets. 3. `poller.pollOnce` reports `poll_backoff_seconds` using the pre-jitter base interval (the true backoff level), and only when it changes — prevents noisy no-op gauge updates at the `FetchIntervalMax` plateau. 4. `reporter.ReportLog` / `ReportState` record duration histograms and success/error counters; `log_buffer_rows` is updated only when the value changes, guarded by the already-held `clientM`. 5. 
`runner.Run` observes `job_duration_seconds` and increments `job_total` by outcome via `metrics.ResultToStatusLabel`. ### Safety / security review - All timeouts set; Slowloris-safe. - Custom `prometheus.NewRegistry()` — no global registration side-effects. - No sensitive data in labels (reviewed every instrumentation site). - Single new dependency: `github.com/prometheus/client_golang v1.23.2`. - Endpoint is unauthenticated by design and documented as such; default localhost bind mitigates exposure. Operators exposing externally should front it with a reverse proxy. ## Verification ### Unit tests \`\`\`bash go build ./... go vet ./... go test ./... \`\`\` ### Manual smoke test 1. Enable metrics in `config.yaml`: \`\`\`yaml metrics: enabled: true addr: 127.0.0.1:9101 \`\`\` 2. Start the runner against a Gitea instance: \`./act_runner daemon\`. 3. Scrape the endpoint: \`\`\`bash curl -s http://127.0.0.1:9101/metrics \| grep '^act_runner_' curl -s http://127.0.0.1:9101/healthz # → ok \`\`\` 4. Confirm the static series appear immediately: \`act_runner_info\`, \`act_runner_capacity\`, \`act_runner_uptime_seconds\`, \`act_runner_job_running\`, \`act_runner_job_capacity_utilization_ratio\`. 5. Trigger a workflow and confirm counters increment: \`act_runner_poll_fetch_total{result=\"task\"}\`, \`act_runner_job_total{status=\"success\"}\`, \`act_runner_report_log_total{result=\"success\"}\`. 6. Leave the runner idle and confirm \`act_runner_poll_backoff_seconds\` settles (and does not churn on every poll). 7. Ctrl-C and confirm a clean \"metrics server shutdown\" log line (no port-in-use error on restart within 5s). 
### Prometheus integration Add to \`prometheus.yml\`: \`\`\`yaml scrape_configs: - job_name: act_runner static_configs: - targets: ['127.0.0.1:9101'] \`\`\` Sample alert to try: \`\`\` sum(rate(act_runner_client_errors_total[5m])) by (method) > 0.1 \`\`\` ## Out of scope (follow-ups) - TLS and auth on the metrics endpoint (mitigated today by localhost default; add when operators need external scraping). - Per-task labels (intentionally avoided for cardinality safety). --- 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://gitea.com/gitea/act_runner/pulls/820 Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com> Co-authored-by: Bo-Yi Wu <appleboy.tw@gmail.com> Co-committed-by: Bo-Yi Wu <appleboy.tw@gmail.com>	2026-04-15 01:27:34 +00:00
Bo-Yi Wu	f2d545565f	perf: reduce runner-to-server connection load with adaptive reporting and polling (#819 ) ## Summary Many teams self-host Gitea + Act Runner at scale. The current runner design causes excessive HTTP requests to the Gitea server, leading to high server load. This PR addresses three root causes: aggressive fixed-interval polling, per-task status reporting every 1 second regardless of activity, and unoptimized HTTP client configuration. ## Problem The original architecture has these issues: 1. Fixed 1-second reporting interval (RunDaemon) - Every running task calls ReportLog + ReportState every 1 second (2 HTTP requests/sec/task) - These requests are sent even when there are no new log rows or state changes - With 200 runners × 3 tasks each = 1,200 req/sec just for status reporting 2. Fixed 2-second polling interval (no backoff) - Idle runners poll FetchTask every 2 seconds forever, even when no jobs are queued - No exponential backoff or jitter — all runners can synchronize after network recovery (thundering herd) - 200 idle runners = 100 req/sec doing nothing useful 3. 
HTTP client not tuned - Uses http.DefaultClient with MaxIdleConnsPerHost=2, causing frequent TCP/TLS reconnects - Creates two separate http.Client instances (one for Ping, one for Runner service) instead of sharing Total: ~1,300 req/sec for 200 runners with 3 tasks each ## Solution ### Adaptive Event-Driven Log Reporting Replace the recursive `time.AfterFunc(1s)` pattern in RunDaemon with a goroutine-based select event loop using three trigger mechanisms: \| Trigger \| Default \| Purpose \| \|---------\|---------\|---------\| \| `log_report_max_latency` \| 3s \| Guarantee even a single log line is delivered within this time \| \| `log_report_interval` \| 5s \| Periodic sweep — steady-state cadence \| \| `log_report_batch_size` \| 100 rows \| Immediate flush during bursty output (e.g., npm install) \| Key design: `log_report_max_latency` (3s) must be less than `log_report_interval` (5s) so the max-latency timer fires before the periodic ticker for single-line scenarios. State reporting is decoupled to its own `state_report_interval` (default 5s), with immediate flush on step transitions (start/stop) via a stateNotify channel for responsive frontend UX. 
Additionally: - Skip ReportLog when `len(rows) == 0` (no pending log rows) - Skip ReportState when `stateChanged == false && len(outputs) == 0` (nothing changed) - Move expensive `proto.Clone` after the early-return check to avoid deep copies on no-op paths ### Polling Backoff with Jitter Replace fixed `rate.Limiter` with adaptive exponential backoff: - Track `consecutiveEmpty` and `consecutiveErrors` counters - Interval doubles with each empty/error response: `base × 2^(n-1)`, capped at `fetch_interval_max` (default 60s) - Add ±20% random jitter to prevent thundering herd - Fetch first, sleep after — preserves burst=1 behavior for immediate first fetch on startup and after task completion ### HTTP Client Tuning - Configure custom `http.Transport` with `MaxIdleConnsPerHost=10` (was 2) - Share a single `http.Client` between PingService and RunnerService - Add `IdleConnTimeout=90s` for clean connection lifecycle ## Load Reduction For 200 runners × 3 tasks (70% with active log output): \| Component \| Before \| After \| Reduction \| \|-----------\|--------\|-------\|-----------\| \| Polling (idle) \| 100 req/s \| ~3.4 req/s \| 97% \| \| Log reporting \| 420 req/s \| ~84 req/s \| 80% \| \| State reporting \| 126 req/s \| ~25 req/s \| 80% \| \| Total \| ~1,300 req/s \| ~113 req/s \| ~91% \| ## Frontend UX Impact \| Scenario \| Before \| After \| Notes \| \|----------\|--------\|-------\|-------\| \| Continuous output (npm install) \| ~1s \| ~5s \| Periodic ticker sweep \| \| Single line then silence \| ~1s \| ≤3s \| maxLatencyTimer guarantee \| \| Bursty output (100+ lines) \| ~1s \| <1s \| Batch size immediate flush \| \| Step start/stop \| ~1s \| <1s \| stateNotify immediate flush \| \| Job completion \| ~1s \| ~1s \| Close() retry unchanged \| ## New Configuration Options All have safe defaults — existing config files need no changes: ```yaml runner: fetch_interval_max: 60s # Max backoff interval when idle log_report_interval: 5s # Periodic log flush interval 
log_report_max_latency: 3s # Max time a log row waits (must be < log_report_interval) log_report_batch_size: 100 # Immediate flush threshold state_report_interval: 5s # State flush interval (step transitions are always immediate) ``` Config validation warns on invalid combinations: - `fetch_interval_max < fetch_interval` → auto-corrected - `log_report_max_latency >= log_report_interval` → warning (timer would be redundant) ## Test Plan - [x] `go build ./...` passes - [x] `go test ./...` passes (all existing + 3 new tests) - [x] `golangci-lint run` — 0 issues - [x] TestReporter_MaxLatencyTimer — verifies single log line flushed by maxLatencyTimer before logTicker - [x] TestReporter_BatchSizeFlush — verifies batch size threshold triggers immediate flush - [x] TestReporter_StateNotifyFlush — verifies step transition triggers immediate state flush - [x] TestReporter_EphemeralRunnerDeletion — verifies Close/RunDaemon race safety - [x] TestReporter_RunDaemonClose_Race — verifies concurrent Close safety Reviewed-on: https://gitea.com/gitea/act_runner/pulls/819 Reviewed-by: Nicolas <173651+bircni@noreply.gitea.com> Co-authored-by: Bo-Yi Wu <appleboy.tw@gmail.com> Co-committed-by: Bo-Yi Wu <appleboy.tw@gmail.com>	2026-04-14 11:29:25 +00:00
Lunny Xiao	90c1275f0e	Upgrade yaml (#816 ) ~wait https://gitea.com/gitea/act/pulls/157~ Reviewed-on: https://gitea.com/gitea/act_runner/pulls/816 Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>	2026-03-28 16:18:47 +00:00
Zettat123	505907eb2a	Add `run_attempt` to context (#632 ) Blocked by https://gitea.com/gitea/act/pulls/126 Fix https://github.com/go-gitea/gitea/issues/33135 --------- Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com> Reviewed-on: https://gitea.com/gitea/act_runner/pulls/632 Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com> Co-authored-by: Zettat123 <zettat123@gmail.com> Co-committed-by: Zettat123 <zettat123@gmail.com>	2026-03-26 20:07:22 +00:00
silverwind	9933ea0d92	feat: add configurable bind_workdir option with workspace cleanup for DinD setups (#810 ) ## Summary Adds a `container.bind_workdir` config option that exposes the nektos/act `BindWorkdir` setting. When enabled, workspaces are bind-mounted from the host filesystem instead of Docker volumes, which is required for DinD setups where jobs use `docker compose` with bind mounts (e.g. `.:/app`). Each job gets an isolated workspace at `/workspace/<task_id>/<owner>/<repo>` to prevent concurrent jobs from the same repo interfering with each other. The task directory is cleaned up after job execution. ### Configuration ```yaml container: bind_workdir: true ``` When using this with DinD, also mount the workspace parent into the runner container and add it to `valid_volumes`: ```yaml container: valid_volumes: - /workspace/** ``` This PR was authored by Claude (AI assistant) Reviewed-on: https://gitea.com/gitea/act_runner/pulls/810 Reviewed-by: ChristopherHX <38043+christopherhx@noreply.gitea.com> Co-authored-by: silverwind <me@silverwind.io> Co-committed-by: silverwind <me@silverwind.io>	2026-03-03 10:15:06 +00:00
RoboMagus	5dd5436169	Semver tags for Docker images (#720 ) The main Gitea docker image is already distributed with proper semver tags, such that users can pin to e.g. the minor version and still pull in patch releases. This is something that has been lacking on the act runner images. This PR expands the docker image tag versioning strategy such that when `v0.2.13` is released the following image tags are produced: Basic: - `0` - `0.2` - `0.2.13` - `latest` DinD: - `0-dind` - `0.2-dind` - `0.2.13-dind` - `latest-dind` DinD-Rootless: - `0-dind-rootless` - `0.2-dind-rootless` - `0.2.13-dind-rootless` - `latest-dind-rootless` To verify the `docker/metadata-action` produces the expected results in a Gitea workflow environment I've executed a release workflow. Results can be seen in [this run](https://gitea.com/RoboMagus/gitea_act_runner/actions/runs/14). (Note though that as the repository name of my fork changed from `act_runner` to `gitea_act_runner` this is reflected in the produced docker tags in this test run!) --------- Co-authored-by: RoboMagus <68224306+RoboMagus@users.noreply.github.com> Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com> Co-authored-by: techknowlogick <techknowlogick@noreply.gitea.com> Reviewed-on: https://gitea.com/gitea/act_runner/pulls/720 Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com> Co-authored-by: RoboMagus <robomagus@noreply.gitea.com> Co-committed-by: RoboMagus <robomagus@noreply.gitea.com>	2026-02-25 19:09:53 +00:00