Stage 17: path-conditional CI behind an aggregate gate + connector liveness probe; Grafana move-duration panel
- #10: a `changes` job path-filters unit/integration/ui; an always-running `gate` job aggregates them (success-or-skipped) and becomes the only required check.
- #9: deploy adds a Telegram-connector liveness probe (docker inspect: running, not restarting, stable restart count) with a VPN-handshake grace period.
- #1a: the Game-domain dashboard gains a 'Move think-time by phase (p50/p95)' panel.
- deploy README: branch protection now requires only CI / gate.
This commit is contained in:
+117
-2
@@ -1,6 +1,6 @@
|
||||
name: CI
|
||||
|
||||
# Single gated pipeline for the test contour (Stage 16). Gitea cannot express
|
||||
# Single gated pipeline for the test contour (Stage 16/17). Gitea cannot express
|
||||
# cross-workflow `needs`, so the full test suite and the auto test-deploy live in
|
||||
# one workflow.
|
||||
#
|
||||
@@ -11,6 +11,12 @@ name: CI
|
||||
# (PR or merge), so a PR into `master` is test-only; the prod deploy is a manual
|
||||
# workflow (Stage 18).
|
||||
#
|
||||
# Path-conditional jobs (Stage 17): `unit`/`integration`/`ui` run only when their
|
||||
# code changed (the `changes` job decides). Because a skipped required check would
|
||||
# block a merge under branch protection, the always-running `gate` job aggregates
|
||||
# their results and is the ONLY required status check; it passes when every
|
||||
# upstream job either succeeded or was skipped.
|
||||
#
|
||||
# Console output is kept plain (NO_COLOR + `docker compose --ansi never` +
|
||||
# `--progress plain`) so the Gitea logs stay readable.
|
||||
|
||||
@@ -21,7 +27,57 @@ on:
|
||||
branches: [development]
|
||||
|
||||
jobs:
|
||||
# changes detects which areas a PR/push touched, so the test jobs can skip when
|
||||
# irrelevant. It defaults to running everything when the diff cannot be computed.
|
||||
changes:
  runs-on: ubuntu-latest
  defaults:
    run:
      shell: bash
  # Consumed by the `if:` conditions of the path-conditional test jobs.
  outputs:
    go: ${{ steps.filter.outputs.go }}
    ui: ${{ steps.filter.outputs.ui }}
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        # Full history, so the base side of the comparison range is available
        # to `git diff` below.
        fetch-depth: 0

    - name: Detect changed paths
      id: filter
      run: |
        # Pick the comparison range: the PR base branch for pull requests,
        # otherwise the previous tip of the pushed ref. Fall back to the parent
        # commit when `before` is empty, all-zero, or not present in the clone.
        if [ "${{ github.event_name }}" = "pull_request" ]; then
          git fetch -q origin "${{ github.base_ref }}" || true
          range="origin/${{ github.base_ref }}...HEAD"
        else
          before="${{ github.event.before }}"
          if [ -z "$before" ] || [ "$before" = "0000000000000000000000000000000000000000" ] || ! git cat-file -e "${before}^{commit}" 2>/dev/null; then
            range="HEAD~1...HEAD"
          else
            range="${before}...HEAD"
          fi
        fi
        echo "comparison range: $range"
        # Default to running everything; narrow only when the diff is computable.
        go=true; ui=true
        # Test git's exit status directly. A sentinel appended to the captured
        # output is missed when git prints some paths before failing, which
        # would narrow the selection based on a partial file list.
        if files="$(git diff --name-only "$range" 2>/dev/null)"; then
          echo "changed files:"; echo "$files"
          go=false; ui=false
          # Here-strings rather than `echo ... | grep -q`: the bash step shell
          # runs with pipefail, so on a large diff grep -q exiting at the first
          # match can SIGPIPE the echo and make a real match read as "no match",
          # silently skipping the tests.
          if grep -qE '^(backend/|pkg/|gateway/|platform/|go\.work)' <<<"$files"; then go=true; fi
          if grep -qE '^ui/' <<<"$files"; then ui=true; fi
          # A workflow or deploy change re-runs everything as a safety net.
          if grep -qE '^(\.gitea/workflows/|deploy/)' <<<"$files"; then go=true; ui=true; fi
        else
          echo "diff failed; running all jobs"
        fi
        echo "selected: go=$go ui=$ui"
        echo "go=$go" >> "$GITHUB_OUTPUT"
        echo "ui=$ui" >> "$GITHUB_OUTPUT"
|
||||
|
||||
unit:
|
||||
needs: changes
|
||||
if: ${{ needs.changes.outputs.go == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
@@ -67,6 +123,8 @@ jobs:
|
||||
run: go test -count=1 ./backend/... ./pkg/... ./gateway/... ./platform/telegram/...
|
||||
|
||||
integration:
|
||||
needs: changes
|
||||
if: ${{ needs.changes.outputs.go == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
@@ -102,6 +160,8 @@ jobs:
|
||||
run: go test -tags=integration -count=1 -p=1 -parallel=1 -timeout=15m ./backend/...
|
||||
|
||||
ui:
|
||||
needs: changes
|
||||
if: ${{ needs.changes.outputs.ui == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
@@ -142,10 +202,37 @@ jobs:
|
||||
run: pnpm run test:e2e
|
||||
timeout-minutes: 5
|
||||
|
||||
# gate is the single branch-protection required check. It always runs and passes
|
||||
# only when each upstream job succeeded or was skipped (a path-filtered no-op),
|
||||
# failing the merge if any actually failed or was cancelled.
|
||||
gate:
  # `changes` is a direct dependency on purpose: when it fails or is cancelled,
  # the path-conditional jobs are all skipped, and without this check the gate
  # would report "all skipped" as a pass for a commit that was never tested.
  needs: [changes, unit, integration, ui]
  # Always run, so the required status check is reported even when upstream jobs
  # were skipped, failed or cancelled.
  if: always()
  runs-on: ubuntu-latest
  defaults:
    run:
      shell: bash
  steps:
    - name: Aggregate required checks
      run: |
        fail=
        # The path filter itself must have succeeded; a skip of the test jobs is
        # only trustworthy when `changes` actually decided it.
        changes_res="${{ needs.changes.result }}"
        echo "changes = $changes_res"
        if [ "$changes_res" != "success" ]; then
          echo "::error::changes=$changes_res"
          fail=1
        fi
        # Each test job passes the gate when it succeeded or was path-skipped;
        # any other result (failure, cancelled) fails it.
        for r in "unit:${{ needs.unit.result }}" "integration:${{ needs.integration.result }}" "ui:${{ needs.ui.result }}"; do
          name="${r%%:*}"; res="${r#*:}"
          echo "$name = $res"
          case "$res" in
            success|skipped) ;;
            *) echo "::error::$name=$res"; fail=1 ;;
          esac
        done
        [ -z "$fail" ] || { echo "one or more required jobs failed"; exit 1; }
        echo "all required jobs passed or were skipped"
|
||||
|
||||
deploy:
|
||||
# Auto test-deploy on a PR into development and on the push that merges it.
|
||||
# A PR into master is test-only (this job is skipped); prod deploy is manual.
|
||||
needs: [unit, integration, ui]
|
||||
# Gates on `gate` (so a real test failure blocks the deploy) but runs even when
|
||||
# some test jobs were path-skipped.
|
||||
needs: [gate]
|
||||
if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/development') || (github.event_name == 'pull_request' && github.base_ref == 'development') }}
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
@@ -215,6 +302,34 @@ jobs:
|
||||
docker logs --tail 50 scrabble-gateway || true
|
||||
exit 1
|
||||
|
||||
- name: Probe the Telegram connector liveness
  run: |
    set -u
    # A crash-looping connector is invisible to the gateway probe (it long-polls
    # and egresses through the VPN sidecar, with no public ingress), so the
    # container is inspected directly. Healthy means: status "running", not
    # restarting, and a restart count that does not move across a 5s window.
    container=scrabble-telegram
    # Grace period for the VPN handshake; the connector may restart a few times
    # before it settles.
    sleep 20
    attempt=0
    while [ "$attempt" -lt 20 ]; do
      attempt=$((attempt + 1))
      # A failed inspect is treated as unhealthy ("missing" / restarting=true).
      state="$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || echo missing)"
      in_restart="$(docker inspect -f '{{.State.Restarting}}' "$container" 2>/dev/null || echo true)"
      if [ "$state" != "running" ] || [ "$in_restart" != "false" ]; then
        sleep 3
        continue
      fi
      # Sample the restart count twice, 5s apart, to catch a restart loop.
      count_before="$(docker inspect -f '{{.RestartCount}}' "$container")"
      sleep 5
      count_after="$(docker inspect -f '{{.RestartCount}}' "$container")"
      if [ "$count_before" = "$count_after" ]; then
        echo "connector healthy: status=$state restarts=$count_after"
        exit 0
      fi
      echo "connector still restarting ($count_before -> $count_after); waiting"
      sleep 3
    done
    # All attempts exhausted: surface recent logs and fail the deploy.
    echo "connector not healthy; recent logs:"
    docker logs --tail 80 "$container" || true
    exit 1
|
||||
|
||||
- name: Prune dangling images
|
||||
if: always()
|
||||
run: docker image prune -f
|
||||
|
||||
+5
-2
@@ -110,5 +110,8 @@ resolves both `otelcol` and `api.telegram.org`. `GATEWAY_ADMIN_*` is intentional
|
||||
- **Host caddy** route `<domain> → scrabble:80` (the in-compose caddy serves HTTP
|
||||
in the test contour; the host caddy terminates TLS). Not needed on prod, where the
|
||||
contour caddy owns TLS (set `CADDY_SITE_ADDRESS` to the domain).
|
||||
- **Branch protection** required-status-check names are `CI / unit`,
|
||||
`CI / integration`, `CI / ui` (see [`../CLAUDE.md`](../CLAUDE.md) "Branching & CI").
|
||||
- **Branch protection** requires the single status check `CI / gate` (Stage 17).
|
||||
The `unit` / `integration` / `ui` jobs are path-conditional (they skip when their
|
||||
code did not change), and the always-running `gate` job aggregates them (passing
|
||||
when each succeeded or was skipped), so a skipped job never blocks a merge. See
|
||||
[`../CLAUDE.md`](../CLAUDE.md) "Branching & CI".
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
"tags": ["scrabble"],
|
||||
"timezone": "",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"panels": [
|
||||
@@ -54,6 +54,18 @@
|
||||
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "histogram_quantile(0.95, sum(rate(game_move_validate_duration_bucket[5m])) by (le, variant))", "legendFormat": "{{variant}}" }]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Move think-time by phase (p50 / p95)",
|
||||
"description": "Seconds a seat spent on a committed move, by game phase. Aggregates all seats including robots; per-human analysis is in the admin console.",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
|
||||
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{ "refId": "A", "expr": "histogram_quantile(0.5, sum(rate(game_move_duration_bucket[15m])) by (le, phase))", "legendFormat": "p50 {{phase}}" },
|
||||
{ "refId": "B", "expr": "histogram_quantile(0.95, sum(rate(game_move_duration_bucket[15m])) by (le, phase))", "legendFormat": "p95 {{phase}}" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user