diff --git a/.gitea/workflows/ci.yaml b/.gitea/workflows/ci.yaml
index bfb321b..67d46a9 100644
--- a/.gitea/workflows/ci.yaml
+++ b/.gitea/workflows/ci.yaml
@@ -1,6 +1,6 @@
 name: CI
 
-# Single gated pipeline for the test contour (Stage 16). Gitea cannot express
+# Single gated pipeline for the test contour (Stage 16/17). Gitea cannot express
 # cross-workflow `needs`, so the full test suite and the auto test-deploy live in
 # one workflow.
 #
@@ -11,6 +11,12 @@ name: CI
 # (PR or merge), so a PR into `master` is test-only; the prod deploy is a manual
 # workflow (Stage 18).
 #
+# Path-conditional jobs (Stage 17): `unit`/`integration`/`ui` run only when their
+# code changed (the `changes` job decides). Because a skipped required check would
+# block a merge under branch protection, the always-running `gate` job aggregates
+# their results and is the ONLY required status check; it passes when every
+# upstream job either succeeded or was skipped.
+#
 # Console output is kept plain (NO_COLOR + `docker compose --ansi never` +
 # `--progress plain`) so the Gitea logs stay readable.
 
@@ -21,7 +27,57 @@ on:
     branches: [development]
 
 jobs:
+  # changes detects which areas a PR/push touched, so the test jobs can skip when
+  # irrelevant. It defaults to running everything when the diff cannot be computed.
+  changes:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+    outputs:
+      go: ${{ steps.filter.outputs.go }}
+      ui: ${{ steps.filter.outputs.ui }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Detect changed paths
+        id: filter
+        run: |
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            git fetch -q origin "${{ github.base_ref }}" || true
+            range="origin/${{ github.base_ref }}...HEAD"
+          else
+            before="${{ github.event.before }}"
+            if [ -z "$before" ] || [ "$before" = "0000000000000000000000000000000000000000" ] || ! git cat-file -e "${before}^{commit}" 2>/dev/null; then
+              range="HEAD~1...HEAD"
+            else
+              range="${before}...HEAD"
+            fi
+          fi
+          echo "comparison range: $range"
+          # Default to running everything; narrow only when the diff is computable.
+          go=true; ui=true
+          files="$(git diff --name-only "$range" 2>/dev/null || echo __DIFF_FAILED__)"
+          if [ "$files" != "__DIFF_FAILED__" ]; then
+            echo "changed files:"; echo "$files"
+            go=false; ui=false
+            # Here-strings, not `echo | grep -q`: under pipefail an early grep exit can SIGPIPE echo on a huge list (false "no match").
+            if grep -qE '^(backend/|pkg/|gateway/|platform/|go\.work)' <<<"$files"; then go=true; fi
+            if grep -qE '^ui/' <<<"$files"; then ui=true; fi
+            if grep -qE '^(\.gitea/workflows/|deploy/)' <<<"$files"; then go=true; ui=true; fi  # safety net: workflow/deploy change re-runs everything
+          else
+            echo "diff failed; running all jobs"
+          fi
+          echo "selected: go=$go ui=$ui"
+          echo "go=$go" >> "$GITHUB_OUTPUT"
+          echo "ui=$ui" >> "$GITHUB_OUTPUT"
+
   unit:
+    needs: changes
+    if: ${{ needs.changes.outputs.go == 'true' }}
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -67,6 +123,8 @@ jobs:
         run: go test -count=1 ./backend/... ./pkg/... ./gateway/... ./platform/telegram/...
 
   integration:
+    needs: changes
+    if: ${{ needs.changes.outputs.go == 'true' }}
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -102,6 +160,8 @@ jobs:
         run: go test -tags=integration -count=1 -p=1 -parallel=1 -timeout=15m ./backend/...
 
   ui:
+    needs: changes
+    if: ${{ needs.changes.outputs.ui == 'true' }}
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -142,10 +202,37 @@ jobs:
         run: pnpm run test:e2e
         timeout-minutes: 5
 
+  # gate is the single branch-protection required check. It always runs and passes
+  # only when each upstream job succeeded or was skipped (a path-filtered no-op),
+  # failing the merge if any actually failed or was cancelled.
+  gate:
+    needs: [changes, unit, integration, ui]
+    if: always()
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Aggregate required checks
+        run: |
+          fail=
+          # `changes` must succeed outright: if it fails, every test job is skipped, which must not read as a pass.
+          for r in "changes:${{ needs.changes.result }}" "unit:${{ needs.unit.result }}" "integration:${{ needs.integration.result }}" "ui:${{ needs.ui.result }}"; do
+            echo "${r%%:*} = ${r#*:}"
+            case "$r" in
+              *:success|unit:skipped|integration:skipped|ui:skipped) ;;
+              *) echo "::error::${r%%:*}=${r#*:}"; fail=1 ;;
+            esac
+          done
+          [ -z "$fail" ] || { echo "one or more required jobs failed"; exit 1; }
+          echo "all required jobs passed or were skipped"
+
   deploy:
     # Auto test-deploy on a PR into development and on the push that merges it.
     # A PR into master is test-only (this job is skipped); prod deploy is manual.
-    needs: [unit, integration, ui]
+    # Gates on `gate` (so a real test failure blocks the deploy) but runs even when
+    # some test jobs were path-skipped.
+    needs: [gate]
     if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/development') || (github.event_name == 'pull_request' && github.base_ref == 'development') }}
     runs-on: ubuntu-latest
     defaults:
@@ -215,6 +302,34 @@ jobs:
           docker logs --tail 50 scrabble-gateway || true
           exit 1
 
+      - name: Probe the Telegram connector liveness
+        run: |
+          set -u
+          # The gateway probe cannot see a crash-looping connector (it long-polls and
+          # egresses through the VPN sidecar, with no public ingress). Inspect the
+          # container directly: it must be running, not restarting, with a stable
+          # restart count. A grace period lets the VPN handshake settle (the connector
+          # may restart a few times first).
+          sleep 20
+          for i in $(seq 1 20); do
+            status="$(docker inspect -f '{{.State.Status}}' scrabble-telegram 2>/dev/null || echo missing)"
+            restarting="$(docker inspect -f '{{.State.Restarting}}' scrabble-telegram 2>/dev/null || echo true)"
+            if [ "$status" = "running" ] && [ "$restarting" = "false" ]; then
+              c1="$(docker inspect -f '{{.RestartCount}}' scrabble-telegram)"
+              sleep 5
+              c2="$(docker inspect -f '{{.RestartCount}}' scrabble-telegram)"
+              if [ "$c1" = "$c2" ]; then
+                echo "connector healthy: status=$status restarts=$c2"
+                exit 0
+              fi
+              echo "connector still restarting ($c1 -> $c2); waiting"
+            fi
+            sleep 3
+          done
+          echo "connector not healthy; recent logs:"
+          docker logs --tail 80 scrabble-telegram || true
+          exit 1
+
       - name: Prune dangling images
         if: always()
         run: docker image prune -f
diff --git a/deploy/README.md b/deploy/README.md
index 62ab89d..b5778f4 100644
--- a/deploy/README.md
+++ b/deploy/README.md
@@ -110,5 +110,8 @@ resolves both `otelcol` and `api.telegram.org`. `GATEWAY_ADMIN_*` is intentional
 - **Host caddy** route ` → scrabble:80` (the in-compose caddy serves HTTP in the test
   contour; the host caddy terminates TLS). Not needed on prod, where the contour caddy
   owns TLS (set `CADDY_SITE_ADDRESS` to the domain).
-- **Branch protection** required-status-check names are `CI / unit`,
-  `CI / integration`, `CI / ui` (see [`../CLAUDE.md`](../CLAUDE.md) "Branching & CI").
+- **Branch protection** requires the single status check `CI / gate` (Stage 17).
+  The `unit` / `integration` / `ui` jobs are path-conditional (they skip when their
+  code did not change), and the always-running `gate` job aggregates them (passing
+  when each succeeded or was skipped), so a skipped job never blocks a merge. See
+  [`../CLAUDE.md`](../CLAUDE.md) "Branching & CI".
diff --git a/deploy/grafana/dashboards/game-domain.json b/deploy/grafana/dashboards/game-domain.json index 90d76f9..53594c2 100644 --- a/deploy/grafana/dashboards/game-domain.json +++ b/deploy/grafana/dashboards/game-domain.json @@ -4,7 +4,7 @@ "tags": ["scrabble"], "timezone": "", "schemaVersion": 39, - "version": 1, + "version": 2, "refresh": "30s", "time": { "from": "now-24h", "to": "now" }, "panels": [ @@ -54,6 +54,18 @@ "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, "datasource": { "type": "prometheus", "uid": "prometheus" }, "targets": [{ "refId": "A", "expr": "histogram_quantile(0.95, sum(rate(game_move_validate_duration_bucket[5m])) by (le, variant))", "legendFormat": "{{variant}}" }] + }, + { + "type": "timeseries", + "title": "Move think-time by phase (p50 / p95)", + "description": "Seconds a seat spent on a committed move, by game phase. Aggregates all seats including robots; per-human analysis is in the admin console.", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { "refId": "A", "expr": "histogram_quantile(0.5, sum(rate(game_move_duration_bucket[15m])) by (le, phase))", "legendFormat": "p50 {{phase}}" }, + { "refId": "B", "expr": "histogram_quantile(0.95, sum(rate(game_move_duration_bucket[15m])) by (le, phase))", "legendFormat": "p95 {{phase}}" } + ] } ] }