diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 1947197..275bc9b 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -839,8 +839,8 @@ and never touch unrelated workloads on the shared daemon. | Label | Values | Set by | Used by | |-------|--------|--------|---------| -| `galaxy.stack` | `local-dev`, `dev-deploy`, `integration` | `tools/{local-dev,dev-deploy}/docker-compose.yml` for compose-managed services; backend reads `BACKEND_STACK_LABEL` and stamps engines it spawns. | `tools/{local-dev,dev-deploy}/Makefile`, `.gitea/workflows/dev-deploy.yaml`. | -| `galaxy.backend` | `1` | `backend/internal/dockerclient` adapter on every engine container. | `integration/scripts/preclean.sh`. | +| `galaxy.stack` | `local-dev`, `dev-deploy`, `integration` | `tools/{local-dev,dev-deploy}/docker-compose.yml` for compose-managed services; backend reads `BACKEND_STACK_LABEL` and stamps engines it spawns. `integration/testenv/backend.go` passes `integration` to every backend-under-test. | `tools/{local-dev,dev-deploy}/Makefile`, `.gitea/workflows/dev-deploy.yaml`, `integration/scripts/preclean.sh`. | +| `galaxy.backend` | `1` | `backend/internal/dockerclient` adapter on every engine container. | `integration/scripts/preclean.sh` — AND-combined with `galaxy.stack=integration` to leave dev-deploy / local-dev engines untouched. | | `galaxy.game_id` | `` | Backend on engine create. | Reconciler reattach loop. | | `galaxy.engine_version` | `` | Backend on engine create. | Reconciler version checks. | | `galaxy.test.kind` | `integration-image` | `integration/testenv/images.go` on local image builds. | `integration/scripts/preclean.sh` (filter for `docker rmi`). | diff --git a/integration/scripts/preclean.sh b/integration/scripts/preclean.sh index f9c2da2..ee8312c 100755 --- a/integration/scripts/preclean.sh +++ b/integration/scripts/preclean.sh @@ -7,11 +7,15 @@ # 1. 
Containers labelled `org.testcontainers=true` — every container # brought up by testcontainers-go (our backend/gateway/game plus # postgres/redis/mailpit/ryuk service containers). -# 2. Containers labelled `galaxy.backend=1` — engine instances spawned -# by backend's runtime adapter on the host Docker daemon (see -# `backend/internal/dockerclient/types.go`). These do not carry -# the testcontainers label because backend, not testcontainers, -# creates them. +# 2. Containers labelled `galaxy.backend=1` AND +# `galaxy.stack=integration` — engine instances spawned by the +# backend-under-test on the host Docker daemon (see +# `backend/internal/dockerclient/types.go` and the +# `BACKEND_STACK_LABEL=integration` env in +# `integration/testenv/backend.go`). The stack-label filter is +# what keeps dev-deploy / local-dev engines on the same host +# safe — they carry `galaxy.backend=1` too but a different +# `galaxy.stack` value, so the AND match leaves them alone. # 3. Networks labelled `org.testcontainers=true` — networks created # by testcontainers-go for cross-container wiring. # 4. Images labelled `galaxy.test.kind=integration-image` — local @@ -22,14 +26,21 @@ # What we never touch: # - Containers / images without one of the labels above. # - User-managed images and volumes. +# - dev-deploy / local-dev engines (they share the `galaxy.backend=1` +# label, but their `galaxy.stack` value differs from `integration`). set -euo pipefail remove_containers_with_label() { - local label="$1" - local description="$2" + local description="${!#}" + local labels=("${@:1:$#-1}") + local filter_args=() + local label + for label in "${labels[@]}"; do + filter_args+=("--filter" "label=$label") + done local ids - ids=$(docker ps -aq --filter "label=$label" 2>/dev/null || true) + ids=$(docker ps -aq "${filter_args[@]}" 2>/dev/null || true) if [ -z "$ids" ]; then return fi @@ -81,7 +92,7 @@ if ! 
docker info >/dev/null 2>&1; then fi remove_containers_with_label "org.testcontainers=true" "testcontainers-managed containers" -remove_containers_with_label "galaxy.backend=1" "backend-managed engine containers" +remove_containers_with_label "galaxy.backend=1" "galaxy.stack=integration" "integration-owned engine containers" remove_networks_with_label "org.testcontainers=true" "testcontainers-managed networks" remove_images_with_label "galaxy.test.kind=integration-image" "integration-built images" diff --git a/integration/testenv/backend.go b/integration/testenv/backend.go index 644ed77..adc3055 100644 --- a/integration/testenv/backend.go +++ b/integration/testenv/backend.go @@ -85,6 +85,13 @@ func StartBackend(t *testing.T, opts BackendOptions) *BackendContainer { "BACKEND_AUTH_CHALLENGE_THROTTLE_MAX": "100", "BACKEND_MAIL_WORKER_INTERVAL": "500ms", "BACKEND_NOTIFICATION_WORKER_INTERVAL": "500ms", + // Stamp galaxy.stack=integration on every engine container the + // backend-under-test spawns so the post-run preclean.sh can + // scope its cleanup to integration-owned engines and leave + // dev-deploy / local-dev stacks running on the same daemon + // untouched. See `integration/scripts/preclean.sh` and the + // "Container labels" section in `docs/ARCHITECTURE.md`. + "BACKEND_STACK_LABEL": "integration", } for k, v := range opts.Extra { env[k] = v diff --git a/tools/dev-deploy/KNOWN-ISSUES.md b/tools/dev-deploy/KNOWN-ISSUES.md index 32ab2d5..f1f1aad 100644 --- a/tools/dev-deploy/KNOWN-ISSUES.md +++ b/tools/dev-deploy/KNOWN-ISSUES.md @@ -103,14 +103,42 @@ deliberate next reproduction with `docker events --since 0` armed end-to-end on the dev host, not just the chunk after backend recreate — would pin which step emits the `destroy` on the engine. 
+### Update 2026-05-19: integration preclean identified as one cause + +A live reproduction during the post-merge auto-deploy cycle (Gitea +run #188 dev-deploy plus parallel run #190 integration) pinned one +clobbering source: `integration/scripts/preclean.sh` was unscoped +and removed *every* container labelled `galaxy.backend=1`, including +the dev-deploy engine. Timeline from the dev host: + +```text +23:10:40 backend pre-bootstrap reconciler tick: engine alive +23:10:40 dev_sandbox bootstrap: status=running +23:10:56 preclean: removing 1 backend-managed engine containers ← integration run #190 +23:11:40 reconciler: container disappeared → game cancelled +``` + +Fix landed: `BACKEND_STACK_LABEL=integration` is now passed to +every integration backend (see +`integration/testenv/backend.go`) and `preclean.sh` AND-combines +`galaxy.backend=1` with `galaxy.stack=integration`, so dev-deploy / +local-dev engines stamped with different stack values are no longer +collateral. + +This covers **push**-triggered cycles where `dev-deploy.yaml` and +`integration.yaml` run on the same Gitea host. The original +hypothesis (a `workflow_dispatch dev-deploy` solo run also losing +the engine) is *not* explained by the integration fix — manual +dispatches do not trigger `integration.yaml`. Keep this entry open +until a solo-dispatch reproduction confirms whether the symptom +still occurs. + ### Status -Parked. The bug is mildly disruptive (one redispatch + a manual -`make seed-ui`-style follow-up brings the sandbox back) and the -remaining hypotheses are speculative. If the symptom recurs, attach -the next bad-window `docker events` capture to this entry and -reopen. A `tools/dev-deploy/` rewrite may obviate the issue -entirely; that is on the project owner's medium-term list. +Partially fixed (push-triggered cycles). Solo `workflow_dispatch` +reproductions still open. 
If the symptom recurs after the +integration fix lands, capture `docker events --since 0` for the +full dispatch window and attach here. ### Workaround in use today @@ -131,3 +159,49 @@ row, `findOrCreateSandboxGame` creates a fresh one, and Unassigned. File an issue once we have the runtime / reconciler analysis above; reference this section in the issue body so future redeploys can short-circuit the diagnostic loop. + +## `docker restart galaxy-dev-backend` fails after the CI runner cleans up + +### Symptom + +`docker restart galaxy-dev-backend` from the host fails with: + +```text +Error response from daemon: ... error mounting +"/home/runner/.cache/act/{hash}/hostexecutor/pkg/geoip/test-data/test-data/GeoIP2-Country-Test.mmdb" +to rootfs at "/var/lib/galaxy/geoip.mmdb": ... not a directory +``` + +The container ends up `Exited (127)` and never comes back. + +### Cause + +`tools/dev-deploy/docker-compose.yml` mounts the geoip database via +a path relative to the compose file +(`../../pkg/geoip/test-data/test-data/GeoIP2-Country-Test.mmdb`). When +the `dev-deploy.yaml` Gitea runner invokes `docker compose up` it +resolves that relative path against the runner's ephemeral workspace +under `/home/runner/.cache/act/{hash}/hostexecutor/tools/dev-deploy/`, +so the bind-mount source baked into the running container points at +that ephemeral path. The runner deletes the workspace once the +workflow ends, the source disappears, and the next `docker restart` +fails to remount it. + +### Workaround + +Bring the stack back up from a stable workspace, which re-binds the +mount source to the persistent checkout: + +```sh +make -C tools/dev-deploy up +``` + +This restarts every service (including the broken `galaxy-dev-backend`) +with a stable source path. + +### Status + +Open. The clean fix is either to bake the geoip test fixture into +the backend image (no host bind-mount) or to copy it onto a named +volume during `dev-deploy.yaml` and bind that instead.
Either change +removes the runner-workspace dependency entirely.