diff --git a/.gitea/workflows/dev-deploy.yaml b/.gitea/workflows/dev-deploy.yaml index 5df15df..2a2ef6b 100644 --- a/.gitea/workflows/dev-deploy.yaml +++ b/.gitea/workflows/dev-deploy.yaml @@ -148,14 +148,80 @@ jobs: -v "${{ gitea.workspace }}/pkg/geoip/test-data/test-data:/src:ro" \ alpine sh -c 'cp /src/GeoIP2-Country-Test.mmdb /dst/geoip.mmdb' + - name: Recycle engine containers on image drift + run: | + # Compare the freshly-built `galaxy-engine:dev` SHA against + # every running `galaxy-game-*` container. The backend + # reconciler adopts pre-existing labelled engine containers + # without checking image drift, so a running sandbox would + # otherwise keep serving the previous engine code until the + # container is recycled by hand. This step makes the recycle + # automatic, but only when it is actually needed: + # + # * BuildKit cache hit on the `Build galaxy-engine image` + # step → `galaxy-engine:dev` keeps its previous SHA → + # no drift → no-op (no engine source change to deploy). + # * engine source change → fresh SHA → we stop the backend + # once and then, for each drifted container, remove it, + # wipe its bind-mounted state directory (Engine.Init() + # writes turn-0 over any pre-existing `turn-N` files — + # silent state corruption otherwise), and cascade-delete + # the lobby `games` row (the FKs in `00001_init.sql` + # drop the matching `runtime_records`, `memberships`, + # `player_mappings`, etc. in the same write). The + # `dev-sandbox` bootstrap on the next backend boot finds + # no live sandbox and provisions a fresh one on the new + # engine image. + # + # Backend is stopped first to keep the reconciler from + # racing the recycle (mid-stream adoption / restart). The + # subsequent `Bring up the stack` step restarts it. 
+          set -u
+
+          # The recycle below is destructive, so every lookup that
+          # feeds the drift decision fails closed: an unreadable image
+          # or container ID aborts the step instead of being treated
+          # as drift (an empty ID would mark every game as drifted).
+          new_sha=$(docker image inspect galaxy-engine:dev --format '{{.Id}}') || new_sha=""
+          if [[ -z "$new_sha" ]]; then
+            echo "cannot resolve galaxy-engine:dev image ID — aborting" >&2
+            exit 1
+          fi
+          echo "fresh galaxy-engine:dev = $new_sha"
+
+          running=$(docker ps --filter "name=galaxy-game-" --format '{{.Names}}') || {
+            echo "cannot list running containers — aborting" >&2
+            exit 1
+          }
+
+          # Collect the game IDs of running containers whose image ID
+          # differs from the freshly-built one.
+          drift=()
+          while IFS= read -r c; do
+            # docker's `name` filter matches any part of the name, so
+            # keep only names that really start with the prefix and
+            # carry a non-empty game ID (this also skips the empty
+            # line the here-string yields when nothing is running).
+            [[ "$c" == galaxy-game-?* ]] || continue
+            gid=${c#galaxy-game-}
+            # The ID ends up as a path component and inside a SQL
+            # literal; accept plain identifier characters only.
+            if [[ "$gid" == *[!A-Za-z0-9_.-]* || "$gid" == "." || "$gid" == ".." ]]; then
+              echo "  skip: $c (unexpected game id)" >&2
+              continue
+            fi
+            cur=$(docker inspect "$c" --format '{{.Image}}') || cur=""
+            if [[ -z "$cur" ]]; then
+              echo "cannot read image ID of $c — aborting" >&2
+              exit 1
+            fi
+            if [[ "$cur" != "$new_sha" ]]; then
+              drift+=("$gid")
+              echo "  drift: $c was on $cur"
+            else
+              echo "  match: $c"
+            fi
+          done <<<"$running"
+
+          if [[ ${#drift[@]} -eq 0 ]]; then
+            echo "no drift detected — recycle skipped"
+          else
+            # Stop the backend once, before touching any container.
+            # Best-effort (`|| true`): a failed stop, e.g. because the
+            # backend is not running, must not abort the recycle.
+            docker stop -t 30 galaxy-dev-backend >/dev/null 2>&1 || true
+            state_root="$HOME/.galaxy-dev/game-state"
+            for gid in "${drift[@]}"; do
+              echo "recycling $gid"
+              # Best-effort: removal errors are deliberately ignored.
+              docker rm -f "galaxy-game-$gid" >/dev/null 2>&1 || true
+              # Wipe the per-game state dir as root inside a throwaway
+              # container so we can remove files left behind by the
+              # engine container even when its uid differs from the
+              # runner's. The ID is handed over as a positional
+              # argument so the inner shell never re-parses it as code
+              # and the path can never collapse to `/state/`.
+              docker run --rm -v "$state_root:/state" alpine \
+                sh -c 'rm -rf -- "/state/$1"' sh "$gid"
+            done
+            # Build `'id1','id2',...`; quoting is safe because the IDs
+            # were restricted to identifier characters above.
+            ids_csv=$(printf "'%s'," "${drift[@]}")
+            ids_csv=${ids_csv%,}
+            # Kept as the last command so a failed delete fails the step.
+            docker exec galaxy-dev-postgres psql -v ON_ERROR_STOP=1 \
+              -U galaxy -d galaxy_backend \
+              -c "DELETE FROM backend.games WHERE game_id IN (${ids_csv});"
+          fi
+
       - name: Reap stray dev-deploy containers
         run: |
           # Remove any non-running compose-managed containers from
           # earlier deploys before `compose up`. Filter by the stack
           # label so we never touch unrelated workloads on the same
-          # daemon. Running containers (incl. engine instances backend
-          # spawned itself with the same label) are left intact —
-          # those are reattached by the backend reconciler on boot.
+          # daemon. Running engine containers spawned by backend with
+          # the same label are left intact when their image SHA still
+          # matches the freshly-built `galaxy-engine:dev` (handled by
+          # the preceding `Recycle engine containers on image drift`
+          # step); the reconciler reattaches them on backend boot.
ids=$(docker ps -aq \ --filter "label=galaxy.stack=dev-deploy" \ --filter "status=exited" \ diff --git a/tools/dev-deploy/README.md b/tools/dev-deploy/README.md index 8b3fd13..2a7a9fe 100644 --- a/tools/dev-deploy/README.md +++ b/tools/dev-deploy/README.md @@ -235,6 +235,29 @@ The deploy is idempotent — when the PR later merges into healthcheck steps, overwriting whatever the manual dispatch left behind. There is no separate state to clean up between the two paths. +### Engine image drift recycle + +`backend` spawns one engine container per game (the long-lived "Dev +Sandbox" plus any user-created games) and the reconciler reattaches +to whatever it finds with the `galaxy.stack=dev-deploy` label. That +reattach does not check the running container's image SHA against the +freshly-built `galaxy-engine:dev` tag, so a container left running +would otherwise keep serving the previous engine code after a redeploy. + +The `dev-deploy.yaml` workflow handles this in the +`Recycle engine containers on image drift` step. The step compares the +freshly-built `galaxy-engine:dev` SHA against every running +`galaxy-game-*` container. If any container has drifted (that is, +`docker build` produced a new SHA), the step stops the backend once +and then, for each drifted container, removes it, wipes its +bind-mounted state directory (Engine.Init() would otherwise write +turn-0 over any pre-existing `turn-N` files, silently corrupting the +state), and cascade-deletes the lobby `games` row. The `dev-sandbox` +bootstrap on the next backend boot finds no live sandbox and +provisions a fresh one on the new engine image. + +When the engine sources are unchanged, the BuildKit cache hits and +the SHA stays the same — the recycle step is a no-op and the running +games keep their state across the deploy. + ## Relationship to other infrastructure - `tools/local-dev/` — single-developer playground, host-port mapped,