diff --git a/backend/cmd/backend/main.go b/backend/cmd/backend/main.go index d6ecdc0..be1b997 100644 --- a/backend/cmd/backend/main.go +++ b/backend/cmd/backend/main.go @@ -266,6 +266,21 @@ func run(ctx context.Context) (err error) { ) runtimeGateway.svc = runtimeSvc + // Run a single reconciliation pass before the dev-sandbox + // bootstrap so any runtime row pointing at a vanished engine + // container (host reboot wiped /tmp/galaxy-game-state/; + // `tools/local-dev`'s `prune-broken-engines` target reaped the + // husk) is already cascaded through `markRemoved` → lobby + // `cancelled` by the time the bootstrap walks the sandbox list. + // Without this pre-tick the bootstrap would reuse the + // soon-to-be-cancelled game and force the developer into a + // second `make up` cycle to land a healthy sandbox. Failures are + // non-fatal: the periodic ticker started later catches up, and + // the worst case degrades to the legacy two-cycle recovery. + if err := runtimeSvc.Reconciler().Tick(ctx); err != nil { + logger.Warn("pre-bootstrap reconciler tick failed", zap.Error(err)) + } + if err := devsandbox.Bootstrap(ctx, devsandbox.Deps{ Users: userSvc, Lobby: lobbySvc, diff --git a/tools/local-dev/Makefile b/tools/local-dev/Makefile index a36cbbb..c0efd73 100644 --- a/tools/local-dev/Makefile +++ b/tools/local-dev/Makefile @@ -1,4 +1,4 @@ -.PHONY: help up down logs status rebuild clean psql logs-backend logs-gateway logs-mail build-engine stop-engines wait +.PHONY: help up down logs status rebuild clean psql logs-backend logs-gateway logs-mail build-engine stop-engines prune-broken-engines wait .DEFAULT_GOAL := help @@ -17,6 +17,7 @@ help: @echo " make rebuild Force rebuild of backend / gateway images and bring up" @echo " make build-engine Build the engine image $(ENGINE_IMAGE) used by the dev sandbox" @echo " make stop-engines Stop and remove only the per-game engine containers" + @echo " make prune-broken-engines Remove non-running engine containers Docker can't 
heal (run inside 'up')"
 	@echo " make clean Stop everything (incl. engines) and wipe volumes + game state"
 	@echo " make logs Tail all logs"
 	@echo " make logs-backend Tail only the backend logs"
@@ -32,7 +33,7 @@ help:
 	@echo "Default login for the auto-provisioned dev sandbox: dev@local.test"
 	@echo "(see BACKEND_DEV_SANDBOX_EMAIL in .env). Login code: 123456."
 
-up: build-engine
+up: build-engine prune-broken-engines
 	$(COMPOSE) up -d --wait
 
 rebuild: build-engine
@@ -70,6 +71,34 @@ stop-engines:
 		docker rm -f $$ids >/dev/null; \
 	fi
 
+# Remove engine containers Docker can no longer heal on its own.
+# After a host reboot, the per-game bind-mount source under
+# /tmp/galaxy-game-state/ may have been wiped (macOS clears
+# /private/tmp on reboot), so `restart: unless-stopped` cannot
+# revive the container — Docker refuses to start it with a missing
+# bind-mount source and leaves it stuck in `exited` / `created`
+# state. This target prunes those husks (plus `dead` ones) before
+# `compose up`; the backend's pre-bootstrap reconciler tick
+# (`backend/cmd/backend/main.go`) then cascades the orphan runtime
+# row to `removed`, the lobby cancels the game, and the dev-sandbox
+# bootstrap purges the cancelled tile and provisions a fresh
+# sandbox in the same `make up` cycle. Every other state (`running`,
+# `restarting`, `paused`, `removing`, or a failed inspect) is left
+# alone so a live or deliberately paused sandbox is never destroyed.
+prune-broken-engines:
+	@ids=""; \
+	for cid in $$(docker ps -aq --filter label=$(ENGINE_LABEL) 2>/dev/null); do \
+		state=$$(docker inspect -f '{{.State.Status}}' $$cid 2>/dev/null); \
+		case "$$state" in \
+			exited|created|dead) ids="$$ids $$cid";; \
+			*) ;; \
+		esac; \
+	done; \
+	if [ -n "$$ids" ]; then \
+		echo "removing non-running engine containers (post-reboot cleanup):$$ids"; \
+		docker rm -f $$ids >/dev/null; \
+	fi
+
 logs:
 	$(COMPOSE) logs -f --tail=100
 
diff --git a/tools/local-dev/README.md b/tools/local-dev/README.md
index 8597fb5..f21ec2e 100644
--- a/tools/local-dev/README.md
+++ b/tools/local-dev/README.md
@@ -226,6 +226,19 @@ make status docker compose ps
   engine containers via the `org.opencontainers.image.title=
   galaxy-game-engine` label. To stop them by hand without touching
   the rest of the stack, `make stop-engines`.
+- **Engine container exits with `bind source path does not exist:
+  /tmp/galaxy-game-state/` after a host reboot** — macOS clears
+  `/private/tmp` on reboot, so the per-game state directory the
+  long-lived engine container bind-mounts is gone and Docker refuses
+  to restart it under `restart: unless-stopped`. `make up` auto-heals
+  this in one cycle: `prune-broken-engines` (runs as part of `up`)
+  removes every engine container stuck in `exited` / `created` /
+  `dead` state, the backend's pre-bootstrap reconciler tick
+  cascades the orphan runtime row to `removed`, the lobby cancels
+  the matching sandbox game, and the dev-sandbox bootstrap purges
+  the cancelled tile and provisions a fresh sandbox with a brand
+  new state directory. To run the cleanup by hand without restarting
+  the rest of the stack, `make prune-broken-engines`.
 - **`make up` reports a build error mentioning `pkg/cronutil`** —
   upstream module list drifted; copy any new `pkg//` line into the
   local-dev `backend.Dockerfile` / `gateway.Dockerfile` to match