f70258849f
`docker restart galaxy-dev-backend` failed with "not a directory"
after every dev-deploy workflow run. Root cause: the compose file
bind-mounted the geoip database via a relative path
(`../../pkg/geoip/test-data/test-data/GeoIP2-Country-Test.mmdb`).
When the Gitea runner invoked `docker compose up`, the path
resolved against the runner's ephemeral workspace under
`/home/runner/.cache/act/<hash>/hostexecutor/...`. The bind source
baked into the running container therefore pointed at that
ephemeral path; the runner deleted the workspace once the workflow
finished, and any later `docker restart` could not remount.
Replace the bind with a named volume `galaxy-dev-geoip-data`,
seeded at deploy time:
- `tools/dev-deploy/docker-compose.yml`: mount
`galaxy-dev-geoip-data:/var/lib/galaxy:ro` instead of a relative
bind. Declare the volume in the top-level `volumes:` block.
- `.gitea/workflows/dev-deploy.yaml`: new `Seed geoip volume` step
(placed right after the existing UI-volume seed) copies the
fixture from `pkg/geoip/test-data/test-data/` into the named
volume via an ephemeral alpine container, the same pattern UI
seeding already uses.
- `tools/dev-deploy/Makefile`: new `seed-geoip` target performs
the same copy from the persistent checkout. `up` and `rebuild`
now depend on it, so a hand-run `make -C tools/dev-deploy up`
populates the volume without operator action.
- `tools/dev-deploy/README.md`: updated the make-targets table to
list `seed-geoip`.
- `tools/dev-deploy/KNOWN-ISSUES.md`: the entry for the restart
failure is downgraded to a "fixed" postmortem; the symptom,
cause, and where the fix lives are kept for future reference.
Verification on the dev host (this branch checked out):
$ make -C tools/dev-deploy up # populates the volume, brings stack healthy
$ docker restart galaxy-dev-backend # used to error "not a directory"
$ until [ "$(docker inspect -f '{{.State.Health.Status}}' galaxy-dev-backend)" = "healthy" ]; do sleep 2; done
$ echo "ok" # backend up 6s, healthy
The pre-existing sandbox engine `galaxy-game-80f3ce86-...` survived
both `make up` and `docker restart` untouched.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
271 lines
11 KiB
YAML
271 lines
11 KiB
YAML
# Long-lived dev environment for the Galaxy stack, deployed by the
|
|
# `dev-deploy.yaml` Gitea Actions workflow on every merge into the
|
|
# `development` branch and (optionally) by `make -C tools/dev-deploy up`
|
|
# from a developer shell on the same host.
|
|
#
|
|
# The stack is reachable from a browser only through the host Caddy on
|
|
# the machine, which terminates TLS and forwards `*.galaxy.lan` into the
|
|
# external `edge` Docker network where `galaxy-caddy` does app-routing.
|
|
# No service in this compose project binds a host port — coexistence
|
|
# with `tools/local-dev/` (which listens on localhost:5433/6380/8025/...)
|
|
# is achieved by distinct names, networks, and volumes.
|
|
#
|
|
# Browser → host-Caddy (:80/:443) → galaxy-caddy → {galaxy-api, /srv/galaxy-ui}
|
|
#
|
|
# Persistent state lives in named volumes under the `galaxy-dev-*`
|
|
# prefix, so it survives redeploys and compose rebuilds.
|
|
|
|
name: galaxy-dev
|
|
|
|
services:
|
|
galaxy-postgres:
  # PostgreSQL for the dev stack. Data lives on the
  # `galaxy-dev-postgres-data` named volume; no host port is published,
  # the service is reachable only over `galaxy-internal`.
  image: postgres:16-alpine
  container_name: galaxy-dev-postgres
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  environment:
    POSTGRES_USER: galaxy
    POSTGRES_PASSWORD: galaxy
    POSTGRES_DB: galaxy_backend
  volumes:
    - galaxy-dev-postgres-data:/var/lib/postgresql/data
  networks:
    - galaxy-internal
  healthcheck:
    # `galaxy-backend` waits on this check (`condition: service_healthy`).
    test:
      - "CMD-SHELL"
      - "pg_isready -U galaxy -d galaxy_backend"
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 5s
|
|
|
|
galaxy-redis:
  # Redis used by the gateway (`GATEWAY_REDIS_MASTER_ADDR`). No volume is
  # attached; `--appendonly no` and an empty `--save` argument switch off
  # AOF and RDB persistence.
  image: redis:7-alpine
  container_name: galaxy-dev-redis
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  command:
    - redis-server
    - --requirepass
    - galaxy-dev
    - --appendonly
    - "no"
    - --save
    - ""
  networks:
    - galaxy-internal
  healthcheck:
    # The password must match the `--requirepass` value above.
    test:
      - "CMD"
      - "redis-cli"
      - "-a"
      - "galaxy-dev"
      - "PING"
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 3s
|
|
|
|
galaxy-mailpit:
  # Mail endpoint the backend is pointed at through `BACKEND_SMTP_HOST`
  # and `BACKEND_SMTP_PORT`. No host port is published.
  image: axllent/mailpit:v1.21
  container_name: galaxy-dev-mailpit
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  healthcheck:
    # Probes the liveness endpoint on port 8025 from inside the container.
    test:
      - "CMD"
      - "wget"
      - "-q"
      - "-O-"
      - "http://localhost:8025/livez"
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 3s
|
|
|
|
galaxy-backend:
  # Backend service. Built from the repository root with the Dockerfile
  # shared with `tools/local-dev/`. Starts only after Postgres and
  # Mailpit report healthy.
  build:
    context: ../..
    dockerfile: tools/local-dev/backend.Dockerfile
  image: galaxy/backend:dev
  container_name: galaxy-dev-backend
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  # NOTE(review): runs as root, presumably so the process can talk to the
  # Docker socket mounted below — confirm.
  user: "0:0"
  depends_on:
    galaxy-postgres:
      condition: service_healthy
    galaxy-mailpit:
      condition: service_healthy
  environment:
    BACKEND_LOGGING_LEVEL: info
    BACKEND_HTTP_LISTEN_ADDR: ":8080"
    BACKEND_GRPC_PUSH_LISTEN_ADDR: ":8081"
    BACKEND_POSTGRES_DSN: "postgres://galaxy:galaxy@galaxy-postgres:5432/galaxy_backend?search_path=backend&sslmode=disable"
    BACKEND_SMTP_HOST: galaxy-mailpit
    BACKEND_SMTP_PORT: "1025"
    BACKEND_SMTP_FROM: "galaxy-backend@galaxy.lan"
    BACKEND_SMTP_TLS_MODE: none
    # Must match the `name:` of the `galaxy-internal` network.
    BACKEND_DOCKER_NETWORK: galaxy-dev-internal
    BACKEND_STACK_LABEL: dev-deploy
    # NOTE(review): compose substitutes an empty string when
    # `GALAXY_DEV_GAME_STATE_DIR` is unset; the Makefile exports it, but
    # a bare `docker compose` call must provide it as well.
    BACKEND_GAME_STATE_ROOT: ${GALAXY_DEV_GAME_STATE_DIR}
    # File inside the `galaxy-dev-geoip-data` volume mounted below.
    BACKEND_GEOIP_DB_PATH: /var/lib/galaxy/geoip.mmdb
    BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
    BACKEND_MAIL_WORKER_INTERVAL: 500ms
    BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
    BACKEND_OTEL_TRACES_EXPORTER: none
    BACKEND_OTEL_METRICS_EXPORTER: none
    # Long-lived dev environment always opts into the fixed-code
    # override so a returning developer can sign in with `123456`
    # even after the matching browser session was cleared (the real
    # bcrypt-hashed code is single-use). Set the var to an empty
    # string in `.env` to disable. The `${VAR-default}` form (no colon)
    # is deliberate: `${VAR:-default}` would also replace an empty
    # value with the default, which makes the opt-out impossible.
    BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE-123456}
    # Long-lived dev environment always bootstraps the "Dev Sandbox"
    # game owned by this email so a freshly redeployed stack already
    # has one ready-to-play game in the lobby. Set the variable to an
    # empty string in `.env` to disable the bootstrap (e.g. for a
    # cold-start QA pass). Uses `${VAR-default}` for the same reason as
    # the fixed-code override above.
    BACKEND_DEV_SANDBOX_EMAIL: ${BACKEND_DEV_SANDBOX_EMAIL-dev@galaxy.lan}
    BACKEND_DEV_SANDBOX_ENGINE_IMAGE: ${BACKEND_DEV_SANDBOX_ENGINE_IMAGE:-galaxy-engine:dev}
    BACKEND_DEV_SANDBOX_ENGINE_VERSION: ${BACKEND_DEV_SANDBOX_ENGINE_VERSION:-0.1.0}
    BACKEND_DEV_SANDBOX_PLAYER_COUNT: ${BACKEND_DEV_SANDBOX_PLAYER_COUNT:-20}
  volumes:
    # Docker socket: the backend asks the daemon to spawn engine
    # containers.
    - /var/run/docker.sock:/var/run/docker.sock
    # Game-state root must resolve to the same absolute path inside
    # the backend container and on the Docker daemon host, because
    # backend hands that path to the daemon as a bind-mount source when
    # it spawns engine containers. The dev environment uses a distinct
    # prefix from `tools/local-dev/` so the two stacks do not collide
    # on the same host. The Makefile exports
    # `GALAXY_DEV_GAME_STATE_DIR` to `${HOME}/.galaxy-dev/game-state`
    # by default, so a non-root runner user can write to it without
    # sudo.
    - type: bind
      source: ${GALAXY_DEV_GAME_STATE_DIR}
      target: ${GALAXY_DEV_GAME_STATE_DIR}
      bind:
        create_host_path: true
    # The geoip database lives on a named volume seeded by the
    # `dev-deploy.yaml` workflow (or by `make seed-geoip` when
    # bringing the stack up by hand). A bind-mount with a relative
    # path would resolve against the runner's ephemeral workspace
    # under /home/runner/.cache/act/<hash>/, which the runner
    # deletes after the workflow ends — and the next
    # `docker restart galaxy-dev-backend` would then fail with
    # "not a directory" because the mount source vanished.
    - galaxy-dev-geoip-data:/var/lib/galaxy:ro
  networks:
    - galaxy-internal
  healthcheck:
    # `galaxy-api` waits on this check (`condition: service_healthy`).
    test:
      - "CMD"
      - "wget"
      - "-q"
      - "-O-"
      - "http://localhost:8080/healthz"
    interval: 3s
    timeout: 3s
    retries: 60
    start_period: 10s
|
|
|
|
galaxy-api:
  # Gateway in front of the backend. Starts only after the backend and
  # Redis report healthy.
  build:
    context: ../..
    dockerfile: tools/local-dev/gateway.Dockerfile
  image: galaxy/gateway:dev
  container_name: galaxy-dev-api
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  depends_on:
    galaxy-backend:
      condition: service_healthy
    galaxy-redis:
      condition: service_healthy
  environment:
    GATEWAY_LOG_LEVEL: info
    GATEWAY_PUBLIC_HTTP_ADDR: ":8080"
    GATEWAY_AUTHENTICATED_GRPC_ADDR: ":9090"
    GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
    GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
    GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
    # Path of the key file bind-mounted under `volumes:` below.
    GATEWAY_RESPONSE_SIGNER_PRIVATE_KEY_PEM_PATH: /run/secrets/gateway-response.pem
    GATEWAY_REDIS_MASTER_ADDR: "galaxy-redis:6379"
    # Must match the `--requirepass` value of `galaxy-redis`.
    GATEWAY_REDIS_PASSWORD: galaxy-dev
    # UI lives on https://www.galaxy.lan; the API is on
    # https://api.galaxy.lan. Browsers therefore issue cross-origin
    # requests to the gateway and need an explicit allow-list.
    GATEWAY_PUBLIC_HTTP_CORS_ALLOWED_ORIGINS: "https://www.galaxy.lan"
    GATEWAY_AUTHENTICATED_GRPC_CORS_ALLOWED_ORIGINS: "https://www.galaxy.lan"
    # Anti-abuse defaults are looser than production: the dev
    # environment is shared by a handful of trusted testers who
    # frequently hammer the same identity to reproduce flows.
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_MISC_MAX_BODY_BYTES: "131072"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_MISC_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_MISC_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_BROWSER_BOOTSTRAP_MAX_BODY_BYTES: "65536"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_BROWSER_ASSET_MAX_BODY_BYTES: "65536"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_IP_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_IP_RATE_LIMIT_BURST: "1000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_SESSION_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_SESSION_RATE_LIMIT_BURST: "1000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_USER_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_USER_RATE_LIMIT_BURST: "1000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_MESSAGE_CLASS_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_MESSAGE_CLASS_RATE_LIMIT_BURST: "1000"
  volumes:
    # NOTE(review): relative bind source. If the stack is brought up
    # from an ephemeral checkout, this is the same kind of mount that
    # made `docker restart galaxy-dev-backend` fail for the geoip
    # database — verify that `docker restart galaxy-dev-api` survives a
    # workflow-driven deploy.
    - ../local-dev/keys/gateway-response.pem:/run/secrets/gateway-response.pem:ro
  networks:
    - galaxy-internal
  healthcheck:
    # `galaxy-caddy` waits on this check (`condition: service_healthy`).
    test:
      - "CMD"
      - "wget"
      - "-q"
      - "-O-"
      - "http://localhost:8080/healthz"
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 5s
|
|
|
|
galaxy-caddy:
  # In-stack Caddy that does app-routing. It is the only service attached
  # to the external `edge` network, where the host Caddy reaches it; it
  # also serves the UI bundle from the `galaxy-dev-ui-dist` volume.
  image: caddy:2.11.2-alpine
  container_name: galaxy-dev-caddy
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  depends_on:
    galaxy-api:
      condition: service_healthy
  volumes:
    # NOTE(review): relative bind source, resolved against the compose
    # file's directory. Verify that `docker restart galaxy-dev-caddy`
    # still works after a deploy from an ephemeral runner workspace.
    - ./Caddyfile.dev:/etc/caddy/Caddyfile:ro
    # Caddy's own data directory.
    - type: volume
      source: galaxy-dev-caddy-data
      target: /data
    # UI bundle, mounted read-only.
    - type: volume
      source: galaxy-dev-ui-dist
      target: /srv/galaxy-ui
      read_only: true
  networks:
    - galaxy-internal
    - edge
|
|
|
|
networks:
  # Project network shared by every service. The explicit name is what
  # `BACKEND_DOCKER_NETWORK` on `galaxy-backend` refers to.
  galaxy-internal:
    driver: bridge
    internal: false
    name: galaxy-dev-internal
  # Pre-existing network that compose attaches to but does not create.
  # The name can be overridden through `GALAXY_EDGE_NETWORK`.
  edge:
    external: true
    name: ${GALAXY_EDGE_NETWORK:-edge}
|
|
|
|
# Note: `galaxy.stack=dev-deploy` is intentionally stamped only on
|
|
# services (containers). Stamping it on networks or named volumes
|
|
# changes the compose config-hash for those resources, and on a
|
|
# subsequent `compose up` compose tries to recreate them — for the
|
|
# `galaxy-dev-postgres-data` volume that means destroying the
|
|
# database, and for `galaxy-dev-internal` it can deadlock if any
|
|
# container is still attached. Per-container labels are sufficient
|
|
# for the CI/cleanup contract; we filter containers, not volumes or
|
|
# networks.
|
|
volumes:
  # Every volume carries an explicit `name:` so the on-host volume name
  # is exactly the `galaxy-dev-*` identifier, without a compose project
  # prefix.
  # PostgreSQL data directory.
  galaxy-dev-postgres-data:
    name: galaxy-dev-postgres-data
  # Caddy `/data` directory.
  galaxy-dev-caddy-data:
    name: galaxy-dev-caddy-data
  # UI bundle served by `galaxy-caddy` from `/srv/galaxy-ui`.
  galaxy-dev-ui-dist:
    name: galaxy-dev-ui-dist
  # GeoIP database mounted read-only into `galaxy-backend`; seeded by the
  # `dev-deploy.yaml` workflow or by `make seed-geoip`.
  galaxy-dev-geoip-data:
    name: galaxy-dev-geoip-data
|