Files
galaxy-game/tools/dev-deploy/docker-compose.yml
T
Ilia Denisov 84a0ccb23f feat(dev-deploy): full observability stack (Prometheus/Grafana/Loki/Tempo)
Stand up a production-mirror monitoring stack in the long-lived dev
contour, all on galaxy-dev-internal with no host ports (reached only via
the in-repo galaxy-dev-caddy):

- Prometheus scrapes backend:9100, gateway:9191, node-exporter and
  cadvisor (30s interval, 15d retention); Loki (7d) + promtail (Docker
  service discovery by the galaxy.stack=dev-deploy label) for logs;
  Tempo (3d) for traces.
- Backend and gateway now export OTLP traces to Tempo over plaintext
  gRPC on the internal network (OTEL_EXPORTER_OTLP_INSECURE).
- Grafana provisioned as code (Prometheus/Loki/Tempo datasources plus a
  starter dashboard), served under /grafana/ via Caddy sub-path mode;
  admin password from the GALAXY_DEV_GRAFANA_ADMIN_PASSWORD secret.
- Expose the Mailpit capture UI under /mailpit/ (Caddy basic-auth +
  MP_WEBROOT) so every captured message is readable regardless of relay.
- dev-deploy.yaml seeds the monitoring config to a stable, reboot-
  surviving host path and injects the Grafana admin secret.

Per-service memory limits keep the footprint within budget. All
collector config lives under tools/dev-deploy/monitoring/ for dev/prod
parity.
2026-05-31 23:39:06 +02:00

471 lines
17 KiB
YAML

# Long-lived dev environment for the Galaxy stack, deployed by the
# `dev-deploy.yaml` Gitea Actions workflow on every merge into the
# `development` branch and (optionally) by `make -C tools/dev-deploy up`
# from a developer shell on the same host.
#
# The stack is reachable from a browser only through the host Caddy on
# the machine, which terminates TLS and forwards `*.galaxy.lan` into the
# external `edge` Docker network where `galaxy-caddy` does app-routing.
# No service in this compose project binds a host port — coexistence
# with `tools/local-dev/` (which listens on localhost:5433/6380/8025/...)
# is achieved by distinct names, networks, and volumes.
#
# Browser → host-Caddy (:80/:443) → galaxy-caddy → {galaxy-api, /srv/galaxy-ui}
#
# Persistent state lives in named volumes with the `galaxy-dev-` prefix,
# so it survives redeploys and compose rebuilds.
# Compose project name for the long-lived dev stack.
name: galaxy-dev
# Every service below sets an explicit `container_name` with the
# `galaxy-dev-` prefix and carries the `galaxy.stack: dev-deploy` label.
services:
galaxy-postgres:
  # PostgreSQL for the backend. Data persists on the
  # `galaxy-dev-postgres-data` named volume.
  container_name: galaxy-dev-postgres
  image: postgres:16-alpine
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - galaxy-dev-postgres-data:/var/lib/postgresql/data
  environment:
    # Fixed dev credentials; the backend's BACKEND_POSTGRES_DSN uses the
    # same user, password and database name.
    POSTGRES_DB: galaxy_backend
    POSTGRES_USER: galaxy
    POSTGRES_PASSWORD: galaxy
  healthcheck:
    # Probe every 3s, up to 30 retries after a 5s start period.
    test:
      - "CMD-SHELL"
      - "pg_isready -U galaxy -d galaxy_backend"
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 5s
galaxy-redis:
  # Password-protected Redis used by the gateway. Persistence is turned
  # off (`--appendonly no`, `--save ""`) and no data volume is mounted.
  container_name: galaxy-dev-redis
  image: redis:7-alpine
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  command: ["redis-server", "--requirepass", "galaxy-dev", "--appendonly", "no", "--save", ""]
  healthcheck:
    # The probe authenticates with the same password passed to
    # `--requirepass` above.
    test:
      - "CMD"
      - "redis-cli"
      - "-a"
      - "galaxy-dev"
      - "PING"
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 3s
galaxy-mailpit:
  image: axllent/mailpit:v1.21
  container_name: galaxy-dev-mailpit
  restart: unless-stopped
  # Mailpit is both the SMTP submission point and a relay: it captures
  # every message in its UI and auto-relays the ones whose recipient
  # matches GALAXY_DEV_MAIL_RELAY_MATCH to the Gmail account in the
  # secret-rendered relay config. The default match is non-routable, so
  # a stack brought up without the relay secret only captures, never sends.
  command:
    - "--smtp-relay-config=/etc/mailpit/relay.conf"
    - "--smtp-relay-matching=${GALAXY_DEV_MAIL_RELAY_MATCH:-nobody@invalid.example}"
    # Serve the capture UI under /mailpit so the host Caddy can expose it
    # at https://galaxy.lan/mailpit/ (behind basic-auth); SMTP is unaffected.
    - "--webroot=/mailpit"
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    # Relay config is mounted read-only from a named volume.
    - galaxy-dev-mailpit-config:/etc/mailpit:ro
  healthcheck:
    # `--webroot=/mailpit` moves Mailpit's HTTP routes, the `livez`
    # probe included, under that prefix. Probing the bare `/livez` would
    # return 404 and keep this container unhealthy, which in turn blocks
    # `galaxy-backend` (it waits on `service_healthy`). Keep this path in
    # sync with the `--webroot` flag above.
    test: ["CMD", "wget", "-q", "-O-", "http://localhost:8025/mailpit/livez"]
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 3s
galaxy-backend:
  # Backend service, built from the repository root with the Dockerfile
  # shared with `tools/local-dev/`.
  build:
    context: ../..
    dockerfile: tools/local-dev/backend.Dockerfile
  image: galaxy/backend:dev
  container_name: galaxy-dev-backend
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  # NOTE(review): presumably root so the process can use the bind-mounted
  # Docker socket and write to the game-state directory — confirm.
  user: "0:0"
  depends_on:
    galaxy-postgres:
      condition: service_healthy
    galaxy-mailpit:
      condition: service_healthy
  environment:
    BACKEND_LOGGING_LEVEL: info
    BACKEND_HTTP_LISTEN_ADDR: ":8080"
    BACKEND_GRPC_PUSH_LISTEN_ADDR: ":8081"
    BACKEND_POSTGRES_DSN: "postgres://galaxy:galaxy@galaxy-postgres:5432/galaxy_backend?search_path=backend&sslmode=disable"
    BACKEND_SMTP_HOST: galaxy-mailpit
    BACKEND_SMTP_PORT: "1025"
    BACKEND_SMTP_FROM: "galaxy-backend@galaxy.lan"
    BACKEND_SMTP_TLS_MODE: none
    # Must match the `name` of the `galaxy-internal` network declared at
    # the bottom of this file.
    BACKEND_DOCKER_NETWORK: galaxy-dev-internal
    BACKEND_STACK_LABEL: dev-deploy
    BACKEND_GAME_STATE_ROOT: ${GALAXY_DEV_GAME_STATE_DIR}
    BACKEND_GEOIP_DB_PATH: /var/lib/galaxy/geoip.mmdb
    BACKEND_NOTIFICATION_ADMIN_EMAIL: admin@galaxy.lan
    BACKEND_MAIL_WORKER_INTERVAL: 500ms
    BACKEND_NOTIFICATION_WORKER_INTERVAL: 500ms
    BACKEND_OTEL_TRACES_EXPORTER: otlp
    BACKEND_OTEL_PROTOCOL: grpc
    BACKEND_OTEL_ENDPOINT: "galaxy-tempo:4317"
    # Tempo's OTLP receiver is plaintext on the internal network; the
    # backend's gRPC exporter defaults to TLS, so disable it via the
    # standard SDK env (applied on top of WithEndpoint).
    OTEL_EXPORTER_OTLP_INSECURE: "true"
    # Prometheus metrics exporter: serves `/metrics` on :9100. The
    # listener stays on the internal network (not mapped to the host) and
    # is the backend target for `galaxy-prometheus`; the scrape targets
    # themselves live in monitoring/prometheus/prometheus.yml, so keep
    # the port in sync with that file.
    BACKEND_OTEL_METRICS_EXPORTER: prometheus
    BACKEND_OTEL_PROMETHEUS_LISTEN_ADDR: ":9100"
    # Operator console (`/_gm`): Basic Auth bootstrap account plus the
    # stateless CSRF key. Dev-only non-secrets, overridable via `.env`; a
    # stable CSRF key keeps console forms valid across redeploys.
    BACKEND_ADMIN_BOOTSTRAP_USER: ${BACKEND_ADMIN_BOOTSTRAP_USER:-gm}
    BACKEND_ADMIN_BOOTSTRAP_PASSWORD: ${BACKEND_ADMIN_BOOTSTRAP_PASSWORD:-gm-dev-password}
    BACKEND_ADMIN_CONSOLE_CSRF_KEY: ${BACKEND_ADMIN_CONSOLE_CSRF_KEY:-dev-admin-console-csrf-key}
    # Long-lived dev environment always opts into the fixed-code
    # override so a returning developer can sign in with `123456`
    # even after the matching browser session was cleared (the real
    # bcrypt-hashed code is single-use). Set the var to an empty
    # string in `.env` to disable.
    BACKEND_AUTH_DEV_FIXED_CODE: ${BACKEND_AUTH_DEV_FIXED_CODE:-123456}
  volumes:
    # Docker socket: the backend asks the daemon to spawn engine
    # containers.
    - /var/run/docker.sock:/var/run/docker.sock
    # Game-state root must resolve to the same absolute path inside the
    # backend container and on the Docker daemon host, because the
    # backend hands that path to the daemon as a bind-mount source when
    # it spawns engine containers. The dev environment uses a distinct
    # prefix from `tools/local-dev/` so the two stacks do not collide on
    # the same host. The Makefile exports `GALAXY_DEV_GAME_STATE_DIR` to
    # `${HOME}/.galaxy-dev/game-state` by default, so a non-root runner
    # user can write to it without sudo.
    - type: bind
      source: ${GALAXY_DEV_GAME_STATE_DIR}
      target: ${GALAXY_DEV_GAME_STATE_DIR}
      bind:
        create_host_path: true
    # The geoip database lives on a named volume seeded by the
    # `dev-deploy.yaml` workflow (or by `make seed-geoip` when
    # bringing the stack up by hand). A bind-mount with a relative
    # path would resolve against the runner's ephemeral workspace
    # under /home/runner/.cache/act/<hash>/, which the runner
    # deletes after the workflow ends — and the next
    # `docker restart galaxy-dev-backend` would then fail with
    # "not a directory" because the mount source vanished.
    - galaxy-dev-geoip-data:/var/lib/galaxy:ro
  networks:
    - galaxy-internal
  healthcheck:
    test: ["CMD", "wget", "-q", "-O-", "http://localhost:8080/healthz"]
    interval: 3s
    timeout: 3s
    retries: 60
    start_period: 10s
galaxy-api:
  # Gateway service: starts only after the backend and Redis report
  # healthy.
  build:
    context: ../..
    dockerfile: tools/local-dev/gateway.Dockerfile
  image: galaxy/gateway:dev
  container_name: galaxy-dev-api
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  depends_on:
    galaxy-backend:
      condition: service_healthy
    galaxy-redis:
      condition: service_healthy
  environment:
    GATEWAY_LOG_LEVEL: info
    GATEWAY_PUBLIC_HTTP_ADDR: ":8080"
    GATEWAY_AUTHENTICATED_GRPC_ADDR: ":9090"
    # Private admin listener exposes the Prometheus `/metrics` endpoint
    # on the internal network only (not mapped to the host). It is the
    # gateway target for `galaxy-prometheus`; the scrape targets
    # themselves live in monitoring/prometheus/prometheus.yml, so keep
    # the port in sync with that file.
    GATEWAY_ADMIN_HTTP_ADDR: ":9191"
    # Traces -> Tempo over OTLP gRPC (plaintext on the internal net).
    OTEL_SERVICE_NAME: galaxy-gateway
    OTEL_TRACES_EXPORTER: otlp
    OTEL_EXPORTER_OTLP_PROTOCOL: grpc
    OTEL_EXPORTER_OTLP_ENDPOINT: "http://galaxy-tempo:4317"
    OTEL_EXPORTER_OTLP_INSECURE: "true"
    GATEWAY_BACKEND_HTTP_URL: "http://galaxy-backend:8080"
    GATEWAY_BACKEND_GRPC_PUSH_URL: "galaxy-backend:8081"
    GATEWAY_BACKEND_GATEWAY_CLIENT_ID: dev-gateway-1
    GATEWAY_RESPONSE_SIGNER_PRIVATE_KEY_PEM_PATH: /run/secrets/gateway-response.pem
    GATEWAY_REDIS_MASTER_ADDR: "galaxy-redis:6379"
    # Same password the `galaxy-redis` service passes to `--requirepass`.
    GATEWAY_REDIS_PASSWORD: galaxy-dev
    # Single-origin deployment: the UI, public REST, and Connect-Web
    # edge share one host, so browser requests are same-origin and
    # CORS is not needed. An empty allow-list disables the CORS
    # middleware (requests pass through without Access-Control-*
    # headers). Re-populate these only if a future deploy fronts the
    # gateway on a different host than the UI.
    GATEWAY_PUBLIC_HTTP_CORS_ALLOWED_ORIGINS: ""
    GATEWAY_AUTHENTICATED_GRPC_CORS_ALLOWED_ORIGINS: ""
    # Anti-abuse defaults are looser than production: the dev
    # environment is shared by a handful of trusted testers who
    # frequently hammer the same identity to reproduce flows.
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_AUTH_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_SEND_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_CONFIRM_EMAIL_CODE_IDENTITY_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_MISC_MAX_BODY_BYTES: "131072"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_MISC_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_PUBLIC_MISC_RATE_LIMIT_BURST: "1000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_BROWSER_BOOTSTRAP_MAX_BODY_BYTES: "65536"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_BROWSER_ASSET_MAX_BODY_BYTES: "65536"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_ADMIN_MAX_BODY_BYTES: "131072"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_ADMIN_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_PUBLIC_HTTP_ANTI_ABUSE_ADMIN_RATE_LIMIT_BURST: "1000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_IP_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_IP_RATE_LIMIT_BURST: "1000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_SESSION_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_SESSION_RATE_LIMIT_BURST: "1000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_USER_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_USER_RATE_LIMIT_BURST: "1000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_MESSAGE_CLASS_RATE_LIMIT_REQUESTS: "10000"
    GATEWAY_AUTHENTICATED_GRPC_ANTI_ABUSE_MESSAGE_CLASS_RATE_LIMIT_BURST: "1000"
  volumes:
    # Response-signing key, mounted read-only at the path named by
    # GATEWAY_RESPONSE_SIGNER_PRIVATE_KEY_PEM_PATH; the key file is the
    # one kept under `tools/local-dev/keys/`.
    - ../local-dev/keys/gateway-response.pem:/run/secrets/gateway-response.pem:ro
  networks:
    - galaxy-internal
  healthcheck:
    test: ["CMD", "wget", "-q", "-O-", "http://localhost:8080/healthz"]
    interval: 3s
    timeout: 3s
    retries: 30
    start_period: 5s
galaxy-caddy:
  # In-repo Caddy: the only service attached to both the internal
  # network and the external `edge` network. It starts once `galaxy-api`
  # is healthy.
  container_name: galaxy-dev-caddy
  image: caddy:2.11.2-alpine
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
    - edge
  depends_on:
    galaxy-api:
      condition: service_healthy
  volumes:
    # Routing config and the static bundles are read-only; only Caddy's
    # own `/data` directory is writable.
    - "./Caddyfile.dev:/etc/caddy/Caddyfile:ro"
    - "galaxy-dev-caddy-data:/data"
    - "galaxy-dev-ui-dist:/srv/galaxy-ui:ro"
    - "galaxy-dev-site-dist:/srv/galaxy-site:ro"
galaxy-prometheus:
  # Metrics store. The scrape config is bind-mounted read-only from the
  # monitoring directory (`GALAXY_DEV_MONITORING_DIR`, default
  # `./monitoring`); TSDB data persists on a named volume.
  container_name: galaxy-dev-prometheus
  image: prom/prometheus:v2.55.1
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    - "galaxy-dev-prometheus-data:/prometheus"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # Keep 15 days of samples.
    - "--storage.tsdb.retention.time=15d"
    # Enables the HTTP lifecycle endpoints (config reload / shutdown).
    - "--web.enable-lifecycle"
  deploy:
    resources:
      limits:
        # Per-service memory cap.
        memory: 384m
galaxy-loki:
  # Log store. Config comes read-only from the monitoring directory;
  # data persists on the `galaxy-dev-loki-data` volume.
  container_name: galaxy-dev-loki
  image: grafana/loki:3.3.2
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/loki/loki.yml:/etc/loki/loki.yml:ro"
    - "galaxy-dev-loki-data:/loki"
  command:
    - "-config.file=/etc/loki/loki.yml"
  deploy:
    resources:
      limits:
        # Per-service memory cap.
        memory: 384m
galaxy-promtail:
  # Log shipper. The Docker socket is mounted read-only.
  # NOTE(review): presumably used for Docker service discovery filtered
  # on the `galaxy.stack=dev-deploy` label — confirm in promtail.yml.
  container_name: galaxy-dev-promtail
  image: grafana/promtail:3.3.2
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/promtail/promtail.yml:/etc/promtail/promtail.yml:ro"
    - "/var/run/docker.sock:/var/run/docker.sock:ro"
  command:
    - "-config.file=/etc/promtail/promtail.yml"
  deploy:
    resources:
      limits:
        # Per-service memory cap.
        memory: 128m
galaxy-tempo:
  # Trace store. The backend and gateway point their OTLP gRPC exporters
  # at `galaxy-tempo:4317`. Data persists on `galaxy-dev-tempo-data`.
  container_name: galaxy-dev-tempo
  image: grafana/tempo:2.7.1
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  volumes:
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/tempo/tempo.yml:/etc/tempo/tempo.yml:ro"
    - "galaxy-dev-tempo-data:/var/tempo"
  command:
    - "-config.file=/etc/tempo/tempo.yml"
  deploy:
    resources:
      limits:
        # Per-service memory cap.
        memory: 384m
galaxy-node-exporter:
  # Host-level metrics. The host's /proc, /sys and / are mounted
  # read-only and the container shares the host PID namespace.
  container_name: galaxy-dev-node-exporter
  image: prom/node-exporter:v1.8.2
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  pid: host
  networks:
    - galaxy-internal
  volumes:
    - "/proc:/host/proc:ro"
    - "/sys:/host/sys:ro"
    - "/:/rootfs:ro"
  command:
    # Point the collectors at the mounted host paths.
    - '--path.procfs=/host/proc'
    - '--path.sysfs=/host/sys'
    - '--path.rootfs=/rootfs'
    # `$$` is Compose's escape for a literal `$`, so the regex the
    # exporter receives ends in `($|/)`.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  deploy:
    resources:
      limits:
        # Per-service memory cap.
        memory: 64m
galaxy-cadvisor:
  # Per-container metrics. Runs privileged with `/dev/kmsg` passed
  # through and the host's root, /var/run, /sys, Docker data directory
  # and /dev/disk mounted read-only.
  container_name: galaxy-dev-cadvisor
  image: gcr.io/cadvisor/cadvisor:v0.49.1
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  privileged: true
  devices:
    - "/dev/kmsg"
  networks:
    - galaxy-internal
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:ro"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    - "/dev/disk/:/dev/disk:ro"
  command:
    # 30s housekeeping, Docker containers only, container labels not
    # stored.
    - "--housekeeping_interval=30s"
    - "--docker_only=true"
    - "--store_container_labels=false"
  deploy:
    resources:
      limits:
        # Per-service memory cap.
        memory: 256m
galaxy-grafana:
  # Dashboards UI, provisioned from files in the monitoring directory
  # and served from the `/grafana/` sub-path.
  container_name: galaxy-dev-grafana
  image: grafana/grafana:11.4.0
  restart: unless-stopped
  labels:
    galaxy.stack: dev-deploy
  networks:
    - galaxy-internal
  # Start-order only: wait for the three datasource containers to be
  # started, not for them to be healthy.
  depends_on:
    galaxy-prometheus:
      condition: service_started
    galaxy-loki:
      condition: service_started
    galaxy-tempo:
      condition: service_started
  environment:
    # Falls back to `admin` when GALAXY_DEV_GRAFANA_ADMIN_PASSWORD is
    # unset or empty.
    GF_SECURITY_ADMIN_PASSWORD: ${GALAXY_DEV_GRAFANA_ADMIN_PASSWORD:-admin}
    # Sub-path serving: the root URL and the sub-path switch go together.
    GF_SERVER_ROOT_URL: https://galaxy.lan/grafana/
    GF_SERVER_SERVE_FROM_SUB_PATH: "true"
    # Sign-up, usage reporting, update checks and the news feed are all
    # switched off.
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_ANALYTICS_REPORTING_ENABLED: "false"
    GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
    GF_NEWS_NEWS_FEED_ENABLED: "false"
  volumes:
    # Provisioning and dashboards are read-only; Grafana's own state
    # goes to the named volume.
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/provisioning:/etc/grafana/provisioning:ro"
    - "${GALAXY_DEV_MONITORING_DIR:-./monitoring}/grafana/dashboards:/var/lib/grafana/dashboards:ro"
    - "galaxy-dev-grafana-data:/var/lib/grafana"
  deploy:
    resources:
      limits:
        # Per-service memory cap.
        memory: 256m
networks:
  # Project-owned bridge network joined by every service. The explicit
  # name is the one the backend receives in BACKEND_DOCKER_NETWORK.
  # `internal: false` leaves the network non-isolated.
  galaxy-internal:
    driver: bridge
    name: galaxy-dev-internal
    internal: false
  # Pre-existing network (not created by this project); the name can be
  # overridden with GALAXY_EDGE_NETWORK and defaults to `edge`.
  edge:
    external: true
    name: "${GALAXY_EDGE_NETWORK:-edge}"
# Note: `galaxy.stack=dev-deploy` is intentionally stamped only on
# services (containers). Stamping it on networks or named volumes
# changes the compose config-hash for those resources, and on a
# subsequent `compose up` compose tries to recreate them — for the
# `galaxy-dev-postgres-data` volume that means destroying the
# database, and for `galaxy-dev-internal` it can deadlock if any
# container is still attached. Per-container labels are sufficient
# for the CI/cleanup contract; we filter containers, not volumes or
# networks.
volumes:
  # Each volume sets an explicit `name` equal to its key.

  # Application state and build artefacts.
  galaxy-dev-postgres-data:
    name: "galaxy-dev-postgres-data"
  galaxy-dev-geoip-data:
    name: "galaxy-dev-geoip-data"
  galaxy-dev-mailpit-config:
    name: "galaxy-dev-mailpit-config"
  galaxy-dev-caddy-data:
    name: "galaxy-dev-caddy-data"
  galaxy-dev-ui-dist:
    name: "galaxy-dev-ui-dist"
  galaxy-dev-site-dist:
    name: "galaxy-dev-site-dist"

  # Monitoring stack data.
  galaxy-dev-prometheus-data:
    name: "galaxy-dev-prometheus-data"
  galaxy-dev-loki-data:
    name: "galaxy-dev-loki-data"
  galaxy-dev-tempo-data:
    name: "galaxy-dev-tempo-data"
  galaxy-dev-grafana-data:
    name: "galaxy-dev-grafana-data"