# Full deploy descriptor for the Scrabble test contour: backend + gateway +
# Postgres + the Telegram connector (with its VPN sidecar) + the observability
# stack (OTel Collector -> Prometheus + Tempo -> Grafana). Driven by
# .gitea/workflows/ci.yaml (`docker compose up -d --build`); env values are
# interpolated from Gitea Actions TEST_ secrets/variables exported by the deploy
# job (see deploy/.env.example for the unprefixed names).
#
# Config bind sources are prefixed with ${SCRABBLE_CONFIG_DIR:-.}: locally they bind
# straight from this directory, but CI seeds them to a stable host path and sets
# SCRABBLE_CONFIG_DIR to it, because the runner's checkout is ephemeral (act removes
# it after the job) and the bind mounts must outlive the job in the long-running
# containers (see .gitea/workflows/ci.yaml + deploy/README.md).
#
# Networking (mirrors ../galaxy-game):
#   - `internal` (scrabble-internal): all inter-service traffic, project-private
#     DNS so service names never collide on the shared `edge` network.
#   - `edge` (external): the host caddy reaches this contour at `scrabble:80`
#     (the in-compose caddy's alias). The in-compose caddy serves plain HTTP only
#     in the test contour; the host caddy terminates TLS and forwards. For prod
#     (no host caddy) set CADDY_SITE_ADDRESS to the domain so the in-compose caddy
#     does its own ACME — the contour is then self-contained.
#   - The connector egresses to api.telegram.org through the `vpn` sidecar
#     (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`.
name: scrabble

services:
  postgres:
    container_name: scrabble-postgres
    image: postgres:17-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: ${POSTGRES_DB:-scrabble}
      POSTGRES_USER: ${POSTGRES_USER:-scrabble}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}
    # Dependants that wait on `service_healthy` (the backend) are gated by this probe.
    healthcheck:
      test:
        - "CMD-SHELL"
        - "pg_isready -U ${POSTGRES_USER:-scrabble} -d ${POSTGRES_DB:-scrabble}"
      interval: 5s
      timeout: 3s
      retries: 30
    volumes:
      - postgres-data:/var/lib/postgresql/data
    # Starting limits for R7. 512M covers the default 128 MB shared_buffers plus
    # per-connection memory with room to spare (the R2 peak was 28 backends at
    # 69 MiB RSS). Tighten once the run is done.
    deploy:
      resources:
        limits:
          cpus: "2.0"
          memory: 512M
    networks:
      - internal

  backend:
    container_name: scrabble-backend
    image: scrabble-backend:latest
    build:
      context: ..
      dockerfile: backend/Dockerfile
      args:
        DICT_VERSION: ${DICT_VERSION:-v1.0.0}
    restart: unless-stopped
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      # The DSN pins search_path=backend, the schema that migration 00001 creates.
      BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend
      BACKEND_HTTP_ADDR: ":8080"
      BACKEND_GRPC_ADDR: ":9090"
      BACKEND_CONNECTOR_ADDR: telegram:9091
      BACKEND_LOG_LEVEL: ${LOG_LEVEL:-info}
      BACKEND_SERVICE_NAME: scrabble-backend
      BACKEND_OTEL_TRACES_EXPORTER: otlp
      BACKEND_OTEL_METRICS_EXPORTER: otlp
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
      # Keep GOMAXPROCS equal to the CPU limit below: left alone, the Go runtime
      # sees every host core rather than the cgroup quota.
      GOMAXPROCS: "2"
    # There is deliberately no container healthcheck — the distroless image ships
    # neither a shell nor wget. The CI post-deploy probe (GET / through caddy)
    # covers readiness instead.
    # Starting limits for R7, generous next to the R2 peak (~1 core, no more than
    # 100 MiB); they are tightened to the agreed prod values after the final
    # stress run. `docker compose up` (Compose v2) honours
    # deploy.resources.limits — it is not a swarm-only setting.
    deploy:
      resources:
        limits:
          cpus: "2.0"
          memory: 512M
    networks:
      - internal

  gateway:
    container_name: scrabble-gateway
    image: scrabble-gateway:latest
    build:
      context: ..
      dockerfile: gateway/Dockerfile
      target: gateway
      # Must stay identical to the `landing` build args (shared UI build stage).
      args:
        VITE_TELEGRAM_BOT_ID: ${VITE_TELEGRAM_BOT_ID:-}
        VITE_TELEGRAM_LINK: ${VITE_TELEGRAM_LINK:-}
        VITE_TELEGRAM_GAME_CHANNEL_NAME_EN: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_EN:-}
        VITE_TELEGRAM_GAME_CHANNEL_NAME_RU: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_RU:-}
        VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
        VITE_APP_VERSION: ${APP_VERSION:-dev}
    restart: unless-stopped
    depends_on:
      - backend
    environment:
      GATEWAY_HTTP_ADDR: ":8081"
      GATEWAY_BACKEND_HTTP_URL: http://backend:8080
      GATEWAY_BACKEND_GRPC_ADDR: backend:9090
      GATEWAY_CONNECTOR_ADDR: telegram:9091
      GATEWAY_DEFAULT_SUPPORTED_LANGUAGES: ${GATEWAY_DEFAULT_SUPPORTED_LANGUAGES:-en,ru}
      GATEWAY_LOG_LEVEL: ${LOG_LEVEL:-info}
      GATEWAY_SERVICE_NAME: scrabble-gateway
      GATEWAY_OTEL_TRACES_EXPORTER: otlp
      GATEWAY_OTEL_METRICS_EXPORTER: otlp
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
      # Same GOMAXPROCS-to-CPU-limit pairing as the backend service.
      GOMAXPROCS: "2"
      # GATEWAY_ADMIN_* is left unset on purpose: in the deployed contour the
      # front caddy owns the /_gm Basic-Auth and sends /_gm straight to the backend.
    # Starting limits for R7, generous next to the R2 peak (~1 core, no more than
    # 100 MiB); tighten after the final stress run.
    deploy:
      resources:
        limits:
          cpus: "2.0"
          memory: 512M
    networks:
      - internal

  # --- Landing (static) -------------------------------------------------------
  # The public landing page runs in a caddy container of its own. The contour
  # caddy sends the catch-all (notably /) here, leaving the gateway with only
  # /app/, /telegram/ and the Connect edge. It reuses the UI build stage of the
  # gateway Dockerfile — keeping the build args identical keeps that stage a
  # single cached build.
  landing:
    container_name: scrabble-landing
    image: scrabble-landing:latest
    build:
      context: ..
      dockerfile: gateway/Dockerfile
      target: landing
      args:
        VITE_TELEGRAM_BOT_ID: ${VITE_TELEGRAM_BOT_ID:-}
        VITE_TELEGRAM_LINK: ${VITE_TELEGRAM_LINK:-}
        VITE_TELEGRAM_GAME_CHANNEL_NAME_EN: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_EN:-}
        VITE_TELEGRAM_GAME_CHANNEL_NAME_RU: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_RU:-}
        VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
        VITE_APP_VERSION: ${APP_VERSION:-dev}
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 128M
    networks:
      - internal

  # --- Telegram connector (egress via the VPN sidecar) -----------------------
  # The sidecar owns the network namespace the connector joins, so the
  # `telegram` alias on the internal network is declared here.
  vpn:
    container_name: scrabble-telegram-vpn
    image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest
    restart: unless-stopped
    privileged: true
    environment:
      AWG_CONF: ${AWG_CONF:?set AWG_CONF}
    networks:
      internal:
        aliases:
          - telegram

  telegram:
    container_name: scrabble-telegram
    image: scrabble-telegram:latest
    build:
      context: ..
      dockerfile: platform/telegram/Dockerfile
    restart: unless-stopped
    depends_on:
      - vpn
    network_mode: "service:vpn"
    environment:
      # This is the ONLY container that holds the bot tokens (ARCHITECTURE.md §12).
      # The connector checks at boot that at least one token is set.
      TELEGRAM_BOT_TOKEN_EN: ${TELEGRAM_BOT_TOKEN_EN:-}
      TELEGRAM_BOT_TOKEN_RU: ${TELEGRAM_BOT_TOKEN_RU:-}
      TELEGRAM_GAME_CHANNEL_ID_EN: ${TELEGRAM_GAME_CHANNEL_ID_EN:-}
      TELEGRAM_GAME_CHANNEL_ID_RU: ${TELEGRAM_GAME_CHANNEL_ID_RU:-}
      TELEGRAM_MINIAPP_URL: ${TELEGRAM_MINIAPP_URL:?set TELEGRAM_MINIAPP_URL}
      TELEGRAM_GRPC_ADDR: ":9091"
      TELEGRAM_TEST_ENV: ${TELEGRAM_TEST_ENV:-false}
      TELEGRAM_API_BASE_URL: ${TELEGRAM_API_BASE_URL:-}
      TELEGRAM_LOG_LEVEL: ${LOG_LEVEL:-info}
      TELEGRAM_SERVICE_NAME: scrabble-telegram
      # The connector lives in the VPN sidecar's netns. Traffic to the collector's
      # internal IP stays off the tunnel (connected route), but the sidecar's DNS
      # takes over name resolution: AWG_CONF must NOT contain a `DNS=` directive,
      # otherwise `otelcol` fails to resolve ("produced zero addresses"). With no
      # DNS= the netns falls back to Docker's resolver, which resolves both
      # otelcol and api.telegram.org (see deploy/README.md).
      TELEGRAM_OTEL_TRACES_EXPORTER: otlp
      TELEGRAM_OTEL_METRICS_EXPORTER: otlp
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
      # A single P is enough: the connector is light and the stress run does not
      # drive Telegram.
      GOMAXPROCS: "1"
    deploy:
      resources:
        limits:
          cpus: "1.0"
          memory: 256M

  # --- Edge reverse proxy (single /_gm Basic-Auth; SPA + Connect -> gateway;
  #     the catch-all incl. the landing -> the static landing container) -------
  caddy:
    container_name: scrabble-caddy
    image: caddy:2-alpine
    restart: unless-stopped
    depends_on:
      - gateway
      - backend
      - grafana
      - landing
    environment:
      # Test contour: ":80", with TLS terminated by the host caddy.
      # Prod: a domain name, so this caddy runs its own ACME.
      CADDY_SITE_ADDRESS: ${CADDY_SITE_ADDRESS:-:80}
      GM_BASICAUTH_USER: ${GM_BASICAUTH_USER:-gm}
      GM_BASICAUTH_HASH: ${GM_BASICAUTH_HASH:?set GM_BASICAUTH_HASH}
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/caddy/Caddyfile:/etc/caddy/Caddyfile:ro
      - caddy-data:/data
    deploy:
      resources:
        limits:
          memory: 128M
    networks:
      internal: {}
      # `scrabble` is the name the host caddy dials on the shared edge network.
      edge:
        aliases:
          - scrabble

  # --- Observability ---------------------------------------------------------
  otelcol:
    container_name: scrabble-otelcol
    image: otel/opentelemetry-collector-contrib:0.119.0
    restart: unless-stopped
    command:
      - "--config=/etc/otelcol/config.yaml"
    # The docker_stats receiver pulls per-container metrics from the Docker API.
    # To read the socket, the collector (image UID 10001) is added to the host's
    # docker group; DOCKER_GID defaults to 989, the contour host's value — override
    # it on other hosts (prod). The socket is bind-mounted read-only. This setup
    # replaces cAdvisor, which reports empty per-container metrics on this host
    # (separate-XFS /var/lib/docker).
    # NOTE(review): `:ro` on the bind presumably does not limit which Docker API
    # calls can be made over the socket — confirm the collector is trusted.
    group_add:
      - "${DOCKER_GID:-989}"
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/otelcol/config.yaml:/etc/otelcol/config.yaml:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    deploy:
      resources:
        limits:
          memory: 512M
    networks:
      - internal

  prometheus:
    container_name: scrabble-prometheus
    image: prom/prometheus:v2.55.1
    restart: unless-stopped
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.retention.time=15d"
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    deploy:
      resources:
        limits:
          memory: 512M
    networks:
      - internal

  tempo:
    container_name: scrabble-tempo
    image: grafana/tempo:2.7.1
    restart: unless-stopped
    command:
      - "-config.file=/etc/tempo/tempo.yaml"
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
      - tempo-data:/var/tempo
    # 1G gives headroom for the final run; tempo's R2 peak was ~446 MiB.
    deploy:
      resources:
        limits:
          memory: 1G
    networks:
      - internal

  grafana:
    container_name: scrabble-grafana
    image: grafana/grafana:11.4.0
    restart: unless-stopped
    depends_on:
      - prometheus
      - tempo
    environment:
      # Grafana is served under /_gm/grafana behind caddy's Basic-Auth. Anonymous
      # access with the Admin role means the one shared caddy login is the only
      # gate, and no per-user Grafana accounts are needed.
      GF_SERVER_ROOT_URL: ${GRAFANA_ROOT_URL:-/_gm/grafana/}
      GF_SERVER_SERVE_FROM_SUB_PATH: "true"
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_ROLE: Admin
      GF_AUTH_DISABLE_LOGIN_FORM: "true"
      GF_AUTH_BASIC_ENABLED: "false"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
      # Grafana Live is switched off: its WebSocket (/_gm/grafana/api/live/ws)
      # would otherwise run into caddy's Basic-Auth and re-prompt for the password
      # on every dashboard. The dashboards poll, so they do not need Live.
      GF_LIVE_MAX_CONNECTIONS: "0"
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/grafana/provisioning:/etc/grafana/provisioning:ro
      # Dashboards are mounted under /etc/grafana, NOT /var/lib/grafana: the
      # grafana-data volume mounts over the latter, so a bind nested there is
      # shadowed and the provider logs "no such file or directory".
      - ${SCRABBLE_CONFIG_DIR:-.}/grafana/dashboards:/etc/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    deploy:
      resources:
        limits:
          memory: 512M
    networks:
      - internal

  # postgres_exporter publishes Postgres server metrics (connections, cache hit
  # ratio, transactions, database size) for Prometheus to scrape at :9187. Its DSN
  # reuses the contour Postgres credentials; sslmode=disable because the traffic
  # stays on the internal network.
  postgres_exporter:
    container_name: scrabble-postgres-exporter
    image: prometheuscommunity/postgres-exporter:v0.16.0
    restart: unless-stopped
    depends_on:
      - postgres
    environment:
      DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable
    deploy:
      resources:
        limits:
          memory: 128M
    networks:
      - internal

networks:
  # Project-private network carrying all inter-service traffic.
  internal:
    name: scrabble-internal
  # Created outside this project; Compose only attaches to it.
  edge:
    external: true

# Named volumes with default driver settings.
volumes:
  postgres-data:
  caddy-data:
  prometheus-data:
  tempo-data:
  grafana-data: