f23da88028
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 8s
CI / integration (pull_request) Successful in 12s
CI / ui (pull_request) Successful in 36s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Successful in 1m23s
Round-2 tuning, decided from the 500-player resource profile: - gateway: 2 -> 3 cores + GOMAXPROCS=3. It holds one h2c connection per player, so at 500 players it burst into the 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs the bursts. The per-connection cost is the realistic prod load. - tempo: memory 1G -> 2G. It reached the 1 GiB cap during the run (OOM risk). - backend Postgres pool: MAX_OPEN_CONNS 25 -> 40. The pool sat at its 25-conn cap (28 backends) at peak; headroom trims the p99 tail. Postgres (2c/512M) handles it. - docker log volume: a json-file rotation default (10m x 3 = 30 MiB/container) applied contour-wide via a YAML anchor; the backend logs ~14 MiB / 30 min at info under load and was previously unbounded. Log level stays info. backend/postgres stay at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores — headroom is cheap on the shared host). A validation re-run confirms the gateway fix before merge.
373 lines
15 KiB
YAML
373 lines
15 KiB
YAML
# Full deploy descriptor for the Scrabble test contour: backend + gateway +
# Postgres + the Telegram connector (with its VPN sidecar) + the observability
# stack (OTel Collector -> Prometheus + Tempo -> Grafana). Driven by
# .gitea/workflows/ci.yaml (`docker compose up -d --build`); env values are
# interpolated from Gitea Actions TEST_ secrets/variables exported by the deploy
# job (see deploy/.env.example for the unprefixed names).
#
# Config bind sources are prefixed with ${SCRABBLE_CONFIG_DIR:-.}: locally they bind
# straight from this directory, but CI seeds them to a stable host path and sets
# SCRABBLE_CONFIG_DIR to it, because the runner's checkout is ephemeral (act removes
# it after the job) and the bind mounts must outlive the job in the long-running
# containers (see .gitea/workflows/ci.yaml + deploy/README.md).
#
# Networking (mirrors ../galaxy-game):
#   - `internal` (scrabble-internal): all inter-service traffic, project-private
#     DNS so service names never collide on the shared `edge` network.
#   - `edge` (external): the host caddy reaches this contour at `scrabble:80`
#     (the in-compose caddy's alias). The in-compose caddy terminates only HTTP in
#     the test contour; the host caddy terminates TLS and forwards. For prod
#     (no host caddy) set CADDY_SITE_ADDRESS to the domain so the caddy
#     does its own ACME — the contour is then self-contained.
#   - The connector egresses to api.telegram.org through the `vpn` sidecar
#     (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`.
|
|
name: scrabble

# Shared json-file log rotation for every container. R7 measured the backend
# writing a per-request latency line at info (~14 MiB / 30 min under the
# 500-player stress peak); with no rotation that volume grows without bound.
# Three files of 10 MiB cap each container at 30 MiB. Every service opts in
# through the *default-logging alias.
x-logging: &default-logging
  options:
    max-file: '3'
    max-size: '10m'
  driver: json-file
|
|
|
|
services:
|
|
postgres:
  image: postgres:17-alpine
  container_name: scrabble-postgres
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  volumes:
    - postgres-data:/var/lib/postgresql/data
  environment:
    POSTGRES_DB: ${POSTGRES_DB:-scrabble}
    POSTGRES_USER: ${POSTGRES_USER:-scrabble}
    # Mandatory: interpolation aborts with "set POSTGRES_PASSWORD" when it is missing.
    POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}
  # Probe with pg_isready against the same user/database defaults as above;
  # 30 retries at a 5s interval, each probe limited to 3s.
  healthcheck:
    test:
      - CMD-SHELL
      - 'pg_isready -U ${POSTGRES_USER:-scrabble} -d ${POSTGRES_DB:-scrabble}'
    interval: 5s
    timeout: 3s
    retries: 30
  # R7 starting limits: 512M keeps headroom above the default 128 MB shared_buffers
  # plus per-connection memory (R2 peaked at 28 backends / 69 MiB RSS); tighten
  # after the run.
  deploy:
    resources:
      limits:
        memory: 512M
        cpus: '2.0'
|
|
|
|
backend:
  image: scrabble-backend:latest
  container_name: scrabble-backend
  build:
    context: ..
    dockerfile: backend/Dockerfile
    args:
      DICT_VERSION: ${DICT_VERSION:-v1.0.0}
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  # Wait for the Postgres healthcheck to pass before starting.
  depends_on:
    postgres:
      condition: service_healthy
  environment:
    # search_path=backend lines up with the migrations (00001 creates the schema).
    BACKEND_POSTGRES_DSN: 'postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend'
    # R7 tuned: at 500 players the pool was pinned at its 25-conn cap (28 backends
    # total); 40 leaves room for bursts. Postgres (2 cores / 512 MiB) handles it.
    BACKEND_POSTGRES_MAX_OPEN_CONNS: '40'
    BACKEND_HTTP_ADDR: ':8080'
    BACKEND_GRPC_ADDR: ':9090'
    BACKEND_CONNECTOR_ADDR: telegram:9091
    BACKEND_LOG_LEVEL: ${LOG_LEVEL:-info}
    BACKEND_SERVICE_NAME: scrabble-backend
    BACKEND_OTEL_TRACES_EXPORTER: otlp
    BACKEND_OTEL_METRICS_EXPORTER: otlp
    OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
    OTEL_EXPORTER_OTLP_INSECURE: 'true'
    # Keep GOMAXPROCS equal to the CPU limit below so the Go scheduler lines up
    # with the cgroup quota (otherwise the runtime sees every host core).
    GOMAXPROCS: '2'
  # No container healthcheck: the distroless image ships no shell/wget. Readiness
  # is covered by the CI post-deploy probe (GET / through caddy).
  #
  # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tightened to
  # the agreed prod values after the final stress run. deploy.resources.limits is
  # honoured by `docker compose up` (Compose v2), not only by swarm.
  deploy:
    resources:
      limits:
        memory: 512M
        cpus: '2.0'
|
|
|
|
gateway:
  image: scrabble-gateway:latest
  container_name: scrabble-gateway
  build:
    context: ..
    dockerfile: gateway/Dockerfile
    target: gateway
    # The landing service passes the same args to the same Dockerfile.
    args:
      VITE_TELEGRAM_BOT_ID: ${VITE_TELEGRAM_BOT_ID:-}
      VITE_TELEGRAM_LINK: ${VITE_TELEGRAM_LINK:-}
      VITE_TELEGRAM_GAME_CHANNEL_NAME_EN: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_EN:-}
      VITE_TELEGRAM_GAME_CHANNEL_NAME_RU: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_RU:-}
      VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
      VITE_APP_VERSION: ${APP_VERSION:-dev}
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  depends_on:
    - backend
  environment:
    GATEWAY_HTTP_ADDR: ':8081'
    GATEWAY_BACKEND_HTTP_URL: http://backend:8080
    GATEWAY_BACKEND_GRPC_ADDR: backend:9090
    GATEWAY_CONNECTOR_ADDR: telegram:9091
    GATEWAY_DEFAULT_SUPPORTED_LANGUAGES: ${GATEWAY_DEFAULT_SUPPORTED_LANGUAGES:-en,ru}
    GATEWAY_LOG_LEVEL: ${LOG_LEVEL:-info}
    GATEWAY_SERVICE_NAME: scrabble-gateway
    GATEWAY_OTEL_TRACES_EXPORTER: otlp
    GATEWAY_OTEL_METRICS_EXPORTER: otlp
    OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
    OTEL_EXPORTER_OTLP_INSECURE: 'true'
    # GOMAXPROCS tracks the CPU limit below (same reasoning as the backend).
    GOMAXPROCS: '3'
    # GATEWAY_ADMIN_* is left unset on purpose: in the deployed contour the front
    # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly.
  # R7 tuned: the gateway keeps one h2c connection per player, so at 500 players it
  # bursts into a 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs
  # the bursts. Per-connection overhead is the realistic prod cost — size for it.
  deploy:
    resources:
      limits:
        memory: 512M
        cpus: '3.0'
|
|
|
|
# --- Landing (static) -------------------------------------------------------
# Public landing page served from its own caddy container: the contour caddy
# sends the catch-all (notably /) here, while the gateway keeps only /app/,
# /telegram/ and the Connect edge. It reuses the gateway Dockerfile's UI build
# stage — identical build args keep that stage a single cached build.
landing:
  image: scrabble-landing:latest
  container_name: scrabble-landing
  build:
    context: ..
    dockerfile: gateway/Dockerfile
    target: landing
    args:
      VITE_TELEGRAM_BOT_ID: ${VITE_TELEGRAM_BOT_ID:-}
      VITE_TELEGRAM_LINK: ${VITE_TELEGRAM_LINK:-}
      VITE_TELEGRAM_GAME_CHANNEL_NAME_EN: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_EN:-}
      VITE_TELEGRAM_GAME_CHANNEL_NAME_RU: ${VITE_TELEGRAM_GAME_CHANNEL_NAME_RU:-}
      VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
      VITE_APP_VERSION: ${APP_VERSION:-dev}
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  # Memory-only cap; no CPU limit is set for this service.
  deploy:
    resources:
      limits:
        memory: 128M
|
|
|
|
# --- Telegram connector (egress via the VPN sidecar) -----------------------
# The sidecar carries the `telegram` alias on the internal network: the connector
# service runs with network_mode "service:vpn", so `telegram:9091` resolves here.
vpn:
  image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest
  container_name: scrabble-telegram-vpn
  restart: unless-stopped
  logging: *default-logging
  privileged: true
  environment:
    # Mandatory: interpolation aborts with "set AWG_CONF" when it is missing.
    AWG_CONF: ${AWG_CONF:?set AWG_CONF}
  networks:
    internal:
      aliases:
        - telegram
|
|
|
|
telegram:
  image: scrabble-telegram:latest
  container_name: scrabble-telegram
  build:
    context: ..
    dockerfile: platform/telegram/Dockerfile
  restart: unless-stopped
  logging: *default-logging
  # Shares the vpn service's network namespace, hence no `networks:` key here.
  network_mode: 'service:vpn'
  depends_on:
    - vpn
  environment:
    # Bot tokens exist ONLY in this container (ARCHITECTURE.md §12). At least
    # one token is required (the connector validates this at boot).
    TELEGRAM_BOT_TOKEN_EN: ${TELEGRAM_BOT_TOKEN_EN:-}
    TELEGRAM_BOT_TOKEN_RU: ${TELEGRAM_BOT_TOKEN_RU:-}
    TELEGRAM_GAME_CHANNEL_ID_EN: ${TELEGRAM_GAME_CHANNEL_ID_EN:-}
    TELEGRAM_GAME_CHANNEL_ID_RU: ${TELEGRAM_GAME_CHANNEL_ID_RU:-}
    TELEGRAM_MINIAPP_URL: ${TELEGRAM_MINIAPP_URL:?set TELEGRAM_MINIAPP_URL}
    TELEGRAM_GRPC_ADDR: ':9091'
    TELEGRAM_TEST_ENV: ${TELEGRAM_TEST_ENV:-false}
    TELEGRAM_API_BASE_URL: ${TELEGRAM_API_BASE_URL:-}
    TELEGRAM_LOG_LEVEL: ${LOG_LEVEL:-info}
    TELEGRAM_SERVICE_NAME: scrabble-telegram
    # The connector lives in the VPN sidecar's netns. Traffic to the collector's
    # internal IP stays off the tunnel (connected route), but the sidecar's DNS
    # hijacks name resolution: AWG_CONF must NOT carry a `DNS=` directive, else
    # `otelcol` won't resolve ("produced zero addresses"). Without DNS= the netns
    # uses Docker's resolver, which resolves both otelcol and api.telegram.org
    # (see deploy/README.md).
    TELEGRAM_OTEL_TRACES_EXPORTER: otlp
    TELEGRAM_OTEL_METRICS_EXPORTER: otlp
    OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
    OTEL_EXPORTER_OTLP_INSECURE: 'true'
    # The connector is light (the stress run does not drive Telegram); one P suffices.
    GOMAXPROCS: '1'
  deploy:
    resources:
      limits:
        memory: 256M
        cpus: '1.0'
|
|
|
|
# --- Edge reverse proxy (single /_gm Basic-Auth; SPA + Connect -> gateway;
#     the catch-all incl. the landing -> the static landing container) -------
caddy:
  image: caddy:2-alpine
  container_name: scrabble-caddy
  restart: unless-stopped
  logging: *default-logging
  depends_on:
    - gateway
    - backend
    - grafana
    - landing
  environment:
    # Test: ":80" (the host caddy terminates TLS). Prod: a domain, for own ACME.
    CADDY_SITE_ADDRESS: ${CADDY_SITE_ADDRESS:-:80}
    GM_BASICAUTH_USER: ${GM_BASICAUTH_USER:-gm}
    # Mandatory: interpolation aborts with "set GM_BASICAUTH_HASH" when it is missing.
    GM_BASICAUTH_HASH: ${GM_BASICAUTH_HASH:?set GM_BASICAUTH_HASH}
  volumes:
    - ${SCRABBLE_CONFIG_DIR:-.}/caddy/Caddyfile:/etc/caddy/Caddyfile:ro
    - caddy-data:/data
  deploy:
    resources:
      limits:
        memory: 128M
  # On `edge` this container is reachable under the `scrabble` alias.
  networks:
    internal: {}
    edge:
      aliases:
        - scrabble
|
|
|
|
# --- Observability ---------------------------------------------------------
otelcol:
  image: otel/opentelemetry-collector-contrib:0.119.0
  container_name: scrabble-otelcol
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  command:
    - '--config=/etc/otelcol/config.yaml'
  # The docker_stats receiver pulls per-container metrics from the Docker API, so
  # the collector (image UID 10001) is added to the host's docker group to read the
  # socket — DOCKER_GID defaults to the contour host's 989; set it for other hosts
  # (prod). The socket is bind-mounted with :ro. This replaces cAdvisor, whose
  # per-container metrics are empty on this host (separate-XFS /var/lib/docker).
  # NOTE(review): `:ro` is a property of the bind mount; confirm whether it limits
  # what the collector may request through the Docker API.
  group_add:
    - '${DOCKER_GID:-989}'
  volumes:
    - ${SCRABBLE_CONFIG_DIR:-.}/otelcol/config.yaml:/etc/otelcol/config.yaml:ro
    - /var/run/docker.sock:/var/run/docker.sock:ro
  deploy:
    resources:
      limits:
        memory: 512M
|
|
|
|
prometheus:
  image: prom/prometheus:v2.55.1
  container_name: scrabble-prometheus
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  # Config comes from the read-only bind below; TSDB retention is set to 15d.
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.retention.time=15d'
  volumes:
    - ${SCRABBLE_CONFIG_DIR:-.}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - prometheus-data:/prometheus
  deploy:
    resources:
      limits:
        memory: 512M
|
|
|
|
tempo:
  image: grafana/tempo:2.7.1
  container_name: scrabble-tempo
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  command:
    - '-config.file=/etc/tempo/tempo.yaml'
  volumes:
    - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
    - tempo-data:/var/tempo
  # R7 tuned: tempo hit the 1 GiB cap during the final run (446 MiB in R2);
  # raised to 2 GiB for headroom against OOM under sustained tracing load.
  deploy:
    resources:
      limits:
        memory: 2G
|
|
|
|
grafana:
  image: grafana/grafana:11.4.0
  container_name: scrabble-grafana
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  depends_on:
    - prometheus
    - tempo
  environment:
    # Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin means a
    # single shared login (caddy) gates it, with no per-user Grafana accounts.
    GF_SERVER_ROOT_URL: ${GRAFANA_ROOT_URL:-/_gm/grafana/}
    GF_SERVER_SERVE_FROM_SUB_PATH: 'true'
    GF_AUTH_ANONYMOUS_ENABLED: 'true'
    GF_AUTH_ANONYMOUS_ORG_ROLE: Admin
    GF_AUTH_DISABLE_LOGIN_FORM: 'true'
    GF_AUTH_BASIC_ENABLED: 'false'
    GF_USERS_ALLOW_SIGN_UP: 'false'
    GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
    # Grafana Live is switched off: its WebSocket (/_gm/grafana/api/live/ws) would
    # otherwise hit caddy's Basic-Auth and re-prompt for the password on every
    # dashboard; the dashboards poll and do not need Live.
    GF_LIVE_MAX_CONNECTIONS: '0'
  volumes:
    - ${SCRABBLE_CONFIG_DIR:-.}/grafana/provisioning:/etc/grafana/provisioning:ro
    # Dashboards are mounted under /etc/grafana (NOT /var/lib/grafana, which the
    # grafana-data volume mounts over — a nested bind there is shadowed and the
    # provider logs "no such file or directory").
    - ${SCRABBLE_CONFIG_DIR:-.}/grafana/dashboards:/etc/grafana/dashboards:ro
    - grafana-data:/var/lib/grafana
  deploy:
    resources:
      limits:
        memory: 512M
|
|
|
|
# postgres_exporter publishes Postgres server metrics (connections, cache hit
# ratio, transactions, database size); Prometheus scrapes it at :9187. The DSN
# reuses the contour Postgres credentials, with sslmode=disable on the internal
# network.
postgres_exporter:
  image: prometheuscommunity/postgres-exporter:v0.16.0
  container_name: scrabble-postgres-exporter
  restart: unless-stopped
  logging: *default-logging
  networks:
    - internal
  depends_on:
    - postgres
  environment:
    DATA_SOURCE_NAME: 'postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable'
  deploy:
    resources:
      limits:
        memory: 128M
|
|
|
|
# `edge` is declared external, so it must already exist on the host; `internal`
# is created by this project under the fixed name scrabble-internal.
networks:
  edge:
    external: true
  internal:
    name: scrabble-internal
|
|
|
|
# Named volumes with default settings, one per service that mounts persistent
# data above (caddy, grafana, postgres, prometheus, tempo).
volumes:
  caddy-data:
  grafana-data:
  postgres-data:
  prometheus-data:
  tempo-data:
|