R7: apply the agreed tuning from the final stress run
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 8s
CI / integration (pull_request) Successful in 12s
CI / ui (pull_request) Successful in 36s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Successful in 1m23s

Round-2 tuning, decided from the 500-player resource profile:
- gateway: 2 -> 3 cores + GOMAXPROCS=3. It holds one h2c connection per player, so
  at 500 players its bursts hit the 2-core cap (~2.49% transport_error on game.state);
  3 cores absorb the bursts. The per-connection cost is the realistic prod load, so size for it.
- tempo: memory 1G -> 2G. It reached the 1 GiB cap during the run (OOM risk).
- backend Postgres pool: BACKEND_POSTGRES_MAX_OPEN_CONNS 25 -> 40. The pool sat at its
  25-conn cap (28 Postgres backends in total) at peak; the extra headroom trims the p99
  tail. Postgres (2 cores / 512 MiB) handles it.
- docker log volume: a json-file rotation default (10m x 3 = 30 MiB/container) is now
  applied contour-wide via a YAML anchor; the backend logs ~14 MiB / 30 min at info under
  load, and its log volume was previously unbounded. Log level stays info.

backend/postgres stay at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores respectively — headroom
is cheap on the shared host). A validation re-run will confirm the gateway fix before merge.
This commit is contained in:
Ilia Denisov
2026-06-11 10:33:58 +02:00
parent 8eee018728
commit f23da88028
+33 -6
View File
@@ -23,11 +23,22 @@
# (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`. # (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`.
name: scrabble name: scrabble
# Bound every container's json-file logs. R7 measured the backend emitting a
# per-request latency line at info (~14 MiB / 30 min under the 500-player stress
# peak); without rotation the volume grows unbounded. 10 MiB x 3 files caps each
# container at 30 MiB. Applied to every service via the *default-logging alias.
x-logging: &default-logging
driver: json-file
options:
max-size: "10m"
max-file: "3"
services: services:
postgres: postgres:
container_name: scrabble-postgres container_name: scrabble-postgres
image: postgres:17-alpine image: postgres:17-alpine
restart: unless-stopped restart: unless-stopped
logging: *default-logging
environment: environment:
POSTGRES_DB: ${POSTGRES_DB:-scrabble} POSTGRES_DB: ${POSTGRES_DB:-scrabble}
POSTGRES_USER: ${POSTGRES_USER:-scrabble} POSTGRES_USER: ${POSTGRES_USER:-scrabble}
@@ -57,12 +68,16 @@ services:
args: args:
DICT_VERSION: ${DICT_VERSION:-v1.0.0} DICT_VERSION: ${DICT_VERSION:-v1.0.0}
restart: unless-stopped restart: unless-stopped
logging: *default-logging
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
environment: environment:
# search_path=backend matches the migrations (00001 creates the schema). # search_path=backend matches the migrations (00001 creates the schema).
BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend
# R7 tuned: the pool sat at its 25-conn cap (28 backends total) at 500 players;
# 40 gives headroom for bursts. Postgres (2 cores / 512 MiB) handles it.
BACKEND_POSTGRES_MAX_OPEN_CONNS: "40"
BACKEND_HTTP_ADDR: ":8080" BACKEND_HTTP_ADDR: ":8080"
BACKEND_GRPC_ADDR: ":9090" BACKEND_GRPC_ADDR: ":9090"
BACKEND_CONNECTOR_ADDR: telegram:9091 BACKEND_CONNECTOR_ADDR: telegram:9091
@@ -102,6 +117,7 @@ services:
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
VITE_APP_VERSION: ${APP_VERSION:-dev} VITE_APP_VERSION: ${APP_VERSION:-dev}
restart: unless-stopped restart: unless-stopped
logging: *default-logging
depends_on: [backend] depends_on: [backend]
environment: environment:
GATEWAY_HTTP_ADDR: ":8081" GATEWAY_HTTP_ADDR: ":8081"
@@ -116,15 +132,16 @@ services:
OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
OTEL_EXPORTER_OTLP_INSECURE: "true" OTEL_EXPORTER_OTLP_INSECURE: "true"
# GOMAXPROCS matches the CPU limit below (see backend). # GOMAXPROCS matches the CPU limit below (see backend).
GOMAXPROCS: "2" GOMAXPROCS: "3"
# GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front # GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front
# caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly. # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly.
# R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after # R7 tuned: the gateway holds one h2c connection per player, so at 500 players it
# the final stress run. # bursts into a 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs
# the bursts. Per-connection overhead is the realistic prod cost — size for it.
deploy: deploy:
resources: resources:
limits: limits:
cpus: "2.0" cpus: "3.0"
memory: 512M memory: 512M
networks: [internal] networks: [internal]
@@ -148,6 +165,7 @@ services:
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
VITE_APP_VERSION: ${APP_VERSION:-dev} VITE_APP_VERSION: ${APP_VERSION:-dev}
restart: unless-stopped restart: unless-stopped
logging: *default-logging
deploy: deploy:
resources: resources:
limits: limits:
@@ -159,6 +177,7 @@ services:
container_name: scrabble-telegram-vpn container_name: scrabble-telegram-vpn
image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest
restart: unless-stopped restart: unless-stopped
logging: *default-logging
privileged: true privileged: true
environment: environment:
AWG_CONF: ${AWG_CONF:?set AWG_CONF} AWG_CONF: ${AWG_CONF:?set AWG_CONF}
@@ -173,6 +192,7 @@ services:
context: .. context: ..
dockerfile: platform/telegram/Dockerfile dockerfile: platform/telegram/Dockerfile
restart: unless-stopped restart: unless-stopped
logging: *default-logging
depends_on: [vpn] depends_on: [vpn]
network_mode: "service:vpn" network_mode: "service:vpn"
environment: environment:
@@ -212,6 +232,7 @@ services:
container_name: scrabble-caddy container_name: scrabble-caddy
image: caddy:2-alpine image: caddy:2-alpine
restart: unless-stopped restart: unless-stopped
logging: *default-logging
depends_on: [gateway, backend, grafana, landing] depends_on: [gateway, backend, grafana, landing]
environment: environment:
# Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME. # Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME.
@@ -235,6 +256,7 @@ services:
container_name: scrabble-otelcol container_name: scrabble-otelcol
image: otel/opentelemetry-collector-contrib:0.119.0 image: otel/opentelemetry-collector-contrib:0.119.0
restart: unless-stopped restart: unless-stopped
logging: *default-logging
command: ["--config=/etc/otelcol/config.yaml"] command: ["--config=/etc/otelcol/config.yaml"]
# The docker_stats receiver reads per-container metrics from the Docker API, so the # The docker_stats receiver reads per-container metrics from the Docker API, so the
# collector (image UID 10001) joins the host's docker group to read the socket — # collector (image UID 10001) joins the host's docker group to read the socket —
@@ -255,6 +277,7 @@ services:
container_name: scrabble-prometheus container_name: scrabble-prometheus
image: prom/prometheus:v2.55.1 image: prom/prometheus:v2.55.1
restart: unless-stopped restart: unless-stopped
logging: *default-logging
command: command:
- --config.file=/etc/prometheus/prometheus.yml - --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.retention.time=15d - --storage.tsdb.retention.time=15d
@@ -271,21 +294,24 @@ services:
container_name: scrabble-tempo container_name: scrabble-tempo
image: grafana/tempo:2.7.1 image: grafana/tempo:2.7.1
restart: unless-stopped restart: unless-stopped
logging: *default-logging
command: ["-config.file=/etc/tempo/tempo.yaml"] command: ["-config.file=/etc/tempo/tempo.yaml"]
volumes: volumes:
- ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
- tempo-data:/var/tempo - tempo-data:/var/tempo
# tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run. # R7 tuned: tempo reached the 1 GiB cap during the final run (446 MiB in R2);
# raised to 2 GiB for headroom against OOM under sustained tracing load.
deploy: deploy:
resources: resources:
limits: limits:
memory: 1G memory: 2G
networks: [internal] networks: [internal]
grafana: grafana:
container_name: scrabble-grafana container_name: scrabble-grafana
image: grafana/grafana:11.4.0 image: grafana/grafana:11.4.0
restart: unless-stopped restart: unless-stopped
logging: *default-logging
depends_on: [prometheus, tempo] depends_on: [prometheus, tempo]
environment: environment:
# Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a # Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a
@@ -322,6 +348,7 @@ services:
container_name: scrabble-postgres-exporter container_name: scrabble-postgres-exporter
image: prometheuscommunity/postgres-exporter:v0.16.0 image: prometheuscommunity/postgres-exporter:v0.16.0
restart: unless-stopped restart: unless-stopped
logging: *default-logging
depends_on: [postgres] depends_on: [postgres]
environment: environment:
DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable