From f23da88028bf64d619f54fcddf44ba4234901373 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Thu, 11 Jun 2026 10:33:58 +0200 Subject: [PATCH] R7: apply the agreed tuning from the final stress run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 tuning, decided from the 500-player resource profile: - gateway: 2 -> 3 cores + GOMAXPROCS=3. It holds one h2c connection per player, so at 500 players it burst into the 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs the bursts. The per-connection cost is the realistic prod load. - tempo: memory 1G -> 2G. It reached the 1 GiB cap during the run (OOM risk). - backend Postgres pool: MAX_OPEN_CONNS 25 -> 40. The pool sat at its 25-conn cap (28 backends) at peak; headroom trims the p99 tail. Postgres (2c/512M) handles it. - docker log volume: a json-file rotation default (10m x 3 = 30 MiB/container) applied contour-wide via a YAML anchor; the backend logs ~14 MiB / 30 min at info under load and was previously unbounded. Log level stays info. backend/postgres stay at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores — headroom is cheap on the shared host). A validation re-run confirms the gateway fix before merge. --- deploy/docker-compose.yml | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 038c156..5702f17 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -23,11 +23,22 @@ # (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`. name: scrabble +# Bound every container's json-file logs. R7 measured the backend emitting a +# per-request latency line at info (~14 MiB / 30 min under the 500-player stress +# peak); without rotation the volume grows unbounded. 10 MiB x 3 files caps each +# container at 30 MiB. Applied to every service via the *default-logging alias. +x-logging: &default-logging + driver: json-file + options: + max-size: "10m" + max-file: "3" + services: postgres: container_name: scrabble-postgres image: postgres:17-alpine restart: unless-stopped + logging: *default-logging environment: POSTGRES_DB: ${POSTGRES_DB:-scrabble} POSTGRES_USER: ${POSTGRES_USER:-scrabble} @@ -57,12 +68,16 @@ services: args: DICT_VERSION: ${DICT_VERSION:-v1.0.0} restart: unless-stopped + logging: *default-logging depends_on: postgres: condition: service_healthy environment: # search_path=backend matches the migrations (00001 creates the schema). BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend + # R7 tuned: the pool sat at its 25-conn cap (28 backends total) at 500 players; + # 40 gives headroom for bursts. Postgres (2 cores / 512 MiB) handles it. + BACKEND_POSTGRES_MAX_OPEN_CONNS: "40" BACKEND_HTTP_ADDR: ":8080" BACKEND_GRPC_ADDR: ":9090" BACKEND_CONNECTOR_ADDR: telegram:9091 @@ -102,6 +117,7 @@ services: VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} VITE_APP_VERSION: ${APP_VERSION:-dev} restart: unless-stopped + logging: *default-logging depends_on: [backend] environment: GATEWAY_HTTP_ADDR: ":8081" @@ -116,15 +132,16 @@ services: OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 OTEL_EXPORTER_OTLP_INSECURE: "true" # GOMAXPROCS matches the CPU limit below (see backend). - GOMAXPROCS: "2" + GOMAXPROCS: "3" # GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly. - # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after - # the final stress run. + # R7 tuned: the gateway holds one h2c connection per player, so at 500 players it + # bursts into a 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs + # the bursts. Per-connection overhead is the realistic prod cost — size for it. deploy: resources: limits: - cpus: "2.0" + cpus: "3.0" memory: 512M networks: [internal] @@ -148,6 +165,7 @@ services: VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} VITE_APP_VERSION: ${APP_VERSION:-dev} restart: unless-stopped + logging: *default-logging deploy: resources: limits: @@ -159,6 +177,7 @@ services: container_name: scrabble-telegram-vpn image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest restart: unless-stopped + logging: *default-logging privileged: true environment: AWG_CONF: ${AWG_CONF:?set AWG_CONF} @@ -173,6 +192,7 @@ services: context: .. dockerfile: platform/telegram/Dockerfile restart: unless-stopped + logging: *default-logging depends_on: [vpn] network_mode: "service:vpn" environment: @@ -212,6 +232,7 @@ services: container_name: scrabble-caddy image: caddy:2-alpine restart: unless-stopped + logging: *default-logging depends_on: [gateway, backend, grafana, landing] environment: # Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME. @@ -235,6 +256,7 @@ services: container_name: scrabble-otelcol image: otel/opentelemetry-collector-contrib:0.119.0 restart: unless-stopped + logging: *default-logging command: ["--config=/etc/otelcol/config.yaml"] # The docker_stats receiver reads per-container metrics from the Docker API, so the # collector (image UID 10001) joins the host's docker group to read the socket — @@ -255,6 +277,7 @@ services: container_name: scrabble-prometheus image: prom/prometheus:v2.55.1 restart: unless-stopped + logging: *default-logging command: - --config.file=/etc/prometheus/prometheus.yml - --storage.tsdb.retention.time=15d @@ -271,21 +294,24 @@ services: container_name: scrabble-tempo image: grafana/tempo:2.7.1 restart: unless-stopped + logging: *default-logging command: ["-config.file=/etc/tempo/tempo.yaml"] volumes: - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro - tempo-data:/var/tempo - # tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run. + # R7 tuned: tempo reached the 1 GiB cap during the final run (446 MiB in R2); + # raised to 2 GiB for headroom against OOM under sustained tracing load. deploy: resources: limits: - memory: 1G + memory: 2G networks: [internal] grafana: container_name: scrabble-grafana image: grafana/grafana:11.4.0 restart: unless-stopped + logging: *default-logging depends_on: [prometheus, tempo] environment: # Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a @@ -322,6 +348,7 @@ services: container_name: scrabble-postgres-exporter image: prometheuscommunity/postgres-exporter:v0.16.0 restart: unless-stopped + logging: *default-logging depends_on: [postgres] environment: DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable