R7: apply the agreed tuning from the final stress run

Round-2 tuning, decided from the 500-player resource profile:

- gateway: 2 -> 3 cores + GOMAXPROCS=3. It holds one h2c connection per player, so at 500 players it burst into the 2-core cap (~2.49% transport_error on game.state); 3 cores absorb the bursts. The per-connection cost is the realistic prod load.
- tempo: memory 1G -> 2G. It reached the 1 GiB cap during the run (OOM risk).
- backend Postgres pool: BACKEND_POSTGRES_MAX_OPEN_CONNS 25 -> 40. The pool sat at its 25-conn cap (28 backends) at peak; headroom trims the p99 tail. Postgres (2c/512M) handles it.
- docker log volume: a json-file rotation default (10m x 3 = 30 MiB/container) applied contour-wide via a YAML anchor; the backend logs ~14 MiB / 30 min at info under load, and its log volume was previously unbounded. Log level stays info.

backend/postgres stay at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores — headroom is cheap on the shared host). A validation re-run confirms the gateway fix before merge.
2026-06-11 10:33:58 +02:00
parent 8eee018728
commit f23da88028
1 changed file with 33 additions and 6 deletions
@@ -23,11 +23,22 @@
 #     (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`.
 name: scrabble

+# Bound every container's json-file logs. R7 measured the backend emitting a
+# per-request latency line at info (~14 MiB / 30 min under the 500-player stress
+# peak); without rotation the volume grows unbounded. 10 MiB x 3 files caps each
+# container at 30 MiB. Applied to every service via the *default-logging alias.
+x-logging: &default-logging
+  driver: json-file
+  options:
+    max-size: "10m"
+    max-file: "3"
+
 services:
  postgres:
    container_name: scrabble-postgres
    image: postgres:17-alpine
    restart: unless-stopped
+    logging: *default-logging
    environment:
      POSTGRES_DB: ${POSTGRES_DB:-scrabble}
      POSTGRES_USER: ${POSTGRES_USER:-scrabble}
@@ -57,12 +68,16 @@ services:
      args:
        DICT_VERSION: ${DICT_VERSION:-v1.0.0}
    restart: unless-stopped
+    logging: *default-logging
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      # search_path=backend matches the migrations (00001 creates the schema).
      BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend
+      # R7 tuned: the pool sat at its 25-conn cap (28 backends total) at 500 players;
+      # 40 gives headroom for bursts. Postgres (2 cores / 512 MiB) handles it.
+      BACKEND_POSTGRES_MAX_OPEN_CONNS: "40"
      BACKEND_HTTP_ADDR: ":8080"
      BACKEND_GRPC_ADDR: ":9090"
      BACKEND_CONNECTOR_ADDR: telegram:9091
@@ -102,6 +117,7 @@ services:
        VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
        VITE_APP_VERSION: ${APP_VERSION:-dev}
    restart: unless-stopped
+    logging: *default-logging
    depends_on: [backend]
    environment:
      GATEWAY_HTTP_ADDR: ":8081"
@@ -116,15 +132,16 @@ services:
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
      # GOMAXPROCS matches the CPU limit below (see backend).
-      GOMAXPROCS: "2"
+      GOMAXPROCS: "3"
      # GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front
      # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly.
-    # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after
-    # the final stress run.
+    # R7 tuned: the gateway holds one h2c connection per player, so at 500 players it
+    # burst into the previous 2-core cap (~2.49% transport_error on game.state); 3 cores
+    # absorb the bursts. Per-connection overhead is the realistic prod cost — size for it.
    deploy:
      resources:
        limits:
-          cpus: "2.0"
+          cpus: "3.0"
          memory: 512M
    networks: [internal]

@@ -148,6 +165,7 @@ services:
        VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
        VITE_APP_VERSION: ${APP_VERSION:-dev}
    restart: unless-stopped
+    logging: *default-logging
    deploy:
      resources:
        limits:
@@ -159,6 +177,7 @@ services:
    container_name: scrabble-telegram-vpn
    image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest
    restart: unless-stopped
+    logging: *default-logging
    privileged: true
    environment:
      AWG_CONF: ${AWG_CONF:?set AWG_CONF}
@@ -173,6 +192,7 @@ services:
      context: ..
      dockerfile: platform/telegram/Dockerfile
    restart: unless-stopped
+    logging: *default-logging
    depends_on: [vpn]
    network_mode: "service:vpn"
    environment:
@@ -212,6 +232,7 @@ services:
    container_name: scrabble-caddy
    image: caddy:2-alpine
    restart: unless-stopped
+    logging: *default-logging
    depends_on: [gateway, backend, grafana, landing]
    environment:
      # Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME.
@@ -235,6 +256,7 @@ services:
    container_name: scrabble-otelcol
    image: otel/opentelemetry-collector-contrib:0.119.0
    restart: unless-stopped
+    logging: *default-logging
    command: ["--config=/etc/otelcol/config.yaml"]
    # The docker_stats receiver reads per-container metrics from the Docker API, so the
    # collector (image UID 10001) joins the host's docker group to read the socket —
@@ -255,6 +277,7 @@ services:
    container_name: scrabble-prometheus
    image: prom/prometheus:v2.55.1
    restart: unless-stopped
+    logging: *default-logging
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.retention.time=15d
@@ -271,21 +294,24 @@ services:
    container_name: scrabble-tempo
    image: grafana/tempo:2.7.1
    restart: unless-stopped
+    logging: *default-logging
    command: ["-config.file=/etc/tempo/tempo.yaml"]
    volumes:
      - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
      - tempo-data:/var/tempo
-    # tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run.
+    # R7 tuned: tempo reached the 1 GiB cap during the final run (446 MiB in R2);
+    # raised to 2 GiB for headroom against OOM under sustained tracing load.
    deploy:
      resources:
        limits:
-          memory: 1G
+          memory: 2G
    networks: [internal]

  grafana:
    container_name: scrabble-grafana
    image: grafana/grafana:11.4.0
    restart: unless-stopped
+    logging: *default-logging
    depends_on: [prometheus, tempo]
    environment:
      # Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a
@@ -322,6 +348,7 @@ services:
    container_name: scrabble-postgres-exporter
    image: prometheuscommunity/postgres-exporter:v0.16.0
    restart: unless-stopped
+    logging: *default-logging
    depends_on: [postgres]
    environment:
      DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable