R7: apply the agreed tuning from the final stress run
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 8s
CI / integration (pull_request) Successful in 12s
CI / ui (pull_request) Successful in 36s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Successful in 1m23s
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 8s
CI / integration (pull_request) Successful in 12s
CI / ui (pull_request) Successful in 36s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Successful in 1m23s
Round-2 tuning, decided from the 500-player resource profile:

- gateway: 2 -> 3 cores + GOMAXPROCS=3. It holds one h2c connection per player, so at 500 players it burst into the 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs the bursts. The per-connection cost is the realistic prod load.
- tempo: memory 1G -> 2G. It reached the 1 GiB cap during the run (OOM risk).
- backend Postgres pool: MAX_OPEN_CONNS 25 -> 40. The pool sat at its 25-conn cap (28 backends) at peak; headroom trims the p99 tail. Postgres (2c/512M) handles it.
- docker log volume: a json-file rotation default (10m x 3 = 30 MiB/container) applied contour-wide via a YAML anchor; the backend logs ~14 MiB / 30 min at info under load and was previously unbounded. Log level stays info.

backend/postgres stay at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores — headroom is cheap on the shared host). A validation re-run confirms the gateway fix before merge.
This commit is contained in:
@@ -23,11 +23,22 @@
|
||||
# (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`.
|
||||
name: scrabble
|
||||
|
||||
# Bound every container's json-file logs. R7 measured the backend emitting a
|
||||
# per-request latency line at info (~14 MiB / 30 min under the 500-player stress
|
||||
# peak); without rotation the volume grows unbounded. 10 MiB x 3 files caps each
|
||||
# container at 30 MiB. Applied to every service via the *default-logging alias.
|
||||
x-logging: &default-logging
|
||||
driver: json-file
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
services:
|
||||
postgres:
|
||||
container_name: scrabble-postgres
|
||||
image: postgres:17-alpine
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
environment:
|
||||
POSTGRES_DB: ${POSTGRES_DB:-scrabble}
|
||||
POSTGRES_USER: ${POSTGRES_USER:-scrabble}
|
||||
@@ -57,12 +68,16 @@ services:
|
||||
args:
|
||||
DICT_VERSION: ${DICT_VERSION:-v1.0.0}
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
# search_path=backend matches the migrations (00001 creates the schema).
|
||||
BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend
|
||||
# R7 tuned: the pool sat at its 25-conn cap (28 backends total) at 500 players;
|
||||
# 40 gives headroom for bursts. Postgres (2 cores / 512 MiB) handles it.
|
||||
BACKEND_POSTGRES_MAX_OPEN_CONNS: "40"
|
||||
BACKEND_HTTP_ADDR: ":8080"
|
||||
BACKEND_GRPC_ADDR: ":9090"
|
||||
BACKEND_CONNECTOR_ADDR: telegram:9091
|
||||
@@ -102,6 +117,7 @@ services:
|
||||
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
|
||||
VITE_APP_VERSION: ${APP_VERSION:-dev}
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
depends_on: [backend]
|
||||
environment:
|
||||
GATEWAY_HTTP_ADDR: ":8081"
|
||||
@@ -116,15 +132,16 @@ services:
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
# GOMAXPROCS matches the CPU limit below (see backend).
|
||||
GOMAXPROCS: "2"
|
||||
GOMAXPROCS: "3"
|
||||
# GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front
|
||||
# caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly.
|
||||
# R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after
|
||||
# the final stress run.
|
||||
# R7 tuned: the gateway holds one h2c connection per player, so at 500 players it
|
||||
# bursts into a 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs
|
||||
# the bursts. Per-connection overhead is the realistic prod cost — size for it.
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "2.0"
|
||||
cpus: "3.0"
|
||||
memory: 512M
|
||||
networks: [internal]
|
||||
|
||||
@@ -148,6 +165,7 @@ services:
|
||||
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
|
||||
VITE_APP_VERSION: ${APP_VERSION:-dev}
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
@@ -159,6 +177,7 @@ services:
|
||||
container_name: scrabble-telegram-vpn
|
||||
image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
privileged: true
|
||||
environment:
|
||||
AWG_CONF: ${AWG_CONF:?set AWG_CONF}
|
||||
@@ -173,6 +192,7 @@ services:
|
||||
context: ..
|
||||
dockerfile: platform/telegram/Dockerfile
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
depends_on: [vpn]
|
||||
network_mode: "service:vpn"
|
||||
environment:
|
||||
@@ -212,6 +232,7 @@ services:
|
||||
container_name: scrabble-caddy
|
||||
image: caddy:2-alpine
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
depends_on: [gateway, backend, grafana, landing]
|
||||
environment:
|
||||
# Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME.
|
||||
@@ -235,6 +256,7 @@ services:
|
||||
container_name: scrabble-otelcol
|
||||
image: otel/opentelemetry-collector-contrib:0.119.0
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
command: ["--config=/etc/otelcol/config.yaml"]
|
||||
# The docker_stats receiver reads per-container metrics from the Docker API, so the
|
||||
# collector (image UID 10001) joins the host's docker group to read the socket —
|
||||
@@ -255,6 +277,7 @@ services:
|
||||
container_name: scrabble-prometheus
|
||||
image: prom/prometheus:v2.55.1
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
command:
|
||||
- --config.file=/etc/prometheus/prometheus.yml
|
||||
- --storage.tsdb.retention.time=15d
|
||||
@@ -271,21 +294,24 @@ services:
|
||||
container_name: scrabble-tempo
|
||||
image: grafana/tempo:2.7.1
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
command: ["-config.file=/etc/tempo/tempo.yaml"]
|
||||
volumes:
|
||||
- ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
|
||||
- tempo-data:/var/tempo
|
||||
# tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run.
|
||||
# R7 tuned: tempo reached the 1 GiB cap during the final run (446 MiB in R2);
|
||||
# raised to 2 GiB for headroom against OOM under sustained tracing load.
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
memory: 2G
|
||||
networks: [internal]
|
||||
|
||||
grafana:
|
||||
container_name: scrabble-grafana
|
||||
image: grafana/grafana:11.4.0
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
depends_on: [prometheus, tempo]
|
||||
environment:
|
||||
# Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a
|
||||
@@ -322,6 +348,7 @@ services:
|
||||
container_name: scrabble-postgres-exporter
|
||||
image: prometheuscommunity/postgres-exporter:v0.16.0
|
||||
restart: unless-stopped
|
||||
logging: *default-logging
|
||||
depends_on: [postgres]
|
||||
environment:
|
||||
DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable
|
||||
|
||||
Reference in New Issue
Block a user