R7: apply the agreed tuning from the final stress run
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 8s
CI / integration (pull_request) Successful in 12s
CI / ui (pull_request) Successful in 36s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Successful in 1m23s
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 8s
CI / integration (pull_request) Successful in 12s
CI / ui (pull_request) Successful in 36s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Successful in 1m23s
Round-2 tuning, decided from the 500-player resource profile:

- gateway: 2 -> 3 cores + GOMAXPROCS=3. It holds one h2c connection per player, so at 500 players it burst into the 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs the bursts. The per-connection cost is the realistic prod load.
- tempo: memory 1G -> 2G. It reached the 1 GiB cap during the run (OOM risk).
- backend Postgres pool: MAX_OPEN_CONNS 25 -> 40. The pool sat at its 25-conn cap (28 backends) at peak; headroom trims the p99 tail. Postgres (2c/512M) handles it.
- docker log volume: a json-file rotation default (10m x 3 = 30 MiB/container) applied contour-wide via a YAML anchor; the backend logs ~14 MiB / 30 min at info under load and was previously unbounded. Log level stays info.

backend/postgres stay at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores — headroom is cheap on the shared host).

A validation re-run confirms the gateway fix before merge.
This commit is contained in:
@@ -23,11 +23,22 @@
|
|||||||
# (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`.
|
# (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`.
|
||||||
name: scrabble
|
name: scrabble
|
||||||
|
|
||||||
|
# Bound every container's json-file logs. R7 measured the backend emitting a
|
||||||
|
# per-request latency line at info (~14 MiB / 30 min under the 500-player stress
|
||||||
|
# peak); without rotation the volume grows unbounded. 10 MiB x 3 files caps each
|
||||||
|
# container at 30 MiB. Applied to every service via the *default-logging alias.
|
||||||
|
x-logging: &default-logging
|
||||||
|
driver: json-file
|
||||||
|
options:
|
||||||
|
max-size: "10m"
|
||||||
|
max-file: "3"
|
||||||
|
|
||||||
services:
|
services:
|
||||||
postgres:
|
postgres:
|
||||||
container_name: scrabble-postgres
|
container_name: scrabble-postgres
|
||||||
image: postgres:17-alpine
|
image: postgres:17-alpine
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
environment:
|
environment:
|
||||||
POSTGRES_DB: ${POSTGRES_DB:-scrabble}
|
POSTGRES_DB: ${POSTGRES_DB:-scrabble}
|
||||||
POSTGRES_USER: ${POSTGRES_USER:-scrabble}
|
POSTGRES_USER: ${POSTGRES_USER:-scrabble}
|
||||||
@@ -57,12 +68,16 @@ services:
|
|||||||
args:
|
args:
|
||||||
DICT_VERSION: ${DICT_VERSION:-v1.0.0}
|
DICT_VERSION: ${DICT_VERSION:-v1.0.0}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
environment:
|
environment:
|
||||||
# search_path=backend matches the migrations (00001 creates the schema).
|
# search_path=backend matches the migrations (00001 creates the schema).
|
||||||
BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend
|
BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend
|
||||||
|
# R7 tuned: the pool sat at its 25-conn cap (28 backends total) at 500 players;
|
||||||
|
# 40 gives headroom for bursts. Postgres (2 cores / 512 MiB) handles it.
|
||||||
|
BACKEND_POSTGRES_MAX_OPEN_CONNS: "40"
|
||||||
BACKEND_HTTP_ADDR: ":8080"
|
BACKEND_HTTP_ADDR: ":8080"
|
||||||
BACKEND_GRPC_ADDR: ":9090"
|
BACKEND_GRPC_ADDR: ":9090"
|
||||||
BACKEND_CONNECTOR_ADDR: telegram:9091
|
BACKEND_CONNECTOR_ADDR: telegram:9091
|
||||||
@@ -102,6 +117,7 @@ services:
|
|||||||
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
|
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
|
||||||
VITE_APP_VERSION: ${APP_VERSION:-dev}
|
VITE_APP_VERSION: ${APP_VERSION:-dev}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
depends_on: [backend]
|
depends_on: [backend]
|
||||||
environment:
|
environment:
|
||||||
GATEWAY_HTTP_ADDR: ":8081"
|
GATEWAY_HTTP_ADDR: ":8081"
|
||||||
@@ -116,15 +132,16 @@ services:
|
|||||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
|
OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317
|
||||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||||
# GOMAXPROCS matches the CPU limit below (see backend).
|
# GOMAXPROCS matches the CPU limit below (see backend).
|
||||||
GOMAXPROCS: "2"
|
GOMAXPROCS: "3"
|
||||||
# GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front
|
# GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front
|
||||||
# caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly.
|
# caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly.
|
||||||
# R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after
|
# R7 tuned: the gateway holds one h2c connection per player, so at 500 players it
|
||||||
# the final stress run.
|
# bursts into a 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs
|
||||||
|
# the bursts. Per-connection overhead is the realistic prod cost — size for it.
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
cpus: "2.0"
|
cpus: "3.0"
|
||||||
memory: 512M
|
memory: 512M
|
||||||
networks: [internal]
|
networks: [internal]
|
||||||
|
|
||||||
@@ -148,6 +165,7 @@ services:
|
|||||||
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
|
VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-}
|
||||||
VITE_APP_VERSION: ${APP_VERSION:-dev}
|
VITE_APP_VERSION: ${APP_VERSION:-dev}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
@@ -159,6 +177,7 @@ services:
|
|||||||
container_name: scrabble-telegram-vpn
|
container_name: scrabble-telegram-vpn
|
||||||
image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest
|
image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
privileged: true
|
privileged: true
|
||||||
environment:
|
environment:
|
||||||
AWG_CONF: ${AWG_CONF:?set AWG_CONF}
|
AWG_CONF: ${AWG_CONF:?set AWG_CONF}
|
||||||
@@ -173,6 +192,7 @@ services:
|
|||||||
context: ..
|
context: ..
|
||||||
dockerfile: platform/telegram/Dockerfile
|
dockerfile: platform/telegram/Dockerfile
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
depends_on: [vpn]
|
depends_on: [vpn]
|
||||||
network_mode: "service:vpn"
|
network_mode: "service:vpn"
|
||||||
environment:
|
environment:
|
||||||
@@ -212,6 +232,7 @@ services:
|
|||||||
container_name: scrabble-caddy
|
container_name: scrabble-caddy
|
||||||
image: caddy:2-alpine
|
image: caddy:2-alpine
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
depends_on: [gateway, backend, grafana, landing]
|
depends_on: [gateway, backend, grafana, landing]
|
||||||
environment:
|
environment:
|
||||||
# Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME.
|
# Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME.
|
||||||
@@ -235,6 +256,7 @@ services:
|
|||||||
container_name: scrabble-otelcol
|
container_name: scrabble-otelcol
|
||||||
image: otel/opentelemetry-collector-contrib:0.119.0
|
image: otel/opentelemetry-collector-contrib:0.119.0
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
command: ["--config=/etc/otelcol/config.yaml"]
|
command: ["--config=/etc/otelcol/config.yaml"]
|
||||||
# The docker_stats receiver reads per-container metrics from the Docker API, so the
|
# The docker_stats receiver reads per-container metrics from the Docker API, so the
|
||||||
# collector (image UID 10001) joins the host's docker group to read the socket —
|
# collector (image UID 10001) joins the host's docker group to read the socket —
|
||||||
@@ -255,6 +277,7 @@ services:
|
|||||||
container_name: scrabble-prometheus
|
container_name: scrabble-prometheus
|
||||||
image: prom/prometheus:v2.55.1
|
image: prom/prometheus:v2.55.1
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
command:
|
command:
|
||||||
- --config.file=/etc/prometheus/prometheus.yml
|
- --config.file=/etc/prometheus/prometheus.yml
|
||||||
- --storage.tsdb.retention.time=15d
|
- --storage.tsdb.retention.time=15d
|
||||||
@@ -271,21 +294,24 @@ services:
|
|||||||
container_name: scrabble-tempo
|
container_name: scrabble-tempo
|
||||||
image: grafana/tempo:2.7.1
|
image: grafana/tempo:2.7.1
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
command: ["-config.file=/etc/tempo/tempo.yaml"]
|
command: ["-config.file=/etc/tempo/tempo.yaml"]
|
||||||
volumes:
|
volumes:
|
||||||
- ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
|
- ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
|
||||||
- tempo-data:/var/tempo
|
- tempo-data:/var/tempo
|
||||||
# tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run.
|
# R7 tuned: tempo reached the 1 GiB cap during the final run (446 MiB in R2);
|
||||||
|
# raised to 2 GiB for headroom against OOM under sustained tracing load.
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
memory: 1G
|
memory: 2G
|
||||||
networks: [internal]
|
networks: [internal]
|
||||||
|
|
||||||
grafana:
|
grafana:
|
||||||
container_name: scrabble-grafana
|
container_name: scrabble-grafana
|
||||||
image: grafana/grafana:11.4.0
|
image: grafana/grafana:11.4.0
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
depends_on: [prometheus, tempo]
|
depends_on: [prometheus, tempo]
|
||||||
environment:
|
environment:
|
||||||
# Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a
|
# Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a
|
||||||
@@ -322,6 +348,7 @@ services:
|
|||||||
container_name: scrabble-postgres-exporter
|
container_name: scrabble-postgres-exporter
|
||||||
image: prometheuscommunity/postgres-exporter:v0.16.0
|
image: prometheuscommunity/postgres-exporter:v0.16.0
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
logging: *default-logging
|
||||||
depends_on: [postgres]
|
depends_on: [postgres]
|
||||||
environment:
|
environment:
|
||||||
DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable
|
DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable
|
||||||
|
|||||||
Reference in New Issue
Block a user