From 04263a17cae2091845719bd64a9e027ee6083ee3 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Wed, 10 Jun 2026 18:53:07 +0200 Subject: [PATCH 1/6] R7: per-player transports + drop finished games in the load harness Each virtual player now builds its own edge.Client (its own h2c connection carrying both the Subscribe stream and the Execute calls), instead of every player multiplexing over a single shared http2.Transport. The R2 trip report traced the ~14% transport_error on game.state at 500 players to that single shared transport; per-player connections mirror real clients and isolate the artifact. The assembly burst and the gateway-hammer each get their own client. playTurn now reports when a game has finished so playerLoop drops it from the rotation (slices.DeleteFunc); once no active game remains the player idles while still holding its stream. This stops secondary ops from hammering game_finished on already-ended games (the other R2 harness finding). --- loadtest/cmd/loadtest/main.go | 3 +- loadtest/internal/edge/client.go | 10 +- loadtest/internal/scenario/assemble.go | 13 +-- loadtest/internal/scenario/hammer.go | 4 +- loadtest/internal/scenario/scenario.go | 130 +++++++++++++++---------- 5 files changed, 95 insertions(+), 65 deletions(-) diff --git a/loadtest/cmd/loadtest/main.go b/loadtest/cmd/loadtest/main.go index 9d58798..aaec899 100644 --- a/loadtest/cmd/loadtest/main.go +++ b/loadtest/cmd/loadtest/main.go @@ -24,7 +24,6 @@ import ( "syscall" "time" - "scrabble/loadtest/internal/edge" "scrabble/loadtest/internal/moves" "scrabble/loadtest/internal/report" "scrabble/loadtest/internal/scenario" @@ -114,7 +113,7 @@ func cmdRun(ctx context.Context, log *slog.Logger, args []string) error { log.Info("seeded", "durable", len(pool.Durables), "guest", len(pool.Guests)) rec := report.New() - drv := scenario.NewDriver(edge.New(*gateway), reg, rec, log) + drv := scenario.NewDriver(*gateway, reg, rec, log) cfg := scenario.RealisticConfig{ Steps: steps, StepDur: 
*stepDur, GamesPerPlayer: *gpp, Tick: *tick, SecondaryProb: *secProb, diff --git a/loadtest/internal/edge/client.go b/loadtest/internal/edge/client.go index a40ec8d..9b7d41b 100644 --- a/loadtest/internal/edge/client.go +++ b/loadtest/internal/edge/client.go @@ -41,16 +41,18 @@ const ( msgEnqueue = "lobby.enqueue" ) -// Client speaks the edge protocol to a single gateway base URL over h2c. It is safe -// for concurrent use by many virtual players (the underlying http2.Transport pools -// and multiplexes connections). +// Client speaks the edge protocol to a single gateway base URL over h2c. The harness +// builds one Client per virtual player, so each player owns its h2c connection (its +// Subscribe stream and Execute calls share it) the way a real client does; a single +// Client is safe for that player's own concurrent goroutines. type Client struct { rpc edgev1connect.GatewayClient } // New builds a Client for baseURL (for example http://gateway:8081). The transport // speaks HTTP/2 cleartext (h2c) to match the gateway, dialling plaintext TCP rather -// than TLS. +// than TLS. Each virtual player gets its own Client (hence its own connection), so the +// load mirrors real clients instead of multiplexing every player over one transport. 
func New(baseURL string) *Client { hc := &http.Client{ Transport: &http2.Transport{ diff --git a/loadtest/internal/scenario/assemble.go b/loadtest/internal/scenario/assemble.go index 33584c6..7cc9ece 100644 --- a/loadtest/internal/scenario/assemble.go +++ b/loadtest/internal/scenario/assemble.go @@ -37,6 +37,7 @@ func (d *Driver) assembleCohort(ctx context.Context, cohort []seed.Account, game if len(cohort) < 2 { return nil } + c := edge.New(d.gateway) // one client for the assembly burst; players play on their own gamesOf := make(map[string]int, len(cohort)) var games []*Game for i := range cohort { @@ -51,7 +52,7 @@ func (d *Driver) assembleCohort(ctx context.Context, cohort []seed.Account, game break } variant := moves.Variants()[rng.Intn(len(moves.Variants()))] - g, err := d.assemble(ctx, members, variant) + g, err := d.assemble(ctx, c, members, variant) if err != nil { d.log.Debug("assemble game", "err", err) break @@ -85,7 +86,7 @@ func pickMembers(cohort []seed.Account, inviter seed.Account, rng *rand.Rand) [] // assemble runs the invitation flow for one game: the inviter (members[0]) invites // the rest, each invitee accepts the pending invitation, and the completing accept // starts the game, which is then located in the inviter's game list. 
-func (d *Driver) assemble(ctx context.Context, members []seed.Account, variant string) (*Game, error) { +func (d *Driver) assemble(ctx context.Context, c *edge.Client, members []seed.Account, variant string) (*Game, error) { inviter := members[0] inviteeIDs := make([]string, len(members)-1) for i, m := range members[1:] { @@ -93,7 +94,7 @@ func (d *Driver) assemble(ctx context.Context, members []seed.Account, variant s } t0 := time.Now() - code, err := d.edge.CreateInvitation(ctx, inviter.Token, inviteeIDs, variant) + code, err := c.CreateInvitation(ctx, inviter.Token, inviteeIDs, variant) d.rec.Record("invitation.create", code, time.Since(t0)) if err != nil || code != "ok" { return nil, fmt.Errorf("invitation.create: %s", code) @@ -101,7 +102,7 @@ func (d *Driver) assemble(ctx context.Context, members []seed.Account, variant s for _, invitee := range members[1:] { t0 = time.Now() - list, lc, err := d.edge.ListInvitations(ctx, invitee.Token) + list, lc, err := c.ListInvitations(ctx, invitee.Token) d.rec.Record("invitation.list", lc, time.Since(t0)) if err != nil || lc != "ok" { return nil, fmt.Errorf("invitation.list: %s", lc) @@ -111,7 +112,7 @@ func (d *Driver) assemble(ctx context.Context, members []seed.Account, variant s return nil, fmt.Errorf("no pending invitation from %s", inviter.ID) } t0 = time.Now() - ac, err := d.edge.AcceptInvitation(ctx, invitee.Token, invID) + ac, err := c.AcceptInvitation(ctx, invitee.Token, invID) d.rec.Record("invitation.accept", ac, time.Since(t0)) if err != nil || ac != "ok" { return nil, fmt.Errorf("invitation.accept: %s", ac) @@ -119,7 +120,7 @@ func (d *Driver) assemble(ctx context.Context, members []seed.Account, variant s } t0 = time.Now() - games, gc, err := d.edge.GamesList(ctx, inviter.Token) + games, gc, err := c.GamesList(ctx, inviter.Token) d.rec.Record("games.list", gc, time.Since(t0)) if err != nil || gc != "ok" { return nil, fmt.Errorf("games.list: %s", gc) diff --git a/loadtest/internal/scenario/hammer.go 
b/loadtest/internal/scenario/hammer.go index 18e51a6..a6be366 100644 --- a/loadtest/internal/scenario/hammer.go +++ b/loadtest/internal/scenario/hammer.go @@ -5,6 +5,7 @@ import ( "sync" "time" + "scrabble/loadtest/internal/edge" "scrabble/loadtest/internal/seed" ) @@ -29,6 +30,7 @@ func (d *Driver) Hammer(ctx context.Context, acc seed.Account, cfg HammerConfig) runCtx, cancel := context.WithTimeout(ctx, cfg.Duration) defer cancel() d.log.Info("gateway-hammer", "workers", cfg.Workers, "duration", cfg.Duration) + c := edge.New(d.gateway) var wg sync.WaitGroup for w := 0; w < cfg.Workers; w++ { wg.Add(1) @@ -36,7 +38,7 @@ func (d *Driver) Hammer(ctx context.Context, acc seed.Account, cfg HammerConfig) defer wg.Done() for runCtx.Err() == nil { t0 := time.Now() - _, code, _ := d.edge.GamesList(runCtx, acc.Token) + _, code, _ := c.GamesList(runCtx, acc.Token) d.rec.Record("hammer:games.list", code, time.Since(t0)) } }() diff --git a/loadtest/internal/scenario/scenario.go b/loadtest/internal/scenario/scenario.go index ccd964a..e86a8ca 100644 --- a/loadtest/internal/scenario/scenario.go +++ b/loadtest/internal/scenario/scenario.go @@ -9,6 +9,7 @@ import ( "context" "log/slog" "math/rand" + "slices" "sync" "time" @@ -18,18 +19,20 @@ import ( "scrabble/loadtest/internal/seed" ) -// Driver ties the edge client, the local move generator and the run recorder -// together. All three are safe for concurrent use by many player goroutines. +// Driver ties the gateway endpoint, the local move generator and the run recorder +// together. It builds one edge client per virtual player, so each player owns its +// h2c connection (its Subscribe stream and Execute calls share it) the way a real +// client does, rather than multiplexing every player over a single shared transport. type Driver struct { - edge *edge.Client - moves *moves.Registry - rec *report.Recorder - log *slog.Logger + gateway string // gateway base URL, e.g. 
http://gateway:8081 + moves *moves.Registry + rec *report.Recorder + log *slog.Logger } -// NewDriver builds a Driver. -func NewDriver(c *edge.Client, m *moves.Registry, rec *report.Recorder, log *slog.Logger) *Driver { - return &Driver{edge: c, moves: m, rec: rec, log: log} +// NewDriver builds a Driver targeting the gateway base URL. +func NewDriver(gateway string, m *moves.Registry, rec *report.Recorder, log *slog.Logger) *Driver { + return &Driver{gateway: gateway, moves: m, rec: rec, log: log} } // RealisticConfig parameterises the under-the-limit ramp. @@ -98,11 +101,16 @@ func (d *Driver) RunRealistic(ctx context.Context, pool *seed.Pool, cfg Realisti return nil } -// playerLoop runs one virtual player: a live-event subscription (loads the push hub, -// counts events) plus a round-robin turn loop over the player's games. +// playerLoop runs one virtual player over its own edge client (its own h2c +// connection): a live-event subscription (loads the push hub, counts events) plus a +// round-robin turn loop over the player's games. A game that has finished is dropped +// from the rotation so secondary ops stop hitting an ended game; once no active game +// remains the player idles, still holding its stream, until the run ends. 
func (d *Driver) playerLoop(ctx context.Context, p seed.Account, games []*Game, cfg RealisticConfig, rng *rand.Rand) { - go d.subscribeLoop(ctx, p) - if len(games) == 0 { + c := edge.New(d.gateway) + go d.subscribeLoop(ctx, c, p) + active := games + if len(active) == 0 { <-ctx.Done() return } @@ -114,22 +122,30 @@ func (d *Driver) playerLoop(ctx context.Context, p seed.Account, games []*Game, case <-ctx.Done(): return case <-ticker.C: - g := games[gi%len(games)] + g := active[gi%len(active)] gi++ if rng.Float64() < cfg.SecondaryProb { - d.secondaryOp(ctx, p, g, rng) + d.secondaryOp(ctx, c, p, g, rng) continue } - d.playTurn(ctx, p, g, rng) + if d.playTurn(ctx, c, p, g, rng) { + active = slices.DeleteFunc(active, func(x *Game) bool { return x == g }) + gi = 0 + if len(active) == 0 { + <-ctx.Done() + return + } + } } } } -// subscribeLoop holds the player's live-event stream open, counting events and -// reconnecting with a brief backoff after a drop, until the run ends. -func (d *Driver) subscribeLoop(ctx context.Context, p seed.Account) { +// subscribeLoop holds the player's live-event stream open on the player's client, +// counting events and reconnecting with a brief backoff after a drop, until the run +// ends. +func (d *Driver) subscribeLoop(ctx context.Context, c *edge.Client, p seed.Account) { for ctx.Err() == nil { - err := d.edge.Subscribe(ctx, p.Token, func(e edge.Event) { d.rec.Event(e.Kind) }) + err := c.Subscribe(ctx, p.Token, func(e edge.Event) { d.rec.Event(e.Kind) }) if ctx.Err() != nil { return } @@ -144,80 +160,90 @@ func (d *Driver) subscribeLoop(ctx context.Context, p seed.Account) { } } -// playTurn plays one turn in g when it is the player's move: fetch state, replay -// history, pick a legal move and submit it (or exchange / pass). 
-func (d *Driver) playTurn(ctx context.Context, p seed.Account, g *Game, rng *rand.Rand) { +// playTurn plays one turn in g over the player's client when it is the player's +// move: fetch state, replay history, pick a legal move and submit it (or exchange / +// pass). It reports whether the game has finished, so the caller can drop it from the +// rotation. +func (d *Driver) playTurn(ctx context.Context, c *edge.Client, p seed.Account, g *Game, rng *rand.Rand) (finished bool) { seat := g.seatOf(p.ID.String()) if seat < 0 { - return + return false } t0 := time.Now() - st, code, err := d.edge.State(ctx, p.Token, g.ID) + st, code, err := c.State(ctx, p.Token, g.ID) d.rec.Record("game.state", code, time.Since(t0)) - if err != nil || code != "ok" || !st.Game.Active() || st.Game.ToMove != seat { - return + if err != nil || code != "ok" { + return false + } + if !st.Game.Active() { + return true + } + if st.Game.ToMove != seat { + return false } t0 = time.Now() - hist, hc, err := d.edge.History(ctx, p.Token, g.ID) + hist, hc, err := c.History(ctx, p.Token, g.ID) d.rec.Record("game.history", hc, time.Since(t0)) if err != nil || hc != "ok" { - return + return false } action, err := d.moves.Pick(g.Variant, hist, st.Rack, st.BagLen, rng) if err != nil { d.log.Debug("pick move", "variant", g.Variant, "err", err) - return + return false } switch action.Kind { case "play": t0 = time.Now() - _, c, _ := d.edge.SubmitPlay(ctx, p.Token, g.ID, action.Dir, action.Tiles) - d.rec.Record("game.submit_play", c, time.Since(t0)) + _, code, _ := c.SubmitPlay(ctx, p.Token, g.ID, action.Dir, action.Tiles) + d.rec.Record("game.submit_play", code, time.Since(t0)) case "exchange": t0 = time.Now() - _, c, _ := d.edge.Exchange(ctx, p.Token, g.ID, action.Exchange) - d.rec.Record("game.exchange", c, time.Since(t0)) + _, code, _ := c.Exchange(ctx, p.Token, g.ID, action.Exchange) + d.rec.Record("game.exchange", code, time.Since(t0)) default: t0 = time.Now() - _, c, _ := d.edge.Pass(ctx, p.Token, g.ID) 
- d.rec.Record("game.pass", c, time.Since(t0)) + _, code, _ := c.Pass(ctx, p.Token, g.ID) + d.rec.Record("game.pass", code, time.Since(t0)) } + return false } // secondaryOp exercises one of the non-move edge operations the plan calls out, so -// the run touches nudge / chat / check-word / draft / profile / stats too. -func (d *Driver) secondaryOp(ctx context.Context, p seed.Account, g *Game, rng *rand.Rand) { +// the run touches nudge / chat / check-word / draft / profile / stats too, over the +// player's own client. +func (d *Driver) secondaryOp(ctx context.Context, c *edge.Client, p seed.Account, g *Game, rng *rand.Rand) { t0 := time.Now() switch rng.Intn(7) { case 0: - c, _ := d.edge.Nudge(ctx, p.Token, g.ID) - d.rec.Record("chat.nudge", c, time.Since(t0)) + code, _ := c.Nudge(ctx, p.Token, g.ID) + d.rec.Record("chat.nudge", code, time.Since(t0)) case 1: - c, _ := d.edge.ChatPost(ctx, p.Token, g.ID, "gg") - d.rec.Record("chat.post", c, time.Since(t0)) + code, _ := c.ChatPost(ctx, p.Token, g.ID, "gg") + d.rec.Record("chat.post", code, time.Since(t0)) case 2: - c, _ := d.edge.CheckWord(ctx, p.Token, g.ID, []byte{0, 1, 2}) - d.rec.Record("game.check_word", c, time.Since(t0)) + code, _ := c.CheckWord(ctx, p.Token, g.ID, []byte{0, 1, 2}) + d.rec.Record("game.check_word", code, time.Since(t0)) case 3: // rack_order is an opaque string and board_tiles a (here empty) array, per the // backend draft DTO; a malformed shape is rejected as bad_request. 
- c, _ := d.edge.DraftSave(ctx, p.Token, g.ID, `{"rack_order":"","board_tiles":[]}`) - d.rec.Record("draft.save", c, time.Since(t0)) + code, _ := c.DraftSave(ctx, p.Token, g.ID, `{"rack_order":"","board_tiles":[]}`) + d.rec.Record("draft.save", code, time.Since(t0)) case 4: - c, _ := d.edge.DraftGet(ctx, p.Token, g.ID) - d.rec.Record("draft.get", c, time.Since(t0)) + code, _ := c.DraftGet(ctx, p.Token, g.ID) + d.rec.Record("draft.get", code, time.Since(t0)) case 5: lang := "en" if rng.Intn(2) == 1 { lang = "ru" } - c, _ := d.edge.ProfileUpdate(ctx, p.Token, p.Name, lang) - d.rec.Record("profile.update", c, time.Since(t0)) + code, _ := c.ProfileUpdate(ctx, p.Token, p.Name, lang) + d.rec.Record("profile.update", code, time.Since(t0)) default: - c, _ := d.edge.Stats(ctx, p.Token) - d.rec.Record("stats.get", c, time.Since(t0)) + code, _ := c.Stats(ctx, p.Token) + d.rec.Record("stats.get", code, time.Since(t0)) } } From c16f27475f7b557ba200066d6917d05c5660c6b8 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Wed, 10 Jun 2026 18:53:19 +0200 Subject: [PATCH 2/6] R7: contour docker_stats observability + container limits/GOMAXPROCS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Observability: replace cAdvisor (which resolves only the root cgroup on the contour host — separate-XFS /var/lib/docker) with the otelcol docker_stats receiver, which reads per-container CPU/memory/network straight from the Docker API and works the same in prod. The collector joins the host docker group (DOCKER_GID, default 989) and mounts the socket read-only; its metrics flow out through the existing prometheus exporter, so the cAdvisor scrape job and the privileged cAdvisor service are removed. The Resources dashboard panels are retargeted to the docker_stats metric names (container_name label; container.cpu.utilization/100 == cores). 
Container limits: apply deploy.resources.limits (honoured by Compose v2) across the contour and pin GOMAXPROCS to the CPU limit on the Go services so the runtime matches the cgroup quota. Starting values are generous over the R2 peak (~1 core / <=100 MiB per app service) to avoid skewing or OOM-killing the measurement run; they are tightened to the agreed prod sizing after the final stress run (R7 Round 2). The privileged VPN sidecar is left unconstrained. --- deploy/docker-compose.yml | 90 ++++++++++++++++++------ deploy/grafana/dashboards/resources.json | 18 ++--- deploy/otelcol/config.yaml | 14 +++- deploy/prometheus/prometheus.yml | 7 +- 4 files changed, 94 insertions(+), 35 deletions(-) diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 706c1a0..038c156 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -39,6 +39,13 @@ services: retries: 30 volumes: - postgres-data:/var/lib/postgresql/data + # R7 starting limits: 512M leaves headroom over the default 128 MB shared_buffers + + # per-connection memory (R2 peaked at 28 backends / 69 MiB RSS); tighten after the run. + deploy: + resources: + limits: + cpus: "2.0" + memory: 512M networks: [internal] backend: @@ -65,8 +72,19 @@ services: BACKEND_OTEL_METRICS_EXPORTER: otlp OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 OTEL_EXPORTER_OTLP_INSECURE: "true" + # GOMAXPROCS matches the CPU limit below so the Go scheduler aligns with the + # cgroup quota (the runtime otherwise sees all of the host's cores). + GOMAXPROCS: "2" # No container healthcheck: the distroless image has no shell/wget. Readiness # is covered by the CI post-deploy probe (GET / through caddy). + # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tightened to + # the agreed prod values after the final stress run. deploy.resources.limits is + # honoured by `docker compose up` (Compose v2), not only by swarm. 
+ deploy: + resources: + limits: + cpus: "2.0" + memory: 512M networks: [internal] gateway: @@ -97,8 +115,17 @@ services: GATEWAY_OTEL_METRICS_EXPORTER: otlp OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 OTEL_EXPORTER_OTLP_INSECURE: "true" + # GOMAXPROCS matches the CPU limit below (see backend). + GOMAXPROCS: "2" # GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly. + # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after + # the final stress run. + deploy: + resources: + limits: + cpus: "2.0" + memory: 512M networks: [internal] # --- Landing (static) ------------------------------------------------------- @@ -121,6 +148,10 @@ services: VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} VITE_APP_VERSION: ${APP_VERSION:-dev} restart: unless-stopped + deploy: + resources: + limits: + memory: 128M networks: [internal] # --- Telegram connector (egress via the VPN sidecar) ----------------------- @@ -167,6 +198,13 @@ services: TELEGRAM_OTEL_METRICS_EXPORTER: otlp OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 OTEL_EXPORTER_OTLP_INSECURE: "true" + # The connector is light (the stress run does not drive Telegram); one P suffices. + GOMAXPROCS: "1" + deploy: + resources: + limits: + cpus: "1.0" + memory: 256M # --- Edge reverse proxy (single /_gm Basic-Auth; SPA + Connect -> gateway; # the catch-all incl. 
the landing -> the static landing container) ------- @@ -183,6 +221,10 @@ services: volumes: - ${SCRABBLE_CONFIG_DIR:-.}/caddy/Caddyfile:/etc/caddy/Caddyfile:ro - caddy-data:/data + deploy: + resources: + limits: + memory: 128M networks: internal: {} edge: @@ -194,8 +236,19 @@ services: image: otel/opentelemetry-collector-contrib:0.119.0 restart: unless-stopped command: ["--config=/etc/otelcol/config.yaml"] + # The docker_stats receiver reads per-container metrics from the Docker API, so the + # collector (image UID 10001) joins the host's docker group to read the socket — + # DOCKER_GID defaults to the contour host's 989; set it for other hosts (prod). The + # socket is mounted read-only. This replaces cAdvisor, whose per-container metrics + # are empty on this host (separate-XFS /var/lib/docker). + group_add: ["${DOCKER_GID:-989}"] volumes: - ${SCRABBLE_CONFIG_DIR:-.}/otelcol/config.yaml:/etc/otelcol/config.yaml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + deploy: + resources: + limits: + memory: 512M networks: [internal] prometheus: @@ -208,6 +261,10 @@ services: volumes: - ${SCRABBLE_CONFIG_DIR:-.}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - prometheus-data:/prometheus + deploy: + resources: + limits: + memory: 512M networks: [internal] tempo: @@ -218,6 +275,11 @@ services: volumes: - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro - tempo-data:/var/tempo + # tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run. + deploy: + resources: + limits: + memory: 1G networks: [internal] grafana: @@ -247,26 +309,10 @@ services: # provider logs "no such file or directory"). - ${SCRABBLE_CONFIG_DIR:-.}/grafana/dashboards:/etc/grafana/dashboards:ro - grafana-data:/var/lib/grafana - networks: [internal] - - # cAdvisor exports per-container resource metrics (CPU / memory / network / disk) - # for the R2/R7 stress runs' resource baseline. Prometheus scrapes it at :8080 - # over the internal network. 
It needs read access to the host's cgroup and - # container state; --docker_only trims non-container cgroup series. - cadvisor: - container_name: scrabble-cadvisor - image: gcr.io/cadvisor/cadvisor:v0.49.1 - restart: unless-stopped - privileged: true - command: ["--docker_only=true", "--housekeeping_interval=15s"] - devices: - - /dev/kmsg - volumes: - - /:/rootfs:ro - - /var/run:/var/run:ro - - /sys:/sys:ro - - /var/lib/docker/:/var/lib/docker:ro - - /dev/disk/:/dev/disk:ro + deploy: + resources: + limits: + memory: 512M networks: [internal] # postgres_exporter exports Postgres server metrics (connections, cache hit ratio, @@ -279,6 +325,10 @@ services: depends_on: [postgres] environment: DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable + deploy: + resources: + limits: + memory: 128M networks: [internal] networks: diff --git a/deploy/grafana/dashboards/resources.json b/deploy/grafana/dashboards/resources.json index 425f88d..a1d503e 100644 --- a/deploy/grafana/dashboards/resources.json +++ b/deploy/grafana/dashboards/resources.json @@ -4,7 +4,7 @@ "tags": ["scrabble"], "timezone": "", "schemaVersion": 39, - "version": 1, + "version": 2, "refresh": "30s", "time": { "from": "now-1h", "to": "now" }, "panels": [ @@ -43,30 +43,30 @@ { "type": "timeseries", "title": "Container CPU (cores) by container", - "description": "cAdvisor container_cpu_usage_seconds_total rate, per scrabble-* container (the load harness appears when run as --name scrabble-loadtest). Verify the metric name against live Prometheus if empty.", + "description": "docker_stats container.cpu.utilization (a gauge where 100 == one core) / 100, per scrabble-* container; the load harness appears when run as --name scrabble-loadtest. 
Verify the scaling against live Prometheus.", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "targets": [{ "refId": "A", "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"scrabble-.+\"}[5m])) by (name)", "legendFormat": "{{name}}" }] + "targets": [{ "refId": "A", "expr": "max(container_cpu_utilization{container_name=~\"scrabble-.+\"}) by (container_name) / 100", "legendFormat": "{{container_name}}" }] }, { "type": "timeseries", - "title": "Container memory (working set) by container", - "description": "cAdvisor container_memory_working_set_bytes, per scrabble-* container.", + "title": "Container memory (usage) by container", + "description": "docker_stats container.memory.usage.total bytes, per scrabble-* container.", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }, "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "targets": [{ "refId": "A", "expr": "max(container_memory_working_set_bytes{name=~\"scrabble-.+\"}) by (name)", "legendFormat": "{{name}}" }] + "targets": [{ "refId": "A", "expr": "max(container_memory_usage_total{container_name=~\"scrabble-.+\"}) by (container_name)", "legendFormat": "{{container_name}}" }] }, { "type": "timeseries", "title": "Container network I/O by container", - "description": "cAdvisor receive (+) and transmit (-) byte rates per scrabble-* container.", + "description": "docker_stats receive (+) and transmit (-) byte rates per scrabble-* container (summed across interfaces).", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, "fieldConfig": { "defaults": { "unit": "Bps" }, "overrides": [] }, "datasource": { "type": "prometheus", "uid": "prometheus" }, "targets": [ - { "refId": "A", "expr": "sum(rate(container_network_receive_bytes_total{name=~\"scrabble-.+\"}[5m])) by (name)", "legendFormat": "rx {{name}}" }, - { "refId": "B", "expr": 
"-sum(rate(container_network_transmit_bytes_total{name=~\"scrabble-.+\"}[5m])) by (name)", "legendFormat": "tx {{name}}" } + { "refId": "A", "expr": "sum(rate(container_network_io_usage_rx_bytes{container_name=~\"scrabble-.+\"}[5m])) by (container_name)", "legendFormat": "rx {{container_name}}" }, + { "refId": "B", "expr": "-sum(rate(container_network_io_usage_tx_bytes{container_name=~\"scrabble-.+\"}[5m])) by (container_name)", "legendFormat": "tx {{container_name}}" } ] }, { diff --git a/deploy/otelcol/config.yaml b/deploy/otelcol/config.yaml index 8a0e1f4..3e814d4 100644 --- a/deploy/otelcol/config.yaml +++ b/deploy/otelcol/config.yaml @@ -6,6 +6,18 @@ receivers: protocols: grpc: endpoint: 0.0.0.0:4317 + # Per-container resource metrics (CPU / memory / network) read straight from the + # Docker API. This replaces cAdvisor, which on the contour host resolves only the + # root cgroup (its /var/lib/docker is a separate XFS mount), and works the same in + # prod. The collector reaches the socket via group_add in docker-compose.yml. + # collection_interval matches Prometheus' 30s scrape. container.cpu.utilization is a + # gauge where 100 == one core (it mirrors `docker stats` CPU%). + docker_stats: + endpoint: unix:///var/run/docker.sock + collection_interval: 30s + metrics: + container.cpu.utilization: + enabled: true processors: batch: {} @@ -33,6 +45,6 @@ service: processors: [batch] exporters: [otlp/tempo] metrics: - receivers: [otlp] + receivers: [otlp, docker_stats] processors: [batch] exporters: [prometheus] diff --git a/deploy/prometheus/prometheus.yml b/deploy/prometheus/prometheus.yml index 1cecf13..71d6cd8 100644 --- a/deploy/prometheus/prometheus.yml +++ b/deploy/prometheus/prometheus.yml @@ -6,17 +6,14 @@ global: evaluation_interval: 30s scrape_configs: + # otelcol exposes both the services' OTLP metrics and the docker_stats receiver's + # per-container resource metrics (CPU/memory/network) on one endpoint. 
- job_name: otelcol static_configs: - targets: ["otelcol:9464"] - job_name: prometheus static_configs: - targets: ["localhost:9090"] - # Container resource metrics (CPU/memory/network/disk) for every contour - # container, for the R2/R7 stress runs' resource baseline. - - job_name: cadvisor - static_configs: - - targets: ["cadvisor:8080"] # Postgres server metrics (connections, cache hit ratio, transactions, db size). - job_name: postgres_exporter static_configs: From 8eee01872840003e7d9c89ca7e81e90f18ee3ca6 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Wed, 10 Jun 2026 18:58:55 +0200 Subject: [PATCH 3/6] R7: pin docker_stats api_version to 1.44 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The receiver defaults to Docker API 1.25, but the contour daemon's minimum is 1.40 (it speaks up to 1.54), so otelcol crash-looped on start with "client version 1.25 is too old". Pinning api_version to 1.44 (accepted by both the receiver's bundled client and the daemon) starts the receiver cleanly — verified by running the image against the host socket ("Everything is ready", no start error). --- deploy/otelcol/config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deploy/otelcol/config.yaml b/deploy/otelcol/config.yaml index 3e814d4..f5b5485 100644 --- a/deploy/otelcol/config.yaml +++ b/deploy/otelcol/config.yaml @@ -14,6 +14,10 @@ receivers: # gauge where 100 == one core (it mirrors `docker stats` CPU%). docker_stats: endpoint: unix:///var/run/docker.sock + # The receiver defaults to Docker API 1.25, but the daemon's minimum is 1.40 + # (server speaks up to 1.54); pin a version both the receiver's client and the + # daemon accept, else the receiver fails to start ("client version too old"). 
+ api_version: "1.44" collection_interval: 30s metrics: container.cpu.utilization: From f23da88028bf64d619f54fcddf44ba4234901373 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Thu, 11 Jun 2026 10:33:58 +0200 Subject: [PATCH 4/6] R7: apply the agreed tuning from the final stress run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 tuning, decided from the 500-player resource profile: - gateway: 2 -> 3 cores + GOMAXPROCS=3. It holds one h2c connection per player, so at 500 players it burst into the 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs the bursts. The per-connection cost is the realistic prod load. - tempo: memory 1G -> 2G. It reached the 1 GiB cap during the run (OOM risk). - backend Postgres pool: MAX_OPEN_CONNS 25 -> 40. The pool sat at its 25-conn cap (28 backends) at peak; headroom trims the p99 tail. Postgres (2c/512M) handles it. - docker log volume: a json-file rotation default (10m x 3 = 30 MiB/container) applied contour-wide via a YAML anchor; the backend logs ~14 MiB / 30 min at info under load and was previously unbounded. Log level stays info. backend/postgres stay at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores — headroom is cheap on the shared host). A validation re-run confirms the gateway fix before merge. --- deploy/docker-compose.yml | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 038c156..5702f17 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -23,11 +23,22 @@ # (network_mode: service:vpn); it answers internal gRPC at `telegram:9091`. name: scrabble +# Bound every container's json-file logs. R7 measured the backend emitting a +# per-request latency line at info (~14 MiB / 30 min under the 500-player stress +# peak); without rotation the volume grows unbounded. 10 MiB x 3 files caps each +# container at 30 MiB. 
Applied to every service via the *default-logging alias. +x-logging: &default-logging + driver: json-file + options: + max-size: "10m" + max-file: "3" + services: postgres: container_name: scrabble-postgres image: postgres:17-alpine restart: unless-stopped + logging: *default-logging environment: POSTGRES_DB: ${POSTGRES_DB:-scrabble} POSTGRES_USER: ${POSTGRES_USER:-scrabble} @@ -57,12 +68,16 @@ services: args: DICT_VERSION: ${DICT_VERSION:-v1.0.0} restart: unless-stopped + logging: *default-logging depends_on: postgres: condition: service_healthy environment: # search_path=backend matches the migrations (00001 creates the schema). BACKEND_POSTGRES_DSN: postgres://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable&search_path=backend + # R7 tuned: the pool sat at its 25-conn cap (28 backends total) at 500 players; + # 40 gives headroom for bursts. Postgres (2 cores / 512 MiB) handles it. + BACKEND_POSTGRES_MAX_OPEN_CONNS: "40" BACKEND_HTTP_ADDR: ":8080" BACKEND_GRPC_ADDR: ":9090" BACKEND_CONNECTOR_ADDR: telegram:9091 @@ -102,6 +117,7 @@ services: VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} VITE_APP_VERSION: ${APP_VERSION:-dev} restart: unless-stopped + logging: *default-logging depends_on: [backend] environment: GATEWAY_HTTP_ADDR: ":8081" @@ -116,15 +132,16 @@ services: OTEL_EXPORTER_OTLP_ENDPOINT: http://otelcol:4317 OTEL_EXPORTER_OTLP_INSECURE: "true" # GOMAXPROCS matches the CPU limit below (see backend). - GOMAXPROCS: "2" + GOMAXPROCS: "3" # GATEWAY_ADMIN_* intentionally unset: in the deployed contour the front # caddy owns the /_gm Basic-Auth and routes /_gm to the backend directly. - # R7 starting limits (generous over the R2 ~1-core / <=100 MiB peak); tighten after - # the final stress run. + # R7 tuned: the gateway holds one h2c connection per player, so at 500 players it + # bursts into a 2-core cap (~2.49% transport_error on game.state); 3 cores absorbs + # the bursts. 
Per-connection overhead is the realistic prod cost — size for it. deploy: resources: limits: - cpus: "2.0" + cpus: "3.0" memory: 512M networks: [internal] @@ -148,6 +165,7 @@ services: VITE_GATEWAY_URL: ${VITE_GATEWAY_URL:-} VITE_APP_VERSION: ${APP_VERSION:-dev} restart: unless-stopped + logging: *default-logging deploy: resources: limits: @@ -159,6 +177,7 @@ services: container_name: scrabble-telegram-vpn image: docker.iliadenisov.ru/developer/amneziawg-sidecar:latest restart: unless-stopped + logging: *default-logging privileged: true environment: AWG_CONF: ${AWG_CONF:?set AWG_CONF} @@ -173,6 +192,7 @@ services: context: .. dockerfile: platform/telegram/Dockerfile restart: unless-stopped + logging: *default-logging depends_on: [vpn] network_mode: "service:vpn" environment: @@ -212,6 +232,7 @@ services: container_name: scrabble-caddy image: caddy:2-alpine restart: unless-stopped + logging: *default-logging depends_on: [gateway, backend, grafana, landing] environment: # Test: ":80" (host caddy terminates TLS). Prod: a domain for own ACME. 
@@ -235,6 +256,7 @@ services: container_name: scrabble-otelcol image: otel/opentelemetry-collector-contrib:0.119.0 restart: unless-stopped + logging: *default-logging command: ["--config=/etc/otelcol/config.yaml"] # The docker_stats receiver reads per-container metrics from the Docker API, so the # collector (image UID 10001) joins the host's docker group to read the socket — @@ -255,6 +277,7 @@ services: container_name: scrabble-prometheus image: prom/prometheus:v2.55.1 restart: unless-stopped + logging: *default-logging command: - --config.file=/etc/prometheus/prometheus.yml - --storage.tsdb.retention.time=15d @@ -271,21 +294,24 @@ services: container_name: scrabble-tempo image: grafana/tempo:2.7.1 restart: unless-stopped + logging: *default-logging command: ["-config.file=/etc/tempo/tempo.yaml"] volumes: - ${SCRABBLE_CONFIG_DIR:-.}/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro - tempo-data:/var/tempo - # tempo peaked at ~446 MiB in R2; 1G leaves headroom for the final run. + # R7 tuned: tempo reached the 1 GiB cap during the final run (446 MiB in R2); + # raised to 2 GiB for headroom against OOM under sustained tracing load. 
deploy: resources: limits: - memory: 1G + memory: 2G networks: [internal] grafana: container_name: scrabble-grafana image: grafana/grafana:11.4.0 restart: unless-stopped + logging: *default-logging depends_on: [prometheus, tempo] environment: # Served under /_gm/grafana behind caddy's Basic-Auth; anonymous Admin so a @@ -322,6 +348,7 @@ services: container_name: scrabble-postgres-exporter image: prometheuscommunity/postgres-exporter:v0.16.0 restart: unless-stopped + logging: *default-logging depends_on: [postgres] environment: DATA_SOURCE_NAME: postgresql://${POSTGRES_USER:-scrabble}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-scrabble}?sslmode=disable From 2a48df9b83afeba81ee24c60e0f6f1c0d73f3ea7 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Thu, 11 Jun 2026 11:18:57 +0200 Subject: [PATCH 5/6] R7: trip report + docs/tracker bake-back; mark R7 done MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - loadtest/REPORT-R7.md: the final stress-run report — method, the 500-player resource profile, the agreed tuning, the validation (transport_error 2.49% -> 0.72% at 3 gateway cores; the burst run showing connection-bound behavior), and the prod-sizing recommendation for Stage 18. - loadtest/README.md: per-player transports, --cpus capping, docker_stats (was cAdvisor), the absolute BACKEND_DICT_DIR for ./loadtest/... , and report links. - docs/TESTING.md + docs/ARCHITECTURE.md: observability now uses the otelcol docker_stats receiver (cAdvisor removed); links to both trip reports. - CLAUDE.md: repo-layout line reflects docker_stats + per-service limits. - PRERELEASE.md: R7 marked done in the tracker + heading; a Refinements entry recording the decisions, findings, applied tuning and validation. This is the final pre-release hardening phase; Stage 18 (prod cutover) is next. 
--- CLAUDE.md | 2 +- PRERELEASE.md | 37 ++++++++- docs/ARCHITECTURE.md | 12 +-- docs/TESTING.md | 6 +- loadtest/README.md | 36 +++++--- loadtest/REPORT-R7.md | 185 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 257 insertions(+), 21 deletions(-) create mode 100644 loadtest/REPORT-R7.md diff --git a/CLAUDE.md b/CLAUDE.md index 28a1502..3d7e611 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -128,7 +128,7 @@ gateway/ ui/ pkg/ # added by their stages platform/telegram/ # Telegram connector side-service (Stage 9): bot + gRPC API loadtest/ # module scrabble/loadtest: the pre-release stress harness (R2) backend/Dockerfile gateway/Dockerfile platform/telegram/Dockerfile loadtest/Dockerfile # multi-stage distroless (Stage 16; loadtest R2); gateway/Dockerfile also has the `landing` target (R3) -deploy/ # docker-compose + caddy + landing + otelcol/prometheus/tempo/grafana (+ cAdvisor/postgres_exporter, R2) +deploy/ # docker-compose (per-service limits, R7) + caddy + landing + otelcol (OTLP + docker_stats per-container metrics) + prometheus/tempo/grafana + postgres_exporter ``` ## Build & test diff --git a/PRERELEASE.md b/PRERELEASE.md index 8f3fe12..c05acd4 100644 --- a/PRERELEASE.md +++ b/PRERELEASE.md @@ -23,7 +23,7 @@ the edge before prod. Each phase maps back to the owner's raw pre-release TODO l | R4 | Push enrichment + kill the last poll | 4 + 5 | **done** | | R5 | Bundle slimming | 6 | **done** | | R6 | Refactor + docs reconciliation + de-staging | 7 | **done** | -| R7 | Final stress run + tuning | 9b | todo | +| R7 | Final stress run + tuning | 9b | **done** | | → | Stage 18 — prod contour deploy | — | see [`PLAN.md`](PLAN.md) | ## Key findings (these reshaped the raw list — read before starting a phase) @@ -168,7 +168,7 @@ regression gate. Incorporates the early-run (R2) bug fixes not already shipped. - Open details: the structural-changes list itself (owner-approved before applying); the test consolidation targets. 
-### R7 — Final stress run + tuning *(TODO 9, part 2)* — before Stage 18 +### R7 — Final stress run + tuning *(TODO 9, part 2)* — done Re-run the R2 harness against the final, refactored system on a clean contour; analyse resource consumption across **all** components (gateway, backend, Postgres, the metrics/observability stack, docker log volume) and agree the tuning (pool sizes, rate @@ -380,3 +380,36 @@ Then Stage 18. 10 files) into `backend/internal/inttest/helpers.go`; single-file helpers stay local. Pure relocation. - **No schema change → no contour DB wipe.** Regression gate: the full unit + integration + UI suites plus the R7 stress run. + +- **R7** (interview + implementation): + - **Locked decisions:** run the harness **same-host** (one-shot container on `scrabble-internal`, capped + `--cpus=3` so the contour keeps spare cores); **apply container limits + `GOMAXPROCS` now** (not just a + prod recommendation); **replace cAdvisor with the otelcol `docker_stats` receiver** (it resolved only the + root cgroup on this host); keep rate-limit / h2c knobs **compiled-in** (change values only if the data + demands — it did not). + - **Harness refinements (pre-run):** each virtual player builds its **own `edge.Client`** (its own h2c + connection for its Subscribe stream + Execute calls) instead of all players sharing one `http2.Transport` — + the R2 `transport_error` artifact; and `playTurn` now reports a **finished** game so the player drops it + from rotation. Effect, measured: `game.state` `transport_error` 14 % (R2) → **2.49 %**; `game_finished` on + chat ≈ 3 900 → **35**. 
+ - **Observability:** added the `docker_stats` receiver to `otelcol` (`api_version: "1.44"` — the daemon's minimum is 1.40; the receiver defaults to 1.25 and crash-looped until pinned), mounted the docker socket + read-only with `group_add` (the contrib image runs as UID 10001), dropped the cAdvisor service + its + Prometheus job, and retargeted the **Scrabble — Resources** dashboard to the docker_stats metric names + (`container_cpu_utilization`/100 == cores). Cross-checked against `docker stats` within sampling error. + - **Profile (final run, 500 players, limits in force):** the **gateway is the binding constraint** — with + one connection per player it bursts into its 2-core cap (the residual 2.49 % `transport_error`); backend + ~0.85 core and postgres ~1.4 cores had headroom; **tempo reached its 1 GiB cap**; the backend pool sat at + its `MaxOpenConns=25` cap (28 backends); docker logs were unbounded (~14 MiB / 30 min on the backend at + info). Full write-up in [`loadtest/REPORT-R7.md`](loadtest/REPORT-R7.md). + - **Round-2 tuning (owner-agreed, all in `deploy/docker-compose.yml`, no code change):** gateway **2 → 3 + cores + `GOMAXPROCS=3`**; tempo memory **1 → 2 GiB**; backend `MAX_OPEN_CONNS` **25 → 40**; a json-file + **log-rotation** default (10m × 3) applied contour-wide via a YAML anchor (level stays info). + backend/postgres kept at 2 cores / 512 MiB (headroom is cheap on the shared host). + - **Validation:** the same gradual ramp on the tuned contour cut `game.state` `transport_error` to **0.72 %** + (gateway ~2 cores, now under the 3-core cap, no throttle; tempo ~1.27 GiB, under 2 GiB). A separate + **burst** run (a single 100 → 500 jump) pegged the gateway at 3 cores (≈296 % sustained, 9.27 % error), + confirming it is **connection-CPU-bound** — the lever for a true arrival spike is **horizontal scaling**, not + more cores per node (recorded in the prod-sizing recommendation).
+ - **No schema change → no contour DB wipe.** Bake-back: `loadtest/REPORT-R7.md` (new), `loadtest/README.md`, + `docs/TESTING.md`, the telemetry/observability section of `docs/ARCHITECTURE.md`, the repo-layout line in `CLAUDE.md`. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 0e4e361..e3a481e 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -561,11 +561,13 @@ promotions) is future work and would deliver short markdown messages (text + lin metrics + Tempo traces), **Prometheus** (15d), **Tempo** (72h) and **Grafana** (provisioned datasources + dashboards, behind the caddy `/_gm/grafana` Basic-Auth) are stood up with the deploy (`deploy/`); the default exporter stays - `none`, so CI needs no collector. The contour also runs **cAdvisor** (per-container - CPU/memory/network) and **postgres_exporter** (connections, cache-hit ratio, - transactions, db size), scraped by Prometheus and surfaced on the **Scrabble — - Resources** Grafana dashboard, which captures a resource - baseline; these export directly in Prometheus format (not through the collector). + `none`, so CI needs no collector. The collector also runs a **`docker_stats`** + receiver (per-container CPU/memory/network read from the Docker API and exported + through its Prometheus endpoint), and the contour runs **postgres_exporter** + (connections, cache-hit ratio, transactions, db size, scraped directly by Prometheus); + both are surfaced on the **Scrabble — Resources** Grafana dashboard, which captures the + stress-run resource profile. (`docker_stats` replaced cAdvisor, which on the contour + host resolved only the root cgroup — a separate-XFS `/var/lib/docker`.) - Per-request server-side timing via gin middleware from day one (the access log carries method, route, status, latency and the active trace id). A client-measured RTT piggybacked on the next request is a later enhancement. 
diff --git a/docs/TESTING.md b/docs/TESTING.md index 051d42e..d3012b8 100644 --- a/docs/TESTING.md +++ b/docs/TESTING.md @@ -127,8 +127,10 @@ tests or touching CI. selection, the report); the DAWG-backed move test runs under `BACKEND_DICT_DIR` (as the engine tests do). It is **not** part of the per-PR suite's behavioural assertions: it runs ad hoc as a one-shot container against the contour, producing a trip report (bugs - + a resource baseline) read off the **cAdvisor + postgres_exporter** Grafana dashboard - on the contour. See [`../loadtest/README.md`](../loadtest/README.md). + + a per-container resource profile) read off the **otelcol `docker_stats` + + postgres_exporter** Grafana dashboard on the contour. Two passes are recorded — the + early [`REPORT-R2.md`](../loadtest/REPORT-R2.md) and the final, tuned + [`REPORT-R7.md`](../loadtest/REPORT-R7.md). See [`../loadtest/README.md`](../loadtest/README.md). ## Principles diff --git a/loadtest/README.md b/loadtest/README.md index d12845f..5ffb762 100644 --- a/loadtest/README.md +++ b/loadtest/README.md @@ -36,17 +36,21 @@ container on the contour's docker network (this bypasses the host→gateway hair # from the repo root docker build -f loadtest/Dockerfile -t scrabble-loadtest . -docker run --rm --name scrabble-loadtest --network scrabble-internal \ +docker run --rm --cpus=3 --name scrabble-loadtest --network scrabble-internal \ -e POSTGRES_PASSWORD="$TEST_POSTGRES_PASSWORD" \ scrabble-loadtest run ``` -Defaults assume the contour service names: `postgres:5432` and `gateway:8081`. The -DAWGs are baked into the image (`/opt/dawg`, pinned to the dictionary release). Run with +Each virtual player gets its own `edge.Client` (its own h2c connection), mirroring real +clients rather than multiplexing every player over one transport. Defaults assume the +contour service names: `postgres:5432` and `gateway:8081`. The DAWGs are baked into the +image (`/opt/dawg`, pinned to the dictionary release). 
On a host shared with the contour, +cap the harness (`--cpus=3`) so the contour keeps the spare cores. Run with `--name scrabble-loadtest` so the harness's own CPU/memory show up as a `scrabble-*` -series in cAdvisor (keeping it separable from the system under test). Capture the -resource baseline from the Grafana **Scrabble — Resources** dashboard -(cAdvisor + postgres_exporter) while the run is in progress. +series in the metrics (keeping it separable from the system under test). Capture the +resource baseline from the Grafana **Scrabble — Resources** dashboard (the otelcol +`docker_stats` receiver + postgres_exporter), or from `docker stats` directly, while the +run is in progress. ## Commands & flags @@ -80,15 +84,25 @@ DB wipe (`DROP SCHEMA backend CASCADE` + backend restart). ```sh go build ./loadtest/... go vet ./loadtest/... -BACKEND_DICT_DIR=../scrabble-solver/dawg go test -count=1 ./loadtest/... +BACKEND_DICT_DIR="$PWD/../scrabble-solver/dawg" go test -count=1 ./loadtest/... ``` The DAWG-backed `moves` test runs only when `BACKEND_DICT_DIR` is set (as the engine tests use); the pure logic (hashing, board replay, rack build, move selection, report) -runs unconditionally. +runs unconditionally. Use an **absolute** path (here via `$PWD`): `go test ./loadtest/...` +runs each package from its own directory, so a relative `BACKEND_DICT_DIR` would not +resolve. + +## Trip reports + +The two stress passes are written up in the repo: the early pass in +[`REPORT-R2.md`](REPORT-R2.md) and the final, tuned pass in +[`REPORT-R7.md`](REPORT-R7.md). ## Caveat -The harness shares the host CPU with the contour, so the early-pass resource baseline -is read with the harness's own container series in mind; a cleaner number on separate -hardware is future work. The moderate ramp keeps the generator from being the bottleneck. 
+The harness shares the host CPU with the contour, so its own `scrabble-loadtest` +container series is read alongside the system under test; capping it with `--cpus` +keeps it from eating into the contour's CPU share. Per-player transports (R7) removed the shared-transport +artifact that inflated R2's `transport_error`, so the figures reflect the system. A +fully isolated ceiling on separate hardware remains future work. diff --git a/loadtest/REPORT-R7.md b/loadtest/REPORT-R7.md new file mode 100644 index 0000000..1c9baf8 --- /dev/null +++ b/loadtest/REPORT-R7.md @@ -0,0 +1,185 @@ +# R7 — final stress-run trip report + +The final pre-release stress pass for [`PRERELEASE.md`](../PRERELEASE.md) R7. It re-runs +the R2 harness (`scrabble/loadtest`) against the **final, refactored system** on a +freshly redeployed contour, to confirm the system holds at scale and to settle the +resource sizing (container limits, `GOMAXPROCS`, pools, rate limits, log levels) before +the Stage 18 prod cutover. Pass bar: **diagnostic + a tuning decision** — the run +"passes" by completing cleanly; the per-container resource profile drives the tuning +recorded below. Companion to the early pass, [`REPORT-R2.md`](REPORT-R2.md). + +## What changed since the R2 pass + +- **Harness — per-player transports.** Each virtual player now owns its `edge.Client` + (its own `http2.Transport` / h2c connection carrying both its `Subscribe` stream and + its `Execute` calls), instead of all players multiplexing over one shared transport. + R2 traced the ~14 % `transport_error` on `game.state` at 500 players to that single + shared connection's stream limit; per-player connections mirror real clients and + remove the artifact, so this pass measures the system, not the harness. +- **Harness — drop finished games.** `playTurn` reports a finished game and the player + drops it from its rotation, so secondary ops stop hitting `game_finished` on ended + games (the other R2 harness finding).
+- **Observability — otelcol `docker_stats`.** cAdvisor (which resolves only the root + cgroup on this host — separate-XFS `/var/lib/docker`) is replaced by the otelcol + `docker_stats` receiver, reading per-container CPU/memory/network from the Docker API. + Per-container panels now populate on the contour host. (`api_version` pinned to 1.44; + the daemon's minimum is 1.40.) +- **Contour — container limits + `GOMAXPROCS`.** `deploy.resources.limits` now bound + every service; the Go services pin `GOMAXPROCS` to their CPU limit so the runtime + matches the cgroup quota. Starting values were generous over the R2 peak; this pass + validates them and settles the agreed sizing (below). + +## Method + +Unchanged from R2 except for the per-player transports and the dropped-finished-games +refinement above: + +- **Driver:** the `scrabble/loadtest` module, run as a one-shot container on the + `scrabble-internal` docker network (reaching `postgres:5432` / `gateway:8081` + directly), capped at `--cpus 3` so the contour keeps the host's spare cores. +- **Seed:** 10 000 durable + 1 000 guest accounts with pre-created sessions written + straight to Postgres (token hash matches `backend/internal/session`). +- **Games:** assembled through the real **invitation** flow, 2–4 players each, no + robots; variants over scrabble_en / scrabble_ru / erudit_ru. +- **Play:** each player holds a live `Subscribe` stream and, per tick, polls + `game.state`, replays `game.history` and submits a **mid-ranked** legal move generated + locally by the embedded `scrabble-solver`, or passes / exchanges; a fraction exercise + nudge / chat / check-word / draft / profile / stats. A separate **gateway-hammer** + floods `games.list` from one account. +- **Scale:** the same moderate ramp **50 → 200 → 500** concurrent players, 10 min/step. 
+- **Resource capture:** `docker stats` (docker API) sampled every ~20 s for per-container + CPU/memory; the otelcol **`docker_stats`** receiver → Prometheus → the Grafana + **Scrabble — Resources** dashboard for the same per-container series; `postgres_exporter` + internals and per-service Go runtime metrics. + +## Run configuration + +``` +docker run --rm --cpus=3 --name scrabble-loadtest --network scrabble-internal \ + -e POSTGRES_PASSWORD=… scrabble-loadtest \ + run --durable 10000 --guest 1000 --steps 50,200,500 --step-dur 10m \ + --tick 800ms --hammer-workers 20 --hammer-dur 15s --reset --cleanup +``` + +Date: 2026-06-10. Contour: the R1-baseline schema, freshly redeployed with the R7 +container limits / `GOMAXPROCS` (backend/gateway/postgres capped at 2 cores + 512 MiB, +`GOMAXPROCS=2`) and the `docker_stats` observability. Seeded population removed by +`--cleanup` afterwards. + +## Findings + +The ramp ran clean to 500 players — no harness crash, no deadlock, `stream errors: 0` — +and cleanup removed all 11 000 seeded accounts. + +- **Volume (1827 s):** 821 680 edge calls (449.7 req/s incl. the hammer). Real gameplay + at scale: **50 916 committed plays**, 4 817 passes, 2 931 games finished; 165 755 + `opponent_moved` + 54 864 `your_turn` events. +- **The per-player transport fix worked.** `game.state` returned `transport_error` on + **3 173 / 127 403 = 2.49 %** of calls — down from R2's ~14 % on the same step. Other + ops were lower still (`game.history` 0.43 %, `game.submit_play` 0.28 %). The residual + is the gateway bursting into its 2-core cap (see the profile below), not the harness. +- **Dropping finished games worked.** `game_finished` on `chat.nudge` / `chat.post` fell + to **35 / 36** (R2: ≈ 3 900 each) — secondary ops no longer hammer ended games. 
+- **The limiter holds.** The gateway-hammer sent 565 152 `games.list`; **564 979 + (99.97 %) were `rate_limited`** (154 ok burst, 19 deadline), p99 = 2 ms, ~309 req/s of + rejections sustained — unchanged from R2. +- **Latency (peak):** `game.state` p50 ≈ 100 ms, p99 in the 2000 ms bucket (max 2549 ms); + `game.submit_play` p50 100 / p99 1000 ms bucket. Lobby ops stayed fast + (invitation / games.list p99 ≤ 10 ms). The p99 tail correlates with the gateway + burst-throttling, not the backend (which stayed at ~0.85 core). + +## Resource profile + +Per-container peak during step 3 (500 players), with the R7 starting limits in force +(backend/gateway/postgres capped at 2 cores / 512 MiB). Two CPU columns: `docker stats` +samples a ~1 s window (catches bursts); the otelcol `docker_stats` receiver averages over +its 30 s collection interval (smooths them) — they agree within sampling error, which +validates the new observability path. + +| container | CPU burst (1 s) | CPU sustained (30 s) | CPU cap | mem peak | mem cap | +|-----------|----------------:|---------------------:|--------:|---------:|--------:| +| scrabble-gateway | **217 %** (at cap) | ~145 % | 200 % | 167 MiB | 512 MiB | +| scrabble-postgres | 138 % | ~153 % | 200 % | 117 MiB | 512 MiB | +| scrabble-backend | 85 % | ~89 % | 200 % | 116 MiB | 512 MiB | +| scrabble-tempo | 33 % | — | (none) | **1024 MiB** (at cap) | 1024 MiB | +| scrabble-otelcol | 11 % | — | (none) | 131 MiB | 512 MiB | +| scrabble-loadtest (harness) | 157 % | — | 300 % | 369 MiB | — | + +- **The gateway is the binding constraint.** With one h2c connection per player it draws + ~1.45 cores sustained and **bursts to its 2-core cap** at 500 players, throttling + briefly — the source of the 2.49 % `transport_error`. R2 saw only ~0.93 core because + all 500 players shared one connection; the +~0.5 core is the realistic per-connection + overhead (500 separate HTTP/2 connections). This is a sizing fact, not a regression. 
+- **backend is over-provisioned** (~0.85 core vs a 2-core cap); **postgres** (~1.4 cores) + has headroom; both stayed ≤ 120 MiB. +- **tempo reached its 1 GiB memory cap** (R2: 446 MiB) — an OOM risk under sustained + tracing. +- **Postgres backends peaked at 28**, with the backend pool at its `MaxOpenConns=25` cap. + Cache hit stayed ~100 % (no disk reads); CPU, not I/O, is the limit. +- **docker log volume (30 min):** backend 14.2 MiB, gateway 4.6 MiB, postgres 0.04 MiB — + the backend's per-request latency line at info dominates, and json-file logs had no + rotation. + +## Tuning applied + +Agreed from the profile (all in `deploy/docker-compose.yml`; no code change — the pool +is already env-driven): + +| knob | from | to | why | +|------|------|----|-----| +| gateway CPU + `GOMAXPROCS` | 2 cores / 2 | **3 cores / 3** | it bursts into the 2-core cap at 500 players (the 2.49 % `transport_error`); 3 absorbs the bursts | +| tempo memory | 1 GiB | **2 GiB** | it reached the 1 GiB cap (OOM risk) | +| backend `MAX_OPEN_CONNS` | 25 | **40** | the pool sat at its 25-conn cap at peak; headroom trims the p99 tail | +| docker logs | unbounded | **json-file 10m × 3** | bound the ~14 MiB / 30 min backend log; level stays `info` | + +Left as-is: backend / postgres at 2 cores / 512 MiB (peak ~0.85 / ~1.4 cores — headroom +is cheap on the shared host); the per-user rate limiter and `h2cMaxConcurrentStreams=250` +(per-connection now, ~1 stream each — ample) and cache TTLs (no pressure observed). + +### Validation re-run + +Re-running the **same gradual ramp** (50 → 200 → 500) on the tuned contour confirms the +fix: + +- **`game.state` `transport_error` fell to 0.72 %** (853 / 119 051), down from 2.49 % at + 2 cores. The latency tail also improved — p99 in the 1000 ms bucket, max 1220 ms (was + the 2000 ms bucket, max 2549 ms). +- The **gateway peaked at ~2 cores** (≈196 % on the 30 s gauge) — now comfortably **under + the 3-core cap**, so it no longer throttles. 
backend ~1 core, postgres ~1.3 cores. +- **tempo peaked at ~1.27 GiB** — under the new 2 GiB cap (it would have OOM-ed at 1 GiB). +- Drop-finished still holds (`game_finished` on chat 41/42); the limiter still rejects + 99.97 % of the hammer at p99 2 ms; `stream errors: 0`. + +A separate **burst stress** (a single 100 → 500 jump — 400 players connecting at once) +**pegged the gateway at 3 cores** (≈296 % sustained) and pushed `game.state` +`transport_error` to 9.27 %. The gateway is **connection-CPU-bound and bursty**: average +load is ~1 core, but a mass-simultaneous connection storm saturates whatever single-node +cap it is given. Real arrivals are gradual (the canonical run), where 3 cores has +headroom; the lever for a true arrival spike is **horizontal scaling**, not more cores per +node — carried into the prod recommendation below. + +## Prod-sizing recommendation (Stage 18) + +The contour is **CPU-bound and gateway-led** at 500 concurrent players. Carry these to the +prod contour env (the same compose, `PROD_*` values): + +- **gateway: ≥ 3 cores** per ~500 concurrent players, `GOMAXPROCS` pinned to the limit — + it scales with the **connection count**, not just the request rate; beyond one node's + worth, scale the gateway **horizontally** rather than vertically. +- **backend: ~1–2 cores**, pool 40 — comfortable; the work is light per request. +- **postgres: ~2 cores / ≥ 512 MiB** — ~1.4 cores at 500 players, 100 % cache hit. +- **tempo: ≥ 2 GiB**; the Go services run under ~170 MiB (256 MiB would suffice, 512 is + safe); pin `GOMAXPROCS` to each CPU limit; keep json-file rotation. +- Memory is not the constraint anywhere; CPU is. + +## Re-running + +See [`README.md`](README.md). Briefly, from the repo root: + +```sh +docker build -f loadtest/Dockerfile -t scrabble-loadtest . 
+docker run --rm --cpus=3 --name scrabble-loadtest --network scrabble-internal \ + -e POSTGRES_PASSWORD=… scrabble-loadtest run --reset --cleanup +``` + +The harness stays in the repo for future repeats. From 225188e4b5f582b851b584341e7002774ada32e8 Mon Sep 17 00:00:00 2001 From: Ilia Denisov Date: Thu, 11 Jun 2026 11:32:09 +0200 Subject: [PATCH 6/6] R7: add a VPS/VDS sizing table (min/avg/max) to the trip report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A practical single-host ordering guide — CPU cores, RAM, disk at three tiers — grounded in the R7 profile (~5.5 cores / ~2.5 GiB peak at 500 players) and the measured on-disk footprint (images ~2.4 GB; Tempo 3.1 GB at 72 h; the game DB 23 MiB and growing). Notes which knobs move disk (Tempo/Prometheus retention, Postgres growth) and that the gateway scales horizontally past one host. --- loadtest/REPORT-R7.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/loadtest/REPORT-R7.md b/loadtest/REPORT-R7.md index 1c9baf8..2f03999 100644 --- a/loadtest/REPORT-R7.md +++ b/loadtest/REPORT-R7.md @@ -172,6 +172,33 @@ prod contour env (the same compose, `PROD_*` values): safe); pin `GOMAXPROCS` to each CPU limit; keep json-file rotation. - Memory is not the constraint anywhere; CPU is. +### VPS / VDS sizing (single-host contour) + +The whole contour (the app + the observability stack) runs on one host via +`docker-compose`. The tiers below are grounded in the R7 profile (**≈5.5 cores / ≈2.5 GiB +RAM peak at 500 concurrent players**; ≈0.5 GiB idle) and the **measured** on-disk +footprint: prod images ≈2.4 GB; the Tempo volume **3.1 GB at 72 h** retention; Prometheus +≈1–2 GB at 15 d; the game DB 23 MiB and growing with history. CPU and disk grow; RAM has +the most slack. 
+ | tier | CPU | RAM | disk | handles | +|------|-----|-----|------|---------| +| **Minimum** | 2 cores | 2 GiB | 20 GiB | up to ~150 concurrent; lower the compose limits (gateway 1.5 / backend·postgres 1 / tempo 1 GiB) to fit the box | +| **Average** (reasonable load) | 4 cores | 4 GiB | 40 GiB | ~300–400 concurrent comfortably; the tested 500 with occasional gateway burst-throttling | +| **Maximum** (worry-free) | 8 cores | 8 GiB | 80 GiB | 500+ concurrent with full gateway burst headroom (its 3-core cap) + room to grow; the compose limits fit as-is | + +- The per-service limits in `docker-compose.yml` are tuned for the **Average/Maximum** + target (the gateway alone caps at 3 cores). On the **Minimum** tier, scale them down to + match the host, or the caps over-subscribe it. +- **Disk is dominated by observability retention + DB growth.** Tempo (72 h traces) and + Prometheus (15 d metrics) are the main levers — shorten the windows (or move Tempo to + object storage) to cut disk; Postgres grows with game history, so budget for months of + it; container logs are already capped (json-file 10m × 3 ≈ 30 MiB each). +- **RAM** rarely binds: the contour peaks ≈2.5 GiB at 500 players and the sum of all + configured limits is ≈5.6 GiB, so 8 GiB never strains. +- Beyond one host's worth of players, scale the **gateway horizontally** (it is + connection-CPU-bound) rather than ordering an ever-bigger box. + ## Re-running See [`README.md`](README.md). Briefly, from the repo root: