R2: load-test harness + contour resource observability
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 9s
CI / integration (pull_request) Successful in 11s
CI / ui (pull_request) Successful in 38s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Failing after 3s

New scrabble/loadtest module (the pre-release stress harness): seeds 1000 guest +
10000 durable accounts with pre-created sessions directly in Postgres (token hash
matches backend/internal/session), drives virtual players through the edge protocol
(real 2-4p games assembled via invitations, mid-ranked legal moves generated locally
by the embedded scrabble-solver — the edge carries no board, so the client replays
history), plus nudge/chat/check-word/draft/profile/stats and a gateway-hammer that
verifies the rate limiter. Prints a trip-report summary (per-op latency percentiles,
result codes, live-event tally). Go unit tests cover the pure pieces; the DAWG-backed
move test runs under BACKEND_DICT_DIR.

Contour: add cAdvisor + postgres_exporter + a 'Scrabble - Resources' Grafana
dashboard and the two Prometheus scrape jobs, for the R2/R7 stress-run resource
baseline.

CI: gate ./loadtest/... (path filter + vet/build/test). Docs: TESTING, ARCHITECTURE,
project CLAUDE repo layout.
This commit is contained in:
Ilia Denisov
2026-06-09 23:45:24 +02:00
parent bf3ee62711
commit aa137e3558
27 changed files with 2554 additions and 7 deletions
+193
View File
@@ -0,0 +1,193 @@
// Command loadtest is the R2 reusable load harness. It seeds a large account
// population with pre-created sessions directly in the backend Postgres, then drives
// virtual players through the gateway edge protocol (real games assembled via
// invitations, legal moves generated locally by the embedded solver), and a
// gateway-hammer that verifies the rate limiter. It prints a trip-report summary.
//
// Run it as a one-shot container on the contour's docker network so it reaches
// postgres:5432 and gateway:8081 directly:
//
//	docker run --rm --network scrabble-internal \
//	  -e POSTGRES_PASSWORD=... -v /path/to/dawg:/dawg scrabble-loadtest run
package main
import (
	"context"
	"errors"
	"flag"
	"fmt"
	"log/slog"
	"net/url"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"scrabble/loadtest/internal/edge"
	"scrabble/loadtest/internal/moves"
	"scrabble/loadtest/internal/report"
	"scrabble/loadtest/internal/scenario"
	"scrabble/loadtest/internal/seed"
)
// main is the CLI entry point. It builds a text logger on stderr, derives a
// context that is cancelled on SIGINT/SIGTERM, and dispatches os.Args[1] to the
// matching subcommand with the remaining arguments.
//
// Exit status: 2 when the subcommand is missing or unknown (after printing the
// usage text), 1 when the subcommand returns an error, 0 otherwise.
func main() {
	logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo}))

	if len(os.Args) < 2 {
		usage()
		os.Exit(2)
	}

	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer cancel()

	name, rest := os.Args[1], os.Args[2:]

	// Resolve the subcommand first; a nil command means the name was not recognised.
	var command func(context.Context, *slog.Logger, []string) error
	switch name {
	case "run":
		command = cmdRun
	case "cleanup":
		command = cmdCleanup
	}
	if command == nil {
		usage()
		os.Exit(2)
	}

	if err := command(ctx, logger, rest); err != nil {
		logger.Error("loadtest failed", "cmd", name, "err", err)
		os.Exit(1)
	}
}
// usage writes the command synopsis followed by a one-line description of each
// subcommand to stderr.
func usage() {
	lines := []string{
		"usage: loadtest <run|cleanup> [flags]",
		" run seed accounts, drive the realistic ramp + gateway-hammer, print the report",
		" cleanup delete everything the harness seeded (matched by marker)",
	}
	for _, line := range lines {
		fmt.Fprintln(os.Stderr, line)
	}
}
// cmdRun implements the "run" subcommand. It parses the run flags, opens the
// moves registry from the DAWG directory and the seeder from the DSN, optionally
// deletes prior harness rows (-reset), seeds the account pool, drives the
// realistic ramp followed by the gateway-hammer, and prints the report to stdout.
//
// A ramp that ends with context.Canceled is treated as a normal early stop. Any
// other ramp failure skips the hammer but still prints the report gathered so far
// and still honours -cleanup, so a failed run neither discards its measurements
// nor leaves seeded rows behind; the ramp error is returned afterwards (joined
// with the cleanup error if that fails too).
//
// Every returned error is wrapped with the stage that produced it, because the
// caller logs a single line and would otherwise not show which stage failed. The
// DSN is deliberately kept out of error messages since it can embed a password.
func cmdRun(ctx context.Context, log *slog.Logger, args []string) error {
	fs := flag.NewFlagSet("run", flag.ExitOnError)
	gateway := fs.String("gateway", env("LOADTEST_GATEWAY_URL", "http://gateway:8081"), "gateway base URL")
	dsn := fs.String("dsn", env("LOADTEST_DSN", defaultDSN()), "backend Postgres DSN")
	dawgDir := fs.String("dawg", env("LOADTEST_DAWG_DIR", "/dawg"), "directory holding the committed *.dawg files")
	durable := fs.Int("durable", 10000, "durable accounts to seed")
	guest := fs.Int("guest", 1000, "guest accounts to seed")
	stepsStr := fs.String("steps", "50,200,500", "comma-separated concurrent-player ramp steps")
	stepDur := fs.Duration("step-dur", 12*time.Minute, "hold time per ramp step")
	gpp := fs.Int("games-per-player", 0, "target concurrent games per player (0 => random 3..5)")
	tick := fs.Duration("tick", 800*time.Millisecond, "per-player operation cadence")
	secProb := fs.Float64("secondary-prob", 0.08, "chance per tick of a non-move operation")
	hammerWorkers := fs.Int("hammer-workers", 20, "gateway-hammer concurrent callers (0 disables)")
	hammerDur := fs.Duration("hammer-dur", 15*time.Second, "gateway-hammer duration")
	reset := fs.Bool("reset", false, "delete prior harness rows before seeding")
	doCleanup := fs.Bool("cleanup", false, "delete harness rows after the run")
	if err := fs.Parse(args); err != nil {
		return err
	}

	steps, err := parseSteps(*stepsStr)
	if err != nil {
		return fmt.Errorf("parsing -steps: %w", err)
	}

	reg, err := moves.Open(*dawgDir)
	if err != nil {
		return fmt.Errorf("opening dawg directory %q: %w", *dawgDir, err)
	}
	defer reg.Close()

	sd, err := seed.New(ctx, *dsn)
	if err != nil {
		return fmt.Errorf("opening seeder: %w", err)
	}
	defer sd.Close()

	if *reset {
		n, err := sd.Cleanup(ctx)
		if err != nil {
			return fmt.Errorf("resetting prior harness rows: %w", err)
		}
		log.Info("reset", "accounts_removed", n)
	}

	log.Info("seeding", "durable", *durable, "guest", *guest)
	pool, err := sd.Seed(ctx, *durable, *guest)
	if err != nil {
		return fmt.Errorf("seeding accounts: %w", err)
	}
	log.Info("seeded", "durable", len(pool.Durables), "guest", len(pool.Guests))

	rec := report.New()
	drv := scenario.NewDriver(edge.New(*gateway), reg, rec, log)
	cfg := scenario.RealisticConfig{
		Steps: steps, StepDur: *stepDur, GamesPerPlayer: *gpp,
		Tick: *tick, SecondaryProb: *secProb,
	}

	// Hold on to a real ramp failure instead of returning immediately: the report
	// and the requested cleanup below must still happen.
	runErr := drv.RunRealistic(ctx, pool, cfg)
	switch {
	case errors.Is(runErr, context.Canceled):
		runErr = nil // cancellation ends the ramp early; it is not a failure
	case runErr != nil:
		runErr = fmt.Errorf("realistic ramp: %w", runErr)
	}

	// The hammer only follows a ramp that did not fail and was not interrupted.
	if runErr == nil && *hammerWorkers > 0 && ctx.Err() == nil && len(pool.Durables) > 0 {
		drv.Hammer(ctx, pool.Durables[0], scenario.HammerConfig{Workers: *hammerWorkers, Duration: *hammerDur})
	}

	fmt.Println("\n==== R2 load-test report ====")
	fmt.Println(rec.Summary())

	if *doCleanup {
		// Detach from cancellation so cleanup still runs after SIGINT/SIGTERM.
		n, err := sd.Cleanup(context.WithoutCancel(ctx))
		if err != nil {
			return errors.Join(runErr, fmt.Errorf("cleaning up after run: %w", err))
		}
		log.Info("cleanup", "accounts_removed", n)
	}
	return runErr
}
// cmdCleanup implements the "cleanup" subcommand: it opens the seeder against the
// DSN given by -dsn (default: LOADTEST_DSN, else the POSTGRES_*-derived DSN), runs
// its Cleanup, and logs the value Cleanup returns as "accounts_removed".
//
// Returned errors are wrapped with the stage that failed. The DSN is kept out of
// the message because it can embed a password.
func cmdCleanup(ctx context.Context, log *slog.Logger, args []string) error {
	fs := flag.NewFlagSet("cleanup", flag.ExitOnError)
	dsn := fs.String("dsn", env("LOADTEST_DSN", defaultDSN()), "backend Postgres DSN")
	if err := fs.Parse(args); err != nil {
		return err
	}

	sd, err := seed.New(ctx, *dsn)
	if err != nil {
		return fmt.Errorf("opening seeder: %w", err)
	}
	defer sd.Close()

	n, err := sd.Cleanup(ctx)
	if err != nil {
		return fmt.Errorf("deleting harness rows: %w", err)
	}
	log.Info("cleanup", "accounts_removed", n)
	return nil
}
// defaultDSN builds the backend Postgres DSN from the standard POSTGRES_* env the
// contour uses, pinning the backend schema.
func defaultDSN() string {
return fmt.Sprintf("postgres://%s:%s@%s:5432/%s?sslmode=disable&search_path=backend",
env("POSTGRES_USER", "scrabble"), os.Getenv("POSTGRES_PASSWORD"),
env("POSTGRES_HOST", "postgres"), env("POSTGRES_DB", "scrabble"))
}
// env looks up the environment variable key and falls back to def when the
// variable is unset or set to the empty string.
func env(key, def string) string {
	value := os.Getenv(key)
	if value == "" {
		return def
	}
	return value
}
// parseSteps parses a comma-separated list of positive ramp step sizes.
//
// Whitespace around each element is ignored. It returns the steps in input order,
// or an error:
//   - "no ramp steps" when s is empty or only whitespace;
//   - "invalid ramp steps ..." naming the cause when an element is not an integer
//     (including an empty element such as in "1,,2") or is not positive.
func parseSteps(s string) ([]int, error) {
	// strings.Split never returns an empty slice for a non-empty separator, so
	// the "nothing given" case has to be detected before splitting.
	if strings.TrimSpace(s) == "" {
		return nil, fmt.Errorf("no ramp steps")
	}

	parts := strings.Split(s, ",")
	steps := make([]int, 0, len(parts))
	for _, p := range parts {
		n, err := strconv.Atoi(strings.TrimSpace(p))
		if err != nil {
			return nil, fmt.Errorf("invalid ramp steps %q: %w", s, err)
		}
		if n <= 0 {
			return nil, fmt.Errorf("invalid ramp steps %q: step %d is not positive", s, n)
		}
		steps = append(steps, n)
	}
	return steps, nil
}