R2: load-test harness + contour resource observability

New scrabble/loadtest module (the pre-release stress harness): seeds 1000 guest + 10000 durable accounts with pre-created sessions directly in Postgres (token hash matches backend/internal/session), drives virtual players through the edge protocol (real 2-4p games assembled via invitations, mid-ranked legal moves generated locally by the embedded scrabble-solver — the edge carries no board, so the client replays history), plus nudge/chat/check-word/draft/profile/stats and a gateway-hammer that verifies the rate limiter. Prints a trip-report summary (per-op latency percentiles, result codes, live-event tally). Go unit tests cover the pure pieces; the DAWG-backed move test runs under BACKEND_DICT_DIR. Contour: add cAdvisor + postgres_exporter + a 'Scrabble - Resources' Grafana dashboard and the two Prometheus scrape jobs, for the R2/R7 stress-run resource baseline. CI: gate ./loadtest/... (path filter + vet/build/test). Docs: TESTING, ARCHITECTURE, project CLAUDE repo layout.
2026-06-09 23:45:24 +02:00
parent bf3ee62711
commit aa137e3558
27 changed files with 2554 additions and 7 deletions
@@ -0,0 +1,170 @@
+package scenario
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"time"
+
+	"scrabble/loadtest/internal/edge"
+	"scrabble/loadtest/internal/moves"
+	"scrabble/loadtest/internal/seed"
+)
+
+// Game is one assembled match: its id, variant and members in seat order (Members[0]
+// is the inviter, seat 0).
+type Game struct {
+	ID      string
+	Variant string
+	Members []seed.Account
+}
+
+// seatOf returns the seat index of accountID in the game, or -1.
+func (g *Game) seatOf(accountID string) int {
+	for i, m := range g.Members {
+		if m.ID.String() == accountID {
+			return i
+		}
+	}
+	return -1
+}
+
+// assembleCohort forms games among a cohort of active players via the invitation
+// flow, aiming for gamesPerPlayer (3-5) concurrent games per player with 2-4 players
+// each. It returns the games it managed to start. Failures are logged and skipped so
+// a partial assembly still drives load.
+func (d *Driver) assembleCohort(ctx context.Context, cohort []seed.Account, gamesPerPlayer int, rng *rand.Rand) []*Game {
+	if len(cohort) < 2 {
+		return nil
+	}
+	gamesOf := make(map[string]int, len(cohort))
+	var games []*Game
+	for i := range cohort {
+		inviter := cohort[i]
+		target := 3 + rng.Intn(3) // 3..5
+		if gamesPerPlayer > 0 {
+			target = gamesPerPlayer
+		}
+		for gamesOf[inviter.ID.String()] < target {
+			members := pickMembers(cohort, inviter, rng)
+			if len(members) < 2 {
+				break
+			}
+			variant := moves.Variants()[rng.Intn(len(moves.Variants()))]
+			g, err := d.assemble(ctx, members, variant)
+			if err != nil {
+				d.log.Debug("assemble game", "err", err)
+				break
+			}
+			games = append(games, g)
+			for _, m := range members {
+				gamesOf[m.ID.String()]++
+			}
+		}
+	}
+	return games
+}
+
+// pickMembers builds a 2-4 player group led by inviter, drawing distinct others from
+// the cohort at random.
+func pickMembers(cohort []seed.Account, inviter seed.Account, rng *rand.Rand) []seed.Account {
+	size := 2 + rng.Intn(3) // 2..4
+	members := []seed.Account{inviter}
+	seen := map[string]bool{inviter.ID.String(): true}
+	for attempts := 0; len(members) < size && attempts < 4*size; attempts++ {
+		cand := cohort[rng.Intn(len(cohort))]
+		if seen[cand.ID.String()] {
+			continue
+		}
+		seen[cand.ID.String()] = true
+		members = append(members, cand)
+	}
+	return members
+}
+
+// assemble runs the invitation flow for one game: the inviter (members[0]) invites
+// the rest, each invitee accepts the pending invitation, and the completing accept
+// starts the game, which is then located in the inviter's game list.
+func (d *Driver) assemble(ctx context.Context, members []seed.Account, variant string) (*Game, error) {
+	inviter := members[0]
+	inviteeIDs := make([]string, len(members)-1)
+	for i, m := range members[1:] {
+		inviteeIDs[i] = m.ID.String()
+	}
+
+	t0 := time.Now()
+	code, err := d.edge.CreateInvitation(ctx, inviter.Token, inviteeIDs, variant)
+	d.rec.Record("invitation.create", code, time.Since(t0))
+	if err != nil || code != "ok" {
+		return nil, fmt.Errorf("invitation.create: %s", code)
+	}
+
+	for _, invitee := range members[1:] {
+		t0 = time.Now()
+		list, lc, err := d.edge.ListInvitations(ctx, invitee.Token)
+		d.rec.Record("invitation.list", lc, time.Since(t0))
+		if err != nil || lc != "ok" {
+			return nil, fmt.Errorf("invitation.list: %s", lc)
+		}
+		invID := findPending(list, inviter.ID.String())
+		if invID == "" {
+			return nil, fmt.Errorf("no pending invitation from %s", inviter.ID)
+		}
+		t0 = time.Now()
+		ac, err := d.edge.AcceptInvitation(ctx, invitee.Token, invID)
+		d.rec.Record("invitation.accept", ac, time.Since(t0))
+		if err != nil || ac != "ok" {
+			return nil, fmt.Errorf("invitation.accept: %s", ac)
+		}
+	}
+
+	t0 = time.Now()
+	games, gc, err := d.edge.GamesList(ctx, inviter.Token)
+	d.rec.Record("games.list", gc, time.Since(t0))
+	if err != nil || gc != "ok" {
+		return nil, fmt.Errorf("games.list: %s", gc)
+	}
+	ids := make([]string, len(members))
+	for i, m := range members {
+		ids[i] = m.ID.String()
+	}
+	gameID := findGame(games, ids)
+	if gameID == "" {
+		return nil, fmt.Errorf("started game not found for %d members", len(members))
+	}
+	return &Game{ID: gameID, Variant: variant, Members: members}, nil
+}
+
+// findPending returns the id of a pending invitation from inviterID, or "".
+func findPending(list []edge.Invitation, inviterID string) string {
+	for _, inv := range list {
+		if inv.InviterID == inviterID && inv.Status == "pending" {
+			return inv.ID
+		}
+	}
+	return ""
+}
+
+// findGame returns the id of the active game whose seat set equals memberIDs, or "".
+func findGame(games []edge.Game, memberIDs []string) string {
+	want := make(map[string]bool, len(memberIDs))
+	for _, id := range memberIDs {
+		want[id] = true
+	}
+	for _, g := range games {
+		if !g.Active() || len(g.Seats) != len(memberIDs) {
+			continue
+		}
+		match := true
+		for _, s := range g.Seats {
+			if !want[s] {
+				match = false
+				break
+			}
+		}
+		if match {
+			return g.ID
+		}
+	}
+	return ""
+}
@@ -0,0 +1,45 @@
+package scenario
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"scrabble/loadtest/internal/seed"
+)
+
+// HammerConfig parameterises the gateway-hammer: how many concurrent callers and for
+// how long to deliberately exceed the per-user rate limit from a single account.
+type HammerConfig struct {
+	Workers  int
+	Duration time.Duration
+}
+
+// DefaultHammer returns a hammer that comfortably exceeds the 300/min per-user limit.
+func DefaultHammer() HammerConfig {
+	return HammerConfig{Workers: 20, Duration: 15 * time.Second}
+}
+
+// Hammer drives games.list from a single account far above the per-user rate limit to
+// verify the limiter holds — rejections surface as the "rate_limited" code — and to
+// measure its cost. Every call is recorded under "hammer:games.list" so the report
+// shows the ok/rate_limited split and the rejection latency separately from the
+// realistic traffic.
+func (d *Driver) Hammer(ctx context.Context, acc seed.Account, cfg HammerConfig) {
+	runCtx, cancel := context.WithTimeout(ctx, cfg.Duration)
+	defer cancel()
+	d.log.Info("gateway-hammer", "workers", cfg.Workers, "duration", cfg.Duration)
+	var wg sync.WaitGroup
+	for w := 0; w < cfg.Workers; w++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for runCtx.Err() == nil {
+				t0 := time.Now()
+				_, code, _ := d.edge.GamesList(runCtx, acc.Token)
+				d.rec.Record("hammer:games.list", code, time.Since(t0))
+			}
+		}()
+	}
+	wg.Wait()
+}
@@ -0,0 +1,241 @@
+// Package scenario drives virtual players against the gateway edge protocol: it
+// assembles real games through the invitation flow, then runs each player's turn
+// loop (poll state, replay history, generate a legal move with the embedded solver,
+// submit it) plus a fraction of secondary operations. It exposes the moderate
+// realistic ramp agreed for the R2 early pass and a separate gateway-hammer.
+package scenario
+
+import (
+	"context"
+	"log/slog"
+	"math/rand"
+	"sync"
+	"time"
+
+	"scrabble/loadtest/internal/edge"
+	"scrabble/loadtest/internal/moves"
+	"scrabble/loadtest/internal/report"
+	"scrabble/loadtest/internal/seed"
+)
+
+// Driver ties the edge client, the local move generator and the run recorder
+// together. All three are safe for concurrent use by many player goroutines.
+type Driver struct {
+	edge  *edge.Client
+	moves *moves.Registry
+	rec   *report.Recorder
+	log   *slog.Logger
+}
+
+// NewDriver builds a Driver.
+func NewDriver(c *edge.Client, m *moves.Registry, rec *report.Recorder, log *slog.Logger) *Driver {
+	return &Driver{edge: c, moves: m, rec: rec, log: log}
+}
+
+// RealisticConfig parameterises the under-the-limit ramp.
+type RealisticConfig struct {
+	Steps          []int         // concurrent active players per step (cumulative)
+	StepDur        time.Duration // hold time per step
+	GamesPerPlayer int           // target concurrent games per player; 0 => random 3..5
+	Tick           time.Duration // per-player operation cadence (keeps a player under the per-user limit)
+	SecondaryProb  float64       // chance per tick of a non-move operation
+}
+
+// DefaultRealistic returns the moderate ramp agreed for the R2 early pass: 50 -> 200
+// -> 500 concurrent players, ~12 minutes per step, ~1 op/s per player.
+func DefaultRealistic() RealisticConfig {
+	return RealisticConfig{
+		Steps:         []int{50, 200, 500},
+		StepDur:       12 * time.Minute,
+		Tick:          800 * time.Millisecond,
+		SecondaryProb: 0.08,
+	}
+}
+
+// RunRealistic runs the staged ramp. Each step activates more players (drawn from the
+// seeded pool), assembles a cohort of games for them and starts their turn loops; the
+// loops run until the whole ramp ends. Players from earlier steps keep playing, so
+// load is cumulative.
+func (d *Driver) RunRealistic(ctx context.Context, pool *seed.Pool, cfg RealisticConfig) error {
+	players := shuffledPool(pool)
+	runCtx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	var wg sync.WaitGroup
+	activated := 0
+	for si, target := range cfg.Steps {
+		if target > len(players) {
+			target = len(players)
+		}
+		cohort := players[activated:target]
+		activated = target
+		if len(cohort) >= 2 {
+			rng := rand.New(rand.NewSource(time.Now().UnixNano() + int64(si)))
+			games := d.assembleCohort(runCtx, cohort, cfg.GamesPerPlayer, rng)
+			byPlayer := gamesByPlayer(games)
+			d.log.Info("ramp step", "step", si+1, "active", activated, "cohort", len(cohort), "games", len(games))
+			for pi := range cohort {
+				p := cohort[pi]
+				wg.Add(1)
+				go func(p seed.Account, pg []*Game, sd int64) {
+					defer wg.Done()
+					d.playerLoop(runCtx, p, pg, cfg, rand.New(rand.NewSource(sd)))
+				}(p, byPlayer[p.ID.String()], time.Now().UnixNano()+int64(pi))
+			}
+		} else {
+			d.log.Warn("ramp step skipped: cohort too small", "step", si+1, "cohort", len(cohort))
+		}
+		select {
+		case <-time.After(cfg.StepDur):
+		case <-ctx.Done():
+			cancel()
+			wg.Wait()
+			return ctx.Err()
+		}
+	}
+	cancel()
+	wg.Wait()
+	return nil
+}
+
+// playerLoop runs one virtual player: a live-event subscription (loads the push hub,
+// counts events) plus a round-robin turn loop over the player's games.
+func (d *Driver) playerLoop(ctx context.Context, p seed.Account, games []*Game, cfg RealisticConfig, rng *rand.Rand) {
+	go d.subscribeLoop(ctx, p)
+	if len(games) == 0 {
+		<-ctx.Done()
+		return
+	}
+	ticker := time.NewTicker(cfg.Tick)
+	defer ticker.Stop()
+	gi := 0
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			g := games[gi%len(games)]
+			gi++
+			if rng.Float64() < cfg.SecondaryProb {
+				d.secondaryOp(ctx, p, g, rng)
+				continue
+			}
+			d.playTurn(ctx, p, g, rng)
+		}
+	}
+}
+
+// subscribeLoop holds the player's live-event stream open, counting events and
+// reconnecting with a brief backoff after a drop, until the run ends.
+func (d *Driver) subscribeLoop(ctx context.Context, p seed.Account) {
+	for ctx.Err() == nil {
+		err := d.edge.Subscribe(ctx, p.Token, func(e edge.Event) { d.rec.Event(e.Kind) })
+		if ctx.Err() != nil {
+			return
+		}
+		if err != nil {
+			d.rec.StreamErr()
+		}
+		select {
+		case <-ctx.Done():
+			return
+		case <-time.After(time.Second):
+		}
+	}
+}
+
+// playTurn plays one turn in g when it is the player's move: fetch state, replay
+// history, pick a legal move and submit it (or exchange / pass).
+func (d *Driver) playTurn(ctx context.Context, p seed.Account, g *Game, rng *rand.Rand) {
+	seat := g.seatOf(p.ID.String())
+	if seat < 0 {
+		return
+	}
+	t0 := time.Now()
+	st, code, err := d.edge.State(ctx, p.Token, g.ID)
+	d.rec.Record("game.state", code, time.Since(t0))
+	if err != nil || code != "ok" || !st.Game.Active() || st.Game.ToMove != seat {
+		return
+	}
+
+	t0 = time.Now()
+	hist, hc, err := d.edge.History(ctx, p.Token, g.ID)
+	d.rec.Record("game.history", hc, time.Since(t0))
+	if err != nil || hc != "ok" {
+		return
+	}
+
+	action, err := d.moves.Pick(g.Variant, hist, st.Rack, st.BagLen, rng)
+	if err != nil {
+		d.log.Debug("pick move", "variant", g.Variant, "err", err)
+		return
+	}
+	switch action.Kind {
+	case "play":
+		t0 = time.Now()
+		_, c, _ := d.edge.SubmitPlay(ctx, p.Token, g.ID, action.Dir, action.Tiles)
+		d.rec.Record("game.submit_play", c, time.Since(t0))
+	case "exchange":
+		t0 = time.Now()
+		_, c, _ := d.edge.Exchange(ctx, p.Token, g.ID, action.Exchange)
+		d.rec.Record("game.exchange", c, time.Since(t0))
+	default:
+		t0 = time.Now()
+		_, c, _ := d.edge.Pass(ctx, p.Token, g.ID)
+		d.rec.Record("game.pass", c, time.Since(t0))
+	}
+}
+
+// secondaryOp exercises one of the non-move edge operations the plan calls out, so
+// the run touches nudge / chat / check-word / draft / profile / stats too.
+func (d *Driver) secondaryOp(ctx context.Context, p seed.Account, g *Game, rng *rand.Rand) {
+	t0 := time.Now()
+	switch rng.Intn(7) {
+	case 0:
+		c, _ := d.edge.Nudge(ctx, p.Token, g.ID)
+		d.rec.Record("chat.nudge", c, time.Since(t0))
+	case 1:
+		c, _ := d.edge.ChatPost(ctx, p.Token, g.ID, "gg")
+		d.rec.Record("chat.post", c, time.Since(t0))
+	case 2:
+		c, _ := d.edge.CheckWord(ctx, p.Token, g.ID, []byte{0, 1, 2})
+		d.rec.Record("game.check_word", c, time.Since(t0))
+	case 3:
+		c, _ := d.edge.DraftSave(ctx, p.Token, g.ID, `{"rack_order":[],"board_tiles":[]}`)
+		d.rec.Record("draft.save", c, time.Since(t0))
+	case 4:
+		c, _ := d.edge.DraftGet(ctx, p.Token, g.ID)
+		d.rec.Record("draft.get", c, time.Since(t0))
+	case 5:
+		lang := "en"
+		if rng.Intn(2) == 1 {
+			lang = "ru"
+		}
+		c, _ := d.edge.ProfileUpdate(ctx, p.Token, p.Name, lang)
+		d.rec.Record("profile.update", c, time.Since(t0))
+	default:
+		c, _ := d.edge.Stats(ctx, p.Token)
+		d.rec.Record("stats.get", c, time.Since(t0))
+	}
+}
+
+// shuffledPool returns every seeded account in random order, so an active set is a
+// representative mix of durable and guest accounts.
+func shuffledPool(pool *seed.Pool) []seed.Account {
+	all := pool.All()
+	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
+	rng.Shuffle(len(all), func(i, j int) { all[i], all[j] = all[j], all[i] })
+	return all
+}
+
+// gamesByPlayer indexes the assembled games by each member's account id.
+func gamesByPlayer(games []*Game) map[string][]*Game {
+	m := make(map[string][]*Game)
+	for _, g := range games {
+		for _, mem := range g.Members {
+			id := mem.ID.String()
+			m[id] = append(m[id], g)
+		}
+	}
+	return m
+}