R2: load-test harness + contour resource observability

New scrabble/loadtest module (the pre-release stress harness): seeds 1000 guest + 10000 durable accounts with pre-created sessions directly in Postgres (token hash matches backend/internal/session), drives virtual players through the edge protocol (real 2-4p games assembled via invitations, mid-ranked legal moves generated locally by the embedded scrabble-solver — the edge carries no board, so the client replays history), plus nudge/chat/check-word/draft/profile/stats and a gateway-hammer that verifies the rate limiter. Prints a trip-report summary (per-op latency percentiles, result codes, live-event tally). Go unit tests cover the pure pieces; the DAWG-backed move test runs under BACKEND_DICT_DIR. Contour: add cAdvisor + postgres_exporter + a 'Scrabble - Resources' Grafana dashboard and the two Prometheus scrape jobs, for the R2/R7 stress-run resource baseline. CI: gate ./loadtest/... (path filter + vet/build/test). Docs: TESTING, ARCHITECTURE, project CLAUDE repo layout.
2026-06-09 23:45:24 +02:00
parent bf3ee62711
commit aa137e3558
27 changed files with 2554 additions and 7 deletions
@@ -0,0 +1,204 @@
+// Package report collects per-operation latency, result-code and live-event counts
+// across all virtual players and renders a text summary for the R2 trip report. It
+// is safe for concurrent use. Latencies go into fixed buckets (a Prometheus-style
+// histogram) so percentiles cost no per-sample memory at load-test scale.
+package report
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+)
+
+// boundsMs are the upper bounds (milliseconds) of the latency histogram buckets; a
+// trailing overflow bucket catches anything slower.
+var boundsMs = []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000}
+
+// opStat is the running tally for a single operation name. It carries no lock of
+// its own.
+type opStat struct {
+	// count is the number of recorded calls.
+	count int64
+	// sumNs and maxNs are the total and the largest recorded latency, in nanoseconds.
+	sumNs, maxNs int64
+	// codes counts calls per result code.
+	codes map[string]int64
+	// buckets is the latency histogram: one slot per boundsMs entry plus a final
+	// overflow slot.
+	buckets []int64
+}
+
+// newOpStat returns a zeroed opStat whose code map is allocated and whose histogram
+// has one slot per boundsMs entry plus the overflow slot.
+func newOpStat() *opStat {
+	s := new(opStat)
+	s.codes = make(map[string]int64)
+	s.buckets = make([]int64, len(boundsMs)+1)
+	return s
+}
+
+// record folds one call into the tally: it bumps the call counter, adds d to the
+// latency sum, raises the maximum if d exceeds it, bumps the counter for code, and
+// increments the histogram bucket that d falls into.
+func (s *opStat) record(code string, d time.Duration) {
+	ns := int64(d)
+	s.count++
+	s.sumNs += ns
+	if s.maxNs < ns {
+		s.maxNs = ns
+	}
+	s.codes[code]++
+
+	// The first bound that is >= the latency picks the bucket; a latency above every
+	// bound yields len(boundsMs), which is the overflow slot.
+	elapsedMs := float64(d) / float64(time.Millisecond)
+	s.buckets[sort.SearchFloat64s(boundsMs, elapsedMs)]++
+}
+
+// quantile estimates the q-th quantile (0<q<1) of the recorded latencies and returns
+// it as text: the millisecond upper bound of the histogram bucket that holds the
+// sample of rank q*count rounded to the nearest integer (at least rank 1). It
+// returns "-" when nothing has been recorded. A rank that lands in the overflow
+// bucket renders as ">" followed by the last entry of boundsMs (">5000" with the
+// current bounds); the label is derived from boundsMs so it cannot go stale when
+// the bounds are retuned.
+func (s *opStat) quantile(q float64) string {
+	if s.count == 0 {
+		return "-"
+	}
+	target := int64(q*float64(s.count) + 0.5)
+	if target < 1 {
+		target = 1
+	}
+	// idx stays at the overflow slot unless an earlier bucket reaches the target rank.
+	idx := len(boundsMs)
+	var cum int64
+	for i, n := range s.buckets {
+		cum += n
+		if cum >= target {
+			idx = i
+			break
+		}
+	}
+	if idx < len(boundsMs) {
+		return fmt.Sprintf("%g", boundsMs[idx])
+	}
+	if len(boundsMs) == 0 {
+		// No bounds are configured, so there is no edge to report against.
+		return "-"
+	}
+	return fmt.Sprintf(">%g", boundsMs[len(boundsMs)-1])
+}
+
+// Recorder holds everything measured during a run: per-operation stats, live-event
+// counts, the stream-error count and the time the run started.
+type Recorder struct {
+	// mu is held by every method while it reads or writes the fields below.
+	mu sync.Mutex
+	// ops maps an operation name to its stats.
+	ops map[string]*opStat
+	// events counts received live events by kind.
+	events map[string]int64
+	// streamErrs counts Subscribe stream errors.
+	streamErrs int64
+	// start is the run-clock origin set by New; Summary measures elapsed time from it.
+	start time.Time
+}
+
+// New creates a Recorder with empty tallies and stamps the current time as the
+// start of the run.
+func New() *Recorder {
+	rec := new(Recorder)
+	rec.ops = make(map[string]*opStat)
+	rec.events = make(map[string]int64)
+	rec.start = time.Now()
+	return rec
+}
+
+// Record adds one call of operation op to the tally, filed under its
+// domain/transport result code with latency d. The per-operation stats are created
+// the first time an operation name is seen.
+func (r *Recorder) Record(op, code string, d time.Duration) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if existing := r.ops[op]; existing != nil {
+		existing.record(code, d)
+		return
+	}
+	fresh := newOpStat()
+	r.ops[op] = fresh
+	fresh.record(code, d)
+}
+
+// Event counts one received live event under its kind.
+func (r *Recorder) Event(kind string) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	r.events[kind] += 1
+}
+
+// StreamErr counts one error on a Subscribe stream, i.e. a drop that the player
+// reconnects from.
+func (r *Recorder) StreamErr() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	r.streamErrs += 1
+}
+
+// Totals reports, across every operation, how many calls were recorded and how many
+// of them finished with a result code other than "ok". It feeds the pass/fail
+// summary.
+func (r *Recorder) Totals() (calls, nonOK int64) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	for _, stat := range r.ops {
+		calls += stat.count
+		// Every code except "ok" contributes to the non-ok total.
+		for code, hits := range stat.codes {
+			if code == "ok" {
+				continue
+			}
+			nonOK += hits
+		}
+	}
+	return calls, nonOK
+}
+
+// Summary builds the plain-text run report. It contains, in order: the elapsed
+// time; one row per operation (sorted by name) with call count, calls per second,
+// p50/p90/p99 bucket estimates, maximum latency and the result-code breakdown; the
+// overall call count, throughput and non-"ok" share; the live-event counts sorted
+// by kind (omitted when no event was seen); and the stream-error count.
+func (r *Recorder) Summary() string {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	// Guard the divisions below against a zero or negative elapsed time.
+	secs := time.Since(r.start).Seconds()
+	if secs <= 0 {
+		secs = 1
+	}
+
+	opNames := make([]string, 0, len(r.ops))
+	for name := range r.ops {
+		opNames = append(opNames, name)
+	}
+	sort.Strings(opNames)
+
+	var out strings.Builder
+	fmt.Fprintf(&out, "elapsed: %.0fs\n\n", secs)
+	fmt.Fprintf(&out, "%-20s %8s %8s %7s %7s %7s %7s  %s\n",
+		"operation", "count", "req/s", "p50ms", "p90ms", "p99ms", "maxms", "codes")
+
+	var calls, failed int64
+	for _, name := range opNames {
+		stat := r.ops[name]
+		calls += stat.count
+		for code, hits := range stat.codes {
+			if code != "ok" {
+				failed += hits
+			}
+		}
+		perSec := float64(stat.count) / secs
+		maxMs := float64(stat.maxNs) / float64(time.Millisecond)
+		fmt.Fprintf(&out, "%-20s %8d %8.1f %7s %7s %7s %7.0f  %s\n",
+			name, stat.count, perSec,
+			stat.quantile(0.50), stat.quantile(0.90), stat.quantile(0.99),
+			maxMs, codeBreakdown(stat.codes))
+	}
+
+	fmt.Fprintf(&out, "\ntotal calls: %d, throughput: %.1f req/s\n", calls, float64(calls)/secs)
+	var failPct float64
+	if calls > 0 {
+		failPct = 100 * float64(failed) / float64(calls)
+	}
+	fmt.Fprintf(&out, "non-ok results: %d (%.2f%%)\n", failed, failPct)
+
+	if len(r.events) > 0 {
+		kinds := make([]string, 0, len(r.events))
+		for kind := range r.events {
+			kinds = append(kinds, kind)
+		}
+		sort.Strings(kinds)
+		out.WriteString("\nlive events:\n")
+		for _, kind := range kinds {
+			fmt.Fprintf(&out, "  %-16s %d\n", kind, r.events[kind])
+		}
+	}
+	fmt.Fprintf(&out, "stream errors: %d\n", r.streamErrs)
+	return out.String()
+}
+
+// codeBreakdown formats result-code counts as space-separated "code:count" pairs,
+// e.g. "ok:1234 not_your_turn:5", ordered by descending count with ties broken by
+// code name. An empty map yields the empty string.
+func codeBreakdown(codes map[string]int64) string {
+	names := make([]string, 0, len(codes))
+	for name := range codes {
+		names = append(names, name)
+	}
+	sort.Slice(names, func(a, b int) bool {
+		na, nb := codes[names[a]], codes[names[b]]
+		if na == nb {
+			return names[a] < names[b]
+		}
+		return na > nb
+	})
+
+	var out strings.Builder
+	for i, name := range names {
+		if i > 0 {
+			out.WriteByte(' ')
+		}
+		fmt.Fprintf(&out, "%s:%d", name, codes[name])
+	}
+	return out.String()
+}