R2: load-test harness + contour resource observability
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 9s
CI / integration (pull_request) Successful in 11s
CI / ui (pull_request) Successful in 38s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Failing after 3s
CI / changes (pull_request) Successful in 2s
CI / unit (pull_request) Successful in 9s
CI / integration (pull_request) Successful in 11s
CI / ui (pull_request) Successful in 38s
CI / gate (pull_request) Successful in 0s
CI / deploy (pull_request) Failing after 3s
New scrabble/loadtest module (the pre-release stress harness): seeds 1000 guest + 10000 durable accounts with pre-created sessions directly in Postgres (token hash matches backend/internal/session), drives virtual players through the edge protocol (real 2-4p games assembled via invitations, mid-ranked legal moves generated locally by the embedded scrabble-solver — the edge carries no board, so the client replays history), plus nudge/chat/check-word/draft/profile/stats and a gateway-hammer that verifies the rate limiter. Prints a trip-report summary (per-op latency percentiles, result codes, live-event tally). Go unit tests cover the pure pieces; the DAWG-backed move test runs under BACKEND_DICT_DIR. Contour: add cAdvisor + postgres_exporter + a 'Scrabble - Resources' Grafana dashboard and the two Prometheus scrape jobs, for the R2/R7 stress-run resource baseline. CI: gate ./loadtest/... (path filter + vet/build/test). Docs: TESTING, ARCHITECTURE, project CLAUDE repo layout.
This commit is contained in:
@@ -0,0 +1,204 @@
|
||||
// Package report collects per-operation latency, result-code and live-event counts
|
||||
// across all virtual players and renders a text summary for the R2 trip report. It
|
||||
// is safe for concurrent use. Latencies go into fixed buckets (a Prometheus-style
|
||||
// histogram) so percentiles cost no per-sample memory at load-test scale.
|
||||
package report
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// boundsMs lists, in ascending order, the inclusive upper edge (in milliseconds)
// of each latency histogram bucket. A sample slower than the last edge is counted
// in one extra overflow bucket that sits past the end of this list.
var boundsMs = []float64{
	1, 2, 5,
	10, 20, 50,
	100, 200, 500,
	1000, 2000, 5000,
}
|
||||
|
||||
// opStat is the running tally for a single operation name: how often it was
// called, how long the calls took and which result codes came back. It holds no
// lock of its own.
type opStat struct {
	// count is the number of recorded calls; sumNs and maxNs are the summed and
	// the largest observed latency, both in nanoseconds.
	count, sumNs, maxNs int64

	// codes counts recorded calls per result code.
	codes map[string]int64

	// buckets is the latency histogram: len(boundsMs)+1 counters, the last one
	// being the overflow bucket.
	buckets []int64
}
|
||||
|
||||
func newOpStat() *opStat {
|
||||
return &opStat{codes: map[string]int64{}, buckets: make([]int64, len(boundsMs)+1)}
|
||||
}
|
||||
|
||||
func (s *opStat) record(code string, d time.Duration) {
|
||||
s.count++
|
||||
s.sumNs += int64(d)
|
||||
if int64(d) > s.maxNs {
|
||||
s.maxNs = int64(d)
|
||||
}
|
||||
s.codes[code]++
|
||||
ms := float64(d) / float64(time.Millisecond)
|
||||
i := sort.SearchFloat64s(boundsMs, ms)
|
||||
s.buckets[i]++
|
||||
}
|
||||
|
||||
// quantile estimates the q-th percentile (0<q<1) as the upper bound of the bucket
|
||||
// the q-th sample falls in; the overflow bucket renders as ">5000".
|
||||
func (s *opStat) quantile(q float64) string {
|
||||
if s.count == 0 {
|
||||
return "-"
|
||||
}
|
||||
target := int64(q*float64(s.count) + 0.5)
|
||||
if target < 1 {
|
||||
target = 1
|
||||
}
|
||||
var cum int64
|
||||
for i, n := range s.buckets {
|
||||
cum += n
|
||||
if cum >= target {
|
||||
if i == len(boundsMs) {
|
||||
return ">5000"
|
||||
}
|
||||
return fmt.Sprintf("%g", boundsMs[i])
|
||||
}
|
||||
}
|
||||
return ">5000"
|
||||
}
|
||||
|
||||
// Recorder accumulates the run's measurements.
|
||||
type Recorder struct {
|
||||
mu sync.Mutex
|
||||
ops map[string]*opStat
|
||||
events map[string]int64
|
||||
streamErrs int64
|
||||
start time.Time
|
||||
}
|
||||
|
||||
// New returns an empty Recorder with the run clock started.
|
||||
func New() *Recorder {
|
||||
return &Recorder{ops: map[string]*opStat{}, events: map[string]int64{}, start: time.Now()}
|
||||
}
|
||||
|
||||
// Record logs one operation call: its name, domain/transport code and latency.
|
||||
func (r *Recorder) Record(op, code string, d time.Duration) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
s := r.ops[op]
|
||||
if s == nil {
|
||||
s = newOpStat()
|
||||
r.ops[op] = s
|
||||
}
|
||||
s.record(code, d)
|
||||
}
|
||||
|
||||
// Event logs one received live event of the given kind.
|
||||
func (r *Recorder) Event(kind string) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
r.events[kind]++
|
||||
}
|
||||
|
||||
// StreamErr logs one Subscribe stream error (a drop the player reconnects from).
|
||||
func (r *Recorder) StreamErr() {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
r.streamErrs++
|
||||
}
|
||||
|
||||
// Totals returns the aggregate call count and the count of non-"ok" results, for the
|
||||
// pass/fail summary.
|
||||
func (r *Recorder) Totals() (calls, nonOK int64) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
for _, s := range r.ops {
|
||||
calls += s.count
|
||||
for code, n := range s.codes {
|
||||
if code != "ok" {
|
||||
nonOK += n
|
||||
}
|
||||
}
|
||||
}
|
||||
return calls, nonOK
|
||||
}
|
||||
|
||||
// Summary renders the human-readable run report: a per-operation table (count,
|
||||
// throughput, p50/p90/p99/max latency, code breakdown), the live-event tally and the
|
||||
// aggregate error rate.
|
||||
func (r *Recorder) Summary() string {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
elapsed := time.Since(r.start).Seconds()
|
||||
if elapsed <= 0 {
|
||||
elapsed = 1
|
||||
}
|
||||
names := make([]string, 0, len(r.ops))
|
||||
for op := range r.ops {
|
||||
names = append(names, op)
|
||||
}
|
||||
sort.Strings(names)
|
||||
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "elapsed: %.0fs\n\n", elapsed)
|
||||
fmt.Fprintf(&b, "%-20s %8s %8s %7s %7s %7s %7s %s\n",
|
||||
"operation", "count", "req/s", "p50ms", "p90ms", "p99ms", "maxms", "codes")
|
||||
var totalCalls, totalNonOK int64
|
||||
for _, op := range names {
|
||||
s := r.ops[op]
|
||||
totalCalls += s.count
|
||||
var nonOK int64
|
||||
for code, n := range s.codes {
|
||||
if code != "ok" {
|
||||
nonOK += n
|
||||
}
|
||||
}
|
||||
totalNonOK += nonOK
|
||||
fmt.Fprintf(&b, "%-20s %8d %8.1f %7s %7s %7s %7.0f %s\n",
|
||||
op, s.count, float64(s.count)/elapsed,
|
||||
s.quantile(0.50), s.quantile(0.90), s.quantile(0.99),
|
||||
float64(s.maxNs)/float64(time.Millisecond), codeBreakdown(s.codes))
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "\ntotal calls: %d, throughput: %.1f req/s\n", totalCalls, float64(totalCalls)/elapsed)
|
||||
rate := 0.0
|
||||
if totalCalls > 0 {
|
||||
rate = 100 * float64(totalNonOK) / float64(totalCalls)
|
||||
}
|
||||
fmt.Fprintf(&b, "non-ok results: %d (%.2f%%)\n", totalNonOK, rate)
|
||||
|
||||
if len(r.events) > 0 {
|
||||
fmt.Fprintf(&b, "\nlive events:\n")
|
||||
ekeys := make([]string, 0, len(r.events))
|
||||
for k := range r.events {
|
||||
ekeys = append(ekeys, k)
|
||||
}
|
||||
sort.Strings(ekeys)
|
||||
for _, k := range ekeys {
|
||||
fmt.Fprintf(&b, " %-16s %d\n", k, r.events[k])
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "stream errors: %d\n", r.streamErrs)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// codeBreakdown formats a code->count table as space-separated "code:count"
// pairs, e.g. "ok:1234 not_your_turn:5". Pairs are ordered by descending count;
// codes with equal counts appear in alphabetical order. An empty table yields "".
func codeBreakdown(codes map[string]int64) string {
	order := make([]string, 0, len(codes))
	for code := range codes {
		order = append(order, code)
	}

	// Alphabetical first, then a stable sort on count: ties keep their
	// alphabetical position.
	sort.Strings(order)
	sort.SliceStable(order, func(a, b int) bool {
		return codes[order[a]] > codes[order[b]]
	})

	var out strings.Builder
	for idx, code := range order {
		if idx > 0 {
			out.WriteByte(' ')
		}
		fmt.Fprintf(&out, "%s:%d", code, codes[code])
	}
	return out.String()
}
|
||||
Reference in New Issue
Block a user