scrabble-game/loadtest/internal/report/report.go

// Package report collects per-operation latency, result-code and live-event counts
// across all virtual players and renders a text summary for the R2 trip report. It
// is safe for concurrent use. Latencies go into fixed buckets (a Prometheus-style
// histogram) so percentiles cost no per-sample memory at load-test scale.
package report

import (
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"
)

// boundsMs lists the latency histogram's bucket upper bounds in milliseconds, in
// ascending order. Latencies above the final bound are counted in one extra
// overflow bucket that has no entry here.
var boundsMs = []float64{
	1, 2, 5,
	10, 20, 50,
	100, 200, 500,
	1000, 2000, 5000,
}

// opStat holds the running statistics for a single operation name.
type opStat struct {
	// count is the number of recorded calls; sumNs and maxNs are the total and
	// the largest recorded latency, both in nanoseconds.
	count, sumNs, maxNs int64
	// codes tallies recorded calls per result code.
	codes map[string]int64
	// buckets is the latency histogram: one counter per boundsMs entry followed
	// by a final overflow counter, so its length is len(boundsMs)+1.
	buckets []int64
}

// newOpStat returns an empty opStat with its code map allocated and a zeroed
// bucket slice holding one counter per latency bound plus the overflow counter.
func newOpStat() *opStat {
	return &opStat{
		codes:   make(map[string]int64),
		buckets: make([]int64, len(boundsMs)+1),
	}
}

// record folds one call into the stat: it bumps the call count, the latency sum
// and maximum, the tally for code, and the histogram bucket that d falls into.
func (s *opStat) record(code string, d time.Duration) {
	ns := int64(d)
	s.count++
	s.sumNs += ns
	if s.maxNs < ns {
		s.maxNs = ns
	}
	s.codes[code]++

	// SearchFloat64s yields the first bound >= the latency, so each bound is an
	// inclusive upper limit; a latency above every bound yields len(boundsMs),
	// which is the overflow slot.
	latencyMs := float64(d) / float64(time.Millisecond)
	s.buckets[sort.SearchFloat64s(boundsMs, latencyMs)]++
}

// quantile estimates the q-quantile (0<q<1) of the recorded latencies and returns
// it as text: the upper bound, in milliseconds, of the bucket that the target
// sample falls in. A result in the overflow bucket renders as ">" followed by the
// largest bound in boundsMs, and a stat with no samples renders as "-".
//
// The overflow label is derived from boundsMs rather than spelled out, so the
// report stays truthful if the bucket table is edited.
func (s *opStat) quantile(q float64) string {
	if s.count == 0 {
		return "-"
	}
	// Rank of the target sample, rounded to nearest and clamped to at least 1.
	target := int64(q*float64(s.count) + 0.5)
	if target < 1 {
		target = 1
	}

	// Walk the cumulative histogram until it reaches the target rank. If it
	// never does, fall through to the overflow label.
	idx := len(boundsMs)
	var cum int64
	for i, n := range s.buckets {
		cum += n
		if cum >= target {
			idx = i
			break
		}
	}
	if idx < len(boundsMs) {
		return fmt.Sprintf("%g", boundsMs[idx])
	}
	return fmt.Sprintf(">%g", boundsMs[len(boundsMs)-1])
}

// Recorder accumulates the run's measurements. Every method in this file locks mu
// before touching the other fields.
type Recorder struct {
	// mu guards all fields below.
	mu         sync.Mutex
	// ops holds the per-operation statistics, keyed by operation name.
	ops        map[string]*opStat
	// events counts received live events per kind.
	events     map[string]int64
	// streamErrs counts StreamErr calls.
	streamErrs int64
	// start is set by New; Summary measures elapsed time from it.
	start      time.Time
}

// New builds a Recorder with empty operation and event maps and stamps the
// current time as the start of the run.
func New() *Recorder {
	return &Recorder{
		ops:    make(map[string]*opStat),
		events: make(map[string]int64),
		start:  time.Now(),
	}
}

// Record adds one call of operation op, with its result code and latency d, to
// that operation's statistics, creating the entry the first time op is seen.
func (r *Recorder) Record(op, code string, d time.Duration) {
	r.mu.Lock()
	defer r.mu.Unlock()

	// Ensure the operation has a stat before recording into it.
	if r.ops[op] == nil {
		r.ops[op] = newOpStat()
	}
	r.ops[op].record(code, d)
}

// Event logs one received live event of the given kind. It holds the Recorder's
// mutex for the duration of the update. The events map must already be allocated
// (New does this); incrementing an entry of a nil map panics.
func (r *Recorder) Event(kind string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	// A kind not seen before reads as zero, so its first event leaves it at 1.
	r.events[kind]++
}

// StreamErr logs one Subscribe stream error (a drop the player reconnects from).
// It holds the Recorder's mutex while incrementing the counter that Summary
// prints as "stream errors".
func (r *Recorder) StreamErr() {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.streamErrs++
}

// Totals reports, across every operation, how many calls were recorded and how
// many of them finished with a code other than "ok". It feeds the pass/fail
// summary.
func (r *Recorder) Totals() (calls, nonOK int64) {
	r.mu.Lock()
	defer r.mu.Unlock()

	for _, stat := range r.ops {
		calls += stat.count
		for code, n := range stat.codes {
			if code == "ok" {
				continue
			}
			nonOK += n
		}
	}
	return calls, nonOK
}

// Summary builds the text report for the run. It prints the elapsed time, then one
// row per operation (sorted by name) with call count, requests per second, the
// p50/p90/p99 bucket bounds, the maximum latency in milliseconds and the per-code
// tally. After the table come the overall call count and throughput, the number
// and percentage of non-"ok" results, the live-event counts (only if any event
// was seen) and the stream error count.
func (r *Recorder) Summary() string {
	r.mu.Lock()
	defer r.mu.Unlock()

	// A non-positive elapsed time is replaced by 1s so the req/s divisions
	// below stay well defined.
	secs := time.Since(r.start).Seconds()
	if secs <= 0 {
		secs = 1
	}

	// Map iteration order is unspecified; sort the names so rows are stable.
	opNames := make([]string, 0, len(r.ops))
	for name := range r.ops {
		opNames = append(opNames, name)
	}
	sort.Strings(opNames)

	var out strings.Builder
	fmt.Fprintf(&out, "elapsed: %.0fs\n\n", secs)
	fmt.Fprintf(&out, "%-20s %8s %8s %7s %7s %7s %7s  %s\n",
		"operation", "count", "req/s", "p50ms", "p90ms", "p99ms", "maxms", "codes")

	var calls, failed int64
	for _, name := range opNames {
		stat := r.ops[name]
		calls += stat.count
		for code, n := range stat.codes {
			if code == "ok" {
				continue
			}
			failed += n
		}

		perSec := float64(stat.count) / secs
		maxMs := float64(stat.maxNs) / float64(time.Millisecond)
		fmt.Fprintf(&out, "%-20s %8d %8.1f %7s %7s %7s %7.0f  %s\n",
			name, stat.count, perSec,
			stat.quantile(0.50), stat.quantile(0.90), stat.quantile(0.99),
			maxMs, codeBreakdown(stat.codes))
	}

	fmt.Fprintf(&out, "\ntotal calls: %d, throughput: %.1f req/s\n", calls, float64(calls)/secs)
	var failedPct float64
	if calls > 0 {
		failedPct = 100 * float64(failed) / float64(calls)
	}
	fmt.Fprintf(&out, "non-ok results: %d (%.2f%%)\n", failed, failedPct)

	if len(r.events) > 0 {
		out.WriteString("\nlive events:\n")
		kinds := make([]string, 0, len(r.events))
		for kind := range r.events {
			kinds = append(kinds, kind)
		}
		sort.Strings(kinds)
		for _, kind := range kinds {
			fmt.Fprintf(&out, "  %-16s %d\n", kind, r.events[kind])
		}
	}
	fmt.Fprintf(&out, "stream errors: %d\n", r.streamErrs)
	return out.String()
}

// codeBreakdown formats a code tally as space-separated "code:count" pairs, e.g.
// "ok:1234 not_your_turn:5". Pairs are ordered by descending count, with ties
// broken by code name in ascending order. An empty or nil map yields "".
func codeBreakdown(codes map[string]int64) string {
	names := make([]string, 0, len(codes))
	for code := range codes {
		names = append(names, code)
	}
	sort.Slice(names, func(a, b int) bool {
		na, nb := codes[names[a]], codes[names[b]]
		if na == nb {
			return names[a] < names[b]
		}
		return na > nb
	})

	var out strings.Builder
	for i, code := range names {
		if i > 0 {
			out.WriteByte(' ')
		}
		fmt.Fprintf(&out, "%s:%d", code, codes[code])
	}
	return out.String()
}