feat: gamemaster
This commit is contained in:
@@ -0,0 +1,556 @@
|
||||
// Package healtheventsconsumer implements the worker that consumes
|
||||
// `runtime:health_events` from Runtime Manager and propagates engine
|
||||
// health observations into Game Master state.
|
||||
//
|
||||
// On every consumed entry the worker:
|
||||
//
|
||||
// 1. Updates `runtime_records.engine_health` per game with a short
|
||||
// summary string (`healthy`, `probe_failed`, `inspect_unhealthy`,
|
||||
// `exited`, `oom`, `disappeared`).
|
||||
// 2. For terminal container events (`container_exited`,
|
||||
// `container_oom`, `container_disappeared`) attempts a
|
||||
// compare-and-swap `running → engine_unreachable`. For
|
||||
// `probe_recovered` attempts the symmetric recovery CAS
|
||||
// `engine_unreachable → running`. Both transitions are pre-declared
|
||||
// in `domain/runtime/transitions.go`. CAS conflicts (record not in
|
||||
// the expected source status) fall back to a health-only update so
|
||||
// the summary stays current even when another flow (turn
|
||||
// generation, admin op) holds the status.
|
||||
// 3. Publishes a `runtime_snapshot_update` on `gm:lobby_events` only
|
||||
// when the status transitioned or when the engine-health summary
|
||||
// differs from the previously emitted one for the same game. The
|
||||
// last-emitted summary is tracked in process memory; on restart
|
||||
// the cache is empty and the first event per game produces one
|
||||
// snapshot.
|
||||
//
|
||||
// The XREAD loop, offset handling, and shutdown semantics mirror the
|
||||
// Lobby `gmevents` consumer at `lobby/internal/worker/gmevents`.
|
||||
package healtheventsconsumer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"galaxy/gamemaster/internal/domain/runtime"
|
||||
"galaxy/gamemaster/internal/logging"
|
||||
"galaxy/gamemaster/internal/ports"
|
||||
"galaxy/gamemaster/internal/telemetry"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// Wire field names on the `runtime:health_events` Redis Stream entry,
// fixed by `rtmanager/api/runtime-health-asyncapi.yaml`. Renaming any
// of them breaks the contract.
const (
	fieldGameID       = "game_id"
	fieldEventType    = "event_type"
	fieldOccurredAtMS = "occurred_at_ms"
)

// RTM event-type values per
// `rtmanager/internal/domain/health/snapshot.go`. Stage 18 maps all
// seven (the PLAN enumerates six; container_started and
// probe_recovered are added here). The mapping from event type to
// summary and optional status transition lives in planFor.
const (
	eventTypeContainerStarted     = "container_started"
	eventTypeProbeRecovered       = "probe_recovered"
	eventTypeProbeFailed          = "probe_failed"
	eventTypeInspectUnhealthy     = "inspect_unhealthy"
	eventTypeContainerExited      = "container_exited"
	eventTypeContainerOOM         = "container_oom"
	eventTypeContainerDisappeared = "container_disappeared"
)

// engine_health summary strings written to `runtime_records.engine_health`.
// Which summary applies for which event type is decided by planFor.
const (
	summaryHealthy          = "healthy"
	summaryProbeFailed      = "probe_failed"
	summaryInspectUnhealthy = "inspect_unhealthy"
	summaryExited           = "exited"
	summaryOOM              = "oom"
	summaryDisappeared      = "disappeared"
)

// snapshotEventType is the discriminator written by
// `LobbyEventsPublisher.PublishSnapshotUpdate` and recorded on the
// `gamemaster.lobby_events.published` counter.
const snapshotEventType = "runtime_snapshot_update"
|
||||
|
||||
// Dependencies groups the collaborators required by Worker. All fields
// without a documented default are required; NewWorker rejects missing
// ones with a descriptive error.
type Dependencies struct {
	// Client provides XREAD access to the runtime:health_events stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker
	// (typically `runtime:health_events`).
	Stream string

	// StreamLabel identifies the consumer in the stream-offset store.
	// Defaults to `health_events` when empty.
	StreamLabel string

	// BlockTimeout bounds the blocking XREAD window. Required positive.
	BlockTimeout time.Duration

	// OffsetStore persists the last successfully processed entry id.
	OffsetStore ports.StreamOffsetStore

	// RuntimeRecords is mutated on every observation.
	RuntimeRecords ports.RuntimeRecordStore

	// LobbyEvents publishes the debounced `runtime_snapshot_update`
	// messages that propagate health summary changes to Game Lobby.
	LobbyEvents ports.LobbyEventsPublisher

	// Telemetry receives one consumed-event count per processed entry
	// and one published-event count per emitted snapshot. Required.
	Telemetry *telemetry.Runtime

	// Clock supplies the wall-clock used for store updates and for
	// `RuntimeSnapshotUpdate.OccurredAt`. Defaults to `time.Now` when
	// nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}
|
||||
|
||||
// defaultStreamLabel is used when Dependencies.StreamLabel is empty.
const defaultStreamLabel = "health_events"

// Worker drives the runtime:health_events processing loop.
type Worker struct {
	client         *redis.Client
	stream         string
	streamLabel    string
	blockTimeout   time.Duration
	offsetStore    ports.StreamOffsetStore
	runtimeRecords ports.RuntimeRecordStore
	lobbyEvents    ports.LobbyEventsPublisher
	telemetry      *telemetry.Runtime
	clock          func() time.Time
	logger         *slog.Logger

	// mu guards lastEmittedSummary, the per-game engine-health summary
	// most recently published to gm:lobby_events. The cache is
	// process-local: it starts empty on every restart.
	mu                 sync.RWMutex
	lastEmittedSummary map[string]string
}
|
||||
|
||||
// NewWorker constructs one Worker from deps.
|
||||
func NewWorker(deps Dependencies) (*Worker, error) {
|
||||
switch {
|
||||
case deps.Client == nil:
|
||||
return nil, errors.New("new health events consumer: nil redis client")
|
||||
case strings.TrimSpace(deps.Stream) == "":
|
||||
return nil, errors.New("new health events consumer: stream must not be empty")
|
||||
case deps.BlockTimeout <= 0:
|
||||
return nil, errors.New("new health events consumer: block timeout must be positive")
|
||||
case deps.OffsetStore == nil:
|
||||
return nil, errors.New("new health events consumer: nil offset store")
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new health events consumer: nil runtime records store")
|
||||
case deps.LobbyEvents == nil:
|
||||
return nil, errors.New("new health events consumer: nil lobby events publisher")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new health events consumer: nil telemetry runtime")
|
||||
}
|
||||
|
||||
streamLabel := strings.TrimSpace(deps.StreamLabel)
|
||||
if streamLabel == "" {
|
||||
streamLabel = defaultStreamLabel
|
||||
}
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
return &Worker{
|
||||
client: deps.Client,
|
||||
stream: deps.Stream,
|
||||
streamLabel: streamLabel,
|
||||
blockTimeout: deps.BlockTimeout,
|
||||
offsetStore: deps.OffsetStore,
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
lobbyEvents: deps.LobbyEvents,
|
||||
telemetry: deps.Telemetry,
|
||||
clock: clock,
|
||||
logger: logger.With("worker", "gamemaster.healtheventsconsumer", "stream", deps.Stream),
|
||||
lastEmittedSummary: make(map[string]string),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Run drives the XREAD loop until ctx is cancelled. The offset advances
|
||||
// only after a successful HandleMessage call. The loop exits on context
|
||||
// cancellation or a fatal Redis error.
|
||||
func (worker *Worker) Run(ctx context.Context) error {
|
||||
if worker == nil {
|
||||
return errors.New("run health events consumer: nil worker")
|
||||
}
|
||||
if ctx == nil {
|
||||
return errors.New("run health events consumer: nil context")
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
lastID, found, err := worker.offsetStore.Load(ctx, worker.streamLabel)
|
||||
if err != nil {
|
||||
return fmt.Errorf("run health events consumer: load offset: %w", err)
|
||||
}
|
||||
if !found {
|
||||
lastID = "0-0"
|
||||
}
|
||||
|
||||
worker.logger.Info("health events consumer started",
|
||||
"block_timeout", worker.blockTimeout.String(),
|
||||
"start_entry_id", lastID,
|
||||
)
|
||||
defer worker.logger.Info("health events consumer stopped")
|
||||
|
||||
for {
|
||||
streams, err := worker.client.XRead(ctx, &redis.XReadArgs{
|
||||
Streams: []string{worker.stream, lastID},
|
||||
Count: 1,
|
||||
Block: worker.blockTimeout,
|
||||
}).Result()
|
||||
switch {
|
||||
case err == nil:
|
||||
for _, stream := range streams {
|
||||
for _, message := range stream.Messages {
|
||||
if !worker.HandleMessage(ctx, message) {
|
||||
continue
|
||||
}
|
||||
if err := worker.offsetStore.Save(ctx, worker.streamLabel, message.ID); err != nil {
|
||||
return fmt.Errorf("run health events consumer: save offset: %w", err)
|
||||
}
|
||||
lastID = message.ID
|
||||
}
|
||||
}
|
||||
case errors.Is(err, redis.Nil):
|
||||
continue
|
||||
case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
|
||||
return ctx.Err()
|
||||
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed):
|
||||
return fmt.Errorf("run health events consumer: %w", err)
|
||||
default:
|
||||
return fmt.Errorf("run health events consumer: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown is a no-op; the worker relies on context cancellation.
|
||||
func (worker *Worker) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown health events consumer: nil context")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// HandleMessage processes one Redis Stream entry and reports whether
// the offset is allowed to advance. Decode errors and orphan game ids
// return true so the offset advances past the entry; only fatal store
// or publisher failures return false (currently never — every error is
// logged and absorbed, the offset always advances after the entry has
// been observed).
//
// Exported so tests can drive the worker deterministically without
// spinning up a real XREAD loop.
func (worker *Worker) HandleMessage(ctx context.Context, message redis.XMessage) bool {
	if worker == nil {
		return false
	}

	event, err := decodeEvent(message)
	if err != nil {
		// Malformed entry: log, count it as consumed, and skip past it.
		worker.logger.WarnContext(ctx, "decode runtime health event",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	plan, ok := planFor(event.EventType)
	if !ok {
		// Unknown event types are skipped rather than blocking the stream.
		worker.logger.WarnContext(ctx, "unknown runtime health event type",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"event_type", event.EventType,
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	now := worker.clock().UTC()

	current, err := worker.runtimeRecords.Get(ctx, event.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			// Orphan event: no runtime record exists for this game; skip.
			worker.logger.WarnContext(ctx, "runtime health event for unknown game",
				"stream_entry_id", message.ID,
				"game_id", event.GameID,
				"event_type", event.EventType,
			)
			worker.telemetry.RecordHealthEventConsumed(ctx)
			return true
		}
		// Transient store failure: absorbed; the offset still advances.
		worker.logger.WarnContext(ctx, "load runtime record for health event",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	statusChanged := worker.applyMutation(ctx, message.ID, current, plan, now)

	// Debounce: publish only when the status flipped in this iteration
	// or the summary differs from the one last emitted for this game.
	if !worker.shouldPublish(event.GameID, plan.summary, statusChanged) {
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	// Re-read so the snapshot reflects the post-mutation record state.
	refreshed, err := worker.runtimeRecords.Get(ctx, event.GameID)
	if err != nil {
		worker.logger.WarnContext(ctx, "reload runtime record for snapshot",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	snapshot := ports.RuntimeSnapshotUpdate{
		GameID:              refreshed.GameID,
		CurrentTurn:         refreshed.CurrentTurn,
		RuntimeStatus:       refreshed.Status,
		EngineHealthSummary: refreshed.EngineHealth,
		PlayerTurnStats:     nil, // health events carry no per-player stats
		OccurredAt:          now,
	}
	if err := worker.lobbyEvents.PublishSnapshotUpdate(ctx, snapshot); err != nil {
		logArgs := []any{
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		}
		logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
		worker.logger.WarnContext(ctx, "publish runtime snapshot update", logArgs...)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		// Publish failed: the summary cache is NOT updated, so the next
		// event for this game retries the snapshot.
		return true
	}
	worker.telemetry.RecordLobbyEventPublished(ctx, snapshotEventType)
	worker.rememberSummary(event.GameID, plan.summary)
	worker.telemetry.RecordHealthEventConsumed(ctx)
	return true
}
|
||||
|
||||
// applyMutation applies the plan to the runtime record. When plan.transition
// is set, the worker first attempts a CAS UpdateStatus from the expected
// source status; on conflict or invalid-transition it falls back to a
// health-only UpdateEngineHealth. When plan.transition is nil only
// UpdateEngineHealth runs. Returns true when the status was actually
// transitioned.
func (worker *Worker) applyMutation(
	ctx context.Context,
	entryID string,
	current runtime.RuntimeRecord,
	plan eventPlan,
	now time.Time,
) bool {
	if plan.transition != nil {
		// Copy the summary so the pointer handed to the store does not
		// alias a shared value.
		summary := plan.summary
		err := worker.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
			GameID:              current.GameID,
			ExpectedFrom:        plan.transition.from,
			To:                  plan.transition.to,
			Now:                 now,
			EngineHealthSummary: &summary,
		})
		switch {
		case err == nil:
			worker.logger.InfoContext(ctx, "runtime status transitioned by health event",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"from_status", string(plan.transition.from),
				"to_status", string(plan.transition.to),
				"engine_health", plan.summary,
			)
			return true
		case errors.Is(err, runtime.ErrConflict), errors.Is(err, runtime.ErrInvalidTransition):
			// CAS lost: the record is not in the expected source status.
			// Fall through to the health-only update below so the
			// summary stays current.
			worker.logger.DebugContext(ctx, "runtime status CAS conflict, falling back to health-only update",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"current_status", string(current.Status),
				"expected_from", string(plan.transition.from),
				"engine_health", plan.summary,
			)
		default:
			// Unexpected store failure: report no transition and skip
			// the fallback write entirely.
			worker.logger.WarnContext(ctx, "update runtime status from health event",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"err", err.Error(),
			)
			return false
		}
	}

	// Health-only path: runs for nil transitions and after CAS
	// conflicts. ErrNotFound is deliberately swallowed; only other
	// errors are logged.
	if err := worker.runtimeRecords.UpdateEngineHealth(ctx, ports.UpdateEngineHealthInput{
		GameID:              current.GameID,
		EngineHealthSummary: plan.summary,
		Now:                 now,
	}); err != nil && !errors.Is(err, runtime.ErrNotFound) {
		worker.logger.WarnContext(ctx, "update runtime engine health",
			"stream_entry_id", entryID,
			"game_id", current.GameID,
			"err", err.Error(),
		)
	}
	return false
}
|
||||
|
||||
// shouldPublish returns whether a snapshot must be emitted: either the
|
||||
// status changed in this iteration, or the engine_health summary
|
||||
// differs from the last summary published for this game.
|
||||
func (worker *Worker) shouldPublish(gameID, summary string, statusChanged bool) bool {
|
||||
if statusChanged {
|
||||
return true
|
||||
}
|
||||
worker.mu.RLock()
|
||||
last, ok := worker.lastEmittedSummary[gameID]
|
||||
worker.mu.RUnlock()
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
return last != summary
|
||||
}
|
||||
|
||||
// rememberSummary stores the latest published summary for gameID.
|
||||
func (worker *Worker) rememberSummary(gameID, summary string) {
|
||||
worker.mu.Lock()
|
||||
worker.lastEmittedSummary[gameID] = summary
|
||||
worker.mu.Unlock()
|
||||
}
|
||||
|
||||
// healthEvent stores the decoded XADD entry shared across handlers.
type healthEvent struct {
	GameID     string    // from field game_id; validated non-blank by decodeEvent
	EventType  string    // from field event_type; matched against the eventType* constants
	OccurredAt time.Time // UTC instant parsed from field occurred_at_ms
}
|
||||
|
||||
// decodeEvent parses a Redis Stream message into a healthEvent. Missing
|
||||
// or malformed required fields produce an error.
|
||||
func decodeEvent(message redis.XMessage) (healthEvent, error) {
|
||||
gameID := optionalString(message.Values, fieldGameID)
|
||||
if strings.TrimSpace(gameID) == "" {
|
||||
return healthEvent{}, errors.New("missing game_id")
|
||||
}
|
||||
eventType := optionalString(message.Values, fieldEventType)
|
||||
if strings.TrimSpace(eventType) == "" {
|
||||
return healthEvent{}, errors.New("missing event_type")
|
||||
}
|
||||
occurredAtMSRaw := optionalString(message.Values, fieldOccurredAtMS)
|
||||
if strings.TrimSpace(occurredAtMSRaw) == "" {
|
||||
return healthEvent{}, errors.New("missing occurred_at_ms")
|
||||
}
|
||||
occurredAtMS, err := strconv.ParseInt(occurredAtMSRaw, 10, 64)
|
||||
if err != nil {
|
||||
return healthEvent{}, fmt.Errorf("invalid occurred_at_ms: %w", err)
|
||||
}
|
||||
if occurredAtMS <= 0 {
|
||||
return healthEvent{}, errors.New("invalid occurred_at_ms: must be positive")
|
||||
}
|
||||
return healthEvent{
|
||||
GameID: gameID,
|
||||
EventType: eventType,
|
||||
OccurredAt: time.UnixMilli(occurredAtMS).UTC(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// transitionPlan encodes one allowed CAS pair. nil-transition events
// only update the summary.
type transitionPlan struct {
	from runtime.Status // status the record must currently hold (ExpectedFrom)
	to   runtime.Status // status to install on CAS success
}

// eventPlan is the decoded reaction to one event_type.
type eventPlan struct {
	summary    string          // engine_health value to write
	transition *transitionPlan // optional status CAS; nil means health-only update
}
|
||||
|
||||
// planFor returns the eventPlan registered for eventType. The boolean
|
||||
// reports whether the type is recognised.
|
||||
func planFor(eventType string) (eventPlan, bool) {
|
||||
switch eventType {
|
||||
case eventTypeContainerStarted:
|
||||
return eventPlan{summary: summaryHealthy}, true
|
||||
case eventTypeProbeRecovered:
|
||||
return eventPlan{
|
||||
summary: summaryHealthy,
|
||||
transition: &transitionPlan{
|
||||
from: runtime.StatusEngineUnreachable,
|
||||
to: runtime.StatusRunning,
|
||||
},
|
||||
}, true
|
||||
case eventTypeProbeFailed:
|
||||
return eventPlan{summary: summaryProbeFailed}, true
|
||||
case eventTypeInspectUnhealthy:
|
||||
return eventPlan{summary: summaryInspectUnhealthy}, true
|
||||
case eventTypeContainerExited:
|
||||
return eventPlan{
|
||||
summary: summaryExited,
|
||||
transition: &transitionPlan{
|
||||
from: runtime.StatusRunning,
|
||||
to: runtime.StatusEngineUnreachable,
|
||||
},
|
||||
}, true
|
||||
case eventTypeContainerOOM:
|
||||
return eventPlan{
|
||||
summary: summaryOOM,
|
||||
transition: &transitionPlan{
|
||||
from: runtime.StatusRunning,
|
||||
to: runtime.StatusEngineUnreachable,
|
||||
},
|
||||
}, true
|
||||
case eventTypeContainerDisappeared:
|
||||
return eventPlan{
|
||||
summary: summaryDisappeared,
|
||||
transition: &transitionPlan{
|
||||
from: runtime.StatusRunning,
|
||||
to: runtime.StatusEngineUnreachable,
|
||||
},
|
||||
}, true
|
||||
default:
|
||||
return eventPlan{}, false
|
||||
}
|
||||
}
|
||||
|
||||
// optionalString extracts values[key] as a string. Both string and
// []byte payloads are accepted; a missing key or any other dynamic type
// yields the empty string.
func optionalString(values map[string]any, key string) string {
	value, present := values[key]
	if !present {
		return ""
	}
	if text, ok := value.(string); ok {
		return text
	}
	if data, ok := value.([]byte); ok {
		return string(data)
	}
	return ""
}
|
||||
@@ -0,0 +1,636 @@
|
||||
package healtheventsconsumer_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strconv"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/gamemaster/internal/adapters/mocks"
|
||||
"galaxy/gamemaster/internal/domain/runtime"
|
||||
"galaxy/gamemaster/internal/ports"
|
||||
"galaxy/gamemaster/internal/telemetry"
|
||||
"galaxy/gamemaster/internal/worker/healtheventsconsumer"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
const (
	// testStream mirrors the production stream key the worker consumes.
	testStream = "runtime:health_events"
	// testLabel is the offset-store label used across the tests.
	testLabel = "health_events"
)

// newTestTelemetry constructs a telemetry.Runtime from nil providers so
// the worker's counter calls succeed without exporting anything.
func newTestTelemetry(t *testing.T) *telemetry.Runtime {
	t.Helper()
	tm, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)
	return tm
}
|
||||
|
||||
// runningRecord builds a runtime_records row in `running` with a known
// engine_health value. The seed simplifies expectations on Get reads.
// All timestamps are fixed so assertions stay deterministic.
func runningRecord(gameID, health string) runtime.RuntimeRecord {
	created := time.Date(2026, time.May, 1, 12, 0, 0, 0, time.UTC)
	startedAt := created.Add(time.Second)
	nextGen := created.Add(time.Hour)
	return runtime.RuntimeRecord{
		GameID:               gameID,
		Status:               runtime.StatusRunning,
		EngineEndpoint:       "http://galaxy-game-" + gameID + ":8080",
		CurrentImageRef:      "ghcr.io/galaxy/game:v1.2.3",
		CurrentEngineVersion: "v1.2.3",
		TurnSchedule:         "0 18 * * *",
		CurrentTurn:          5,
		NextGenerationAt:     &nextGen,
		EngineHealth:         health,
		CreatedAt:            created,
		UpdatedAt:            startedAt,
		StartedAt:            &startedAt,
	}
}
|
||||
|
||||
func unreachableRecord(gameID, health string) runtime.RuntimeRecord {
|
||||
rec := runningRecord(gameID, health)
|
||||
rec.Status = runtime.StatusEngineUnreachable
|
||||
return rec
|
||||
}
|
||||
|
||||
// withSummary returns a copy of rec with EngineHealth replaced.
|
||||
func withSummary(rec runtime.RuntimeRecord, summary string) runtime.RuntimeRecord {
|
||||
rec.EngineHealth = summary
|
||||
return rec
|
||||
}
|
||||
|
||||
// withStatus returns a copy of rec with Status replaced.
|
||||
func withStatus(rec runtime.RuntimeRecord, status runtime.Status) runtime.RuntimeRecord {
|
||||
rec.Status = status
|
||||
return rec
|
||||
}
|
||||
|
||||
// xMessage builds a redis.XMessage with the wire field layout used by
// RTM's healtheventspublisher. The extra `details` field is not read by
// the decoder and exercises tolerance of unknown fields.
func xMessage(id, gameID, eventType string, occurredAt time.Time) redis.XMessage {
	return redis.XMessage{
		ID: id,
		Values: map[string]any{
			"game_id":        gameID,
			"event_type":     eventType,
			"occurred_at_ms": strconv.FormatInt(occurredAt.UnixMilli(), 10),
			"details":        "{}",
		},
	}
}
|
||||
|
||||
// harness bundles a fully wired Worker with the gomock doubles it was
// constructed from, so tests can set expectations on the mocks and then
// drive HandleMessage directly. The gomock.Controller is owned by the
// calling test.
type harness struct {
	worker      *healtheventsconsumer.Worker
	store       *mocks.MockRuntimeRecordStore
	publisher   *mocks.MockLobbyEventsPublisher
	offsetStore *mocks.MockStreamOffsetStore
	now         time.Time // fixed instant returned by the injected Clock
}
|
||||
|
||||
// newHarness constructs a Worker wired to fresh gomock doubles and a
// fixed clock, returning both so each test can set expectations.
func newHarness(t *testing.T, ctrl *gomock.Controller) *harness {
	t.Helper()
	now := time.Date(2026, time.May, 1, 13, 0, 0, 0, time.UTC)
	store := mocks.NewMockRuntimeRecordStore(ctrl)
	publisher := mocks.NewMockLobbyEventsPublisher(ctrl)
	offsetStore := mocks.NewMockStreamOffsetStore(ctrl)
	telem := newTestTelemetry(t)
	worker, err := healtheventsconsumer.NewWorker(healtheventsconsumer.Dependencies{
		// The client only has to pass validation; HandleMessage never
		// dials it, so an unreachable address is fine.
		Client:         redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}),
		Stream:         testStream,
		StreamLabel:    testLabel,
		BlockTimeout:   100 * time.Millisecond,
		OffsetStore:    offsetStore,
		RuntimeRecords: store,
		LobbyEvents:    publisher,
		Telemetry:      telem,
		Clock:          func() time.Time { return now },
	})
	require.NoError(t, err)
	return &harness{
		worker:      worker,
		store:       store,
		publisher:   publisher,
		offsetStore: offsetStore,
		now:         now,
	}
}
|
||||
|
||||
// TestNewWorkerValidates exercises every required-dep branch: each case
// starts from a fully valid dependency set, breaks exactly one field,
// and expects construction to fail.
func TestNewWorkerValidates(t *testing.T) {
	telem := newTestTelemetry(t)
	client := redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"})
	cases := []struct {
		name string
		mut  func(*healtheventsconsumer.Dependencies)
	}{
		{"client", func(d *healtheventsconsumer.Dependencies) { d.Client = nil }},
		{"stream", func(d *healtheventsconsumer.Dependencies) { d.Stream = " " }},
		{"block timeout", func(d *healtheventsconsumer.Dependencies) { d.BlockTimeout = 0 }},
		{"offset store", func(d *healtheventsconsumer.Dependencies) { d.OffsetStore = nil }},
		{"runtime records", func(d *healtheventsconsumer.Dependencies) { d.RuntimeRecords = nil }},
		{"lobby events", func(d *healtheventsconsumer.Dependencies) { d.LobbyEvents = nil }},
		{"telemetry", func(d *healtheventsconsumer.Dependencies) { d.Telemetry = nil }},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			ctrl := gomock.NewController(t)
			deps := healtheventsconsumer.Dependencies{
				Client:         client,
				Stream:         testStream,
				StreamLabel:    testLabel,
				BlockTimeout:   time.Second,
				OffsetStore:    mocks.NewMockStreamOffsetStore(ctrl),
				RuntimeRecords: mocks.NewMockRuntimeRecordStore(ctrl),
				LobbyEvents:    mocks.NewMockLobbyEventsPublisher(ctrl),
				Telemetry:      telem,
			}
			tc.mut(&deps)
			worker, err := healtheventsconsumer.NewWorker(deps)
			require.Error(t, err)
			require.Nil(t, worker)
		})
	}
}
|
||||
|
||||
// TestNewWorkerDefaultsLabel asserts an empty StreamLabel is accepted —
// NewWorker substitutes its default label instead of failing.
func TestNewWorkerDefaultsLabel(t *testing.T) {
	ctrl := gomock.NewController(t)
	telem := newTestTelemetry(t)
	worker, err := healtheventsconsumer.NewWorker(healtheventsconsumer.Dependencies{
		Client:         redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}),
		Stream:         testStream,
		StreamLabel:    "",
		BlockTimeout:   time.Second,
		OffsetStore:    mocks.NewMockStreamOffsetStore(ctrl),
		RuntimeRecords: mocks.NewMockRuntimeRecordStore(ctrl),
		LobbyEvents:    mocks.NewMockLobbyEventsPublisher(ctrl),
		Telemetry:      telem,
	})
	require.NoError(t, err)
	require.NotNil(t, worker)
}
|
||||
|
||||
// TestHandleMessage_ContainerExited covers a terminal event from a
// healthy `running` record: status transitions to engine_unreachable
// and a snapshot is published.
func TestHandleMessage_ContainerExited(t *testing.T) {
	ctrl := gomock.NewController(t)
	h := newHarness(t, ctrl)
	gameID := "game-001"

	h.store.EXPECT().Get(gomock.Any(), gameID).Return(runningRecord(gameID, "healthy"), nil)
	h.store.EXPECT().UpdateStatus(gomock.Any(), gomock.Any()).DoAndReturn(
		func(_ context.Context, input ports.UpdateStatusInput) error {
			// The CAS input must carry the running → engine_unreachable
			// pair plus the "exited" summary.
			require.Equal(t, runtime.StatusRunning, input.ExpectedFrom)
			require.Equal(t, runtime.StatusEngineUnreachable, input.To)
			require.NotNil(t, input.EngineHealthSummary)
			require.Equal(t, "exited", *input.EngineHealthSummary)
			return nil
		},
	)
	// The second Get feeds the published snapshot with the mutated record.
	h.store.EXPECT().Get(gomock.Any(), gameID).Return(
		withStatus(withSummary(runningRecord(gameID, "healthy"), "exited"), runtime.StatusEngineUnreachable),
		nil,
	)
	h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).DoAndReturn(
		func(_ context.Context, snap ports.RuntimeSnapshotUpdate) error {
			assert.Equal(t, gameID, snap.GameID)
			assert.Equal(t, runtime.StatusEngineUnreachable, snap.RuntimeStatus)
			assert.Equal(t, "exited", snap.EngineHealthSummary)
			assert.Nil(t, snap.PlayerTurnStats)
			assert.Equal(t, h.now, snap.OccurredAt)
			return nil
		},
	)

	advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, "container_exited", h.now))
	assert.True(t, advance)
}
|
||||
|
||||
// TestHandleMessage_ProbeRecovered_Recovers demonstrates the symmetric
// recovery: engine_unreachable → running, summary set to healthy.
func TestHandleMessage_ProbeRecovered_Recovers(t *testing.T) {
	ctrl := gomock.NewController(t)
	h := newHarness(t, ctrl)
	gameID := "game-001"

	h.store.EXPECT().Get(gomock.Any(), gameID).Return(unreachableRecord(gameID, "exited"), nil)
	h.store.EXPECT().UpdateStatus(gomock.Any(), gomock.Any()).DoAndReturn(
		func(_ context.Context, input ports.UpdateStatusInput) error {
			// Recovery CAS runs in the opposite direction of the
			// terminal-event transitions.
			require.Equal(t, runtime.StatusEngineUnreachable, input.ExpectedFrom)
			require.Equal(t, runtime.StatusRunning, input.To)
			require.NotNil(t, input.EngineHealthSummary)
			require.Equal(t, "healthy", *input.EngineHealthSummary)
			return nil
		},
	)
	h.store.EXPECT().Get(gomock.Any(), gameID).Return(
		withStatus(withSummary(unreachableRecord(gameID, "exited"), "healthy"), runtime.StatusRunning),
		nil,
	)
	h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).DoAndReturn(
		func(_ context.Context, snap ports.RuntimeSnapshotUpdate) error {
			assert.Equal(t, runtime.StatusRunning, snap.RuntimeStatus)
			assert.Equal(t, "healthy", snap.EngineHealthSummary)
			return nil
		},
	)

	advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, "probe_recovered", h.now))
	assert.True(t, advance)
}
|
||||
|
||||
// TestHandleMessage_ContainerStarted_NoTransition asserts that
// container_started writes summary `healthy` without status mutation:
// only UpdateEngineHealth is expected, never UpdateStatus.
func TestHandleMessage_ContainerStarted_NoTransition(t *testing.T) {
	ctrl := gomock.NewController(t)
	h := newHarness(t, ctrl)
	gameID := "game-001"

	h.store.EXPECT().Get(gomock.Any(), gameID).Return(runningRecord(gameID, ""), nil)
	h.store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).DoAndReturn(
		func(_ context.Context, input ports.UpdateEngineHealthInput) error {
			assert.Equal(t, gameID, input.GameID)
			assert.Equal(t, "healthy", input.EngineHealthSummary)
			return nil
		},
	)
	h.store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(runningRecord(gameID, ""), "healthy"), nil)
	h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).Return(nil)

	advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, "container_started", h.now))
	assert.True(t, advance)
}
|
||||
|
||||
// TestHandleMessage_ProbeFailed covers the non-transitional path:
// summary is updated; status stays running.
func TestHandleMessage_ProbeFailed(t *testing.T) {
	ctrl := gomock.NewController(t)
	h := newHarness(t, ctrl)
	gameID := "game-001"

	h.store.EXPECT().Get(gomock.Any(), gameID).Return(runningRecord(gameID, "healthy"), nil)
	h.store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).Return(nil)
	h.store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(runningRecord(gameID, "healthy"), "probe_failed"), nil)
	h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).DoAndReturn(
		func(_ context.Context, snap ports.RuntimeSnapshotUpdate) error {
			// Status is untouched; only the summary reflects the failure.
			assert.Equal(t, runtime.StatusRunning, snap.RuntimeStatus)
			assert.Equal(t, "probe_failed", snap.EngineHealthSummary)
			return nil
		},
	)

	advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, "probe_failed", h.now))
	assert.True(t, advance)
}
|
||||
|
||||
// TestHandleMessage_FallsBackOnCASConflict — record is in
|
||||
// generation_in_progress (not running); CAS rejects with ErrConflict and
|
||||
// the worker falls back to UpdateEngineHealth + publishes a snapshot
|
||||
// because the summary changed.
|
||||
func TestHandleMessage_FallsBackOnCASConflict(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
gameID := "game-001"
|
||||
|
||||
current := withStatus(runningRecord(gameID, "healthy"), runtime.StatusGenerationInProgress)
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(current, nil)
|
||||
h.store.EXPECT().UpdateStatus(gomock.Any(), gomock.Any()).Return(runtime.ErrConflict)
|
||||
h.store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).DoAndReturn(
|
||||
func(_ context.Context, input ports.UpdateEngineHealthInput) error {
|
||||
assert.Equal(t, "oom", input.EngineHealthSummary)
|
||||
return nil
|
||||
},
|
||||
)
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(current, "oom"), nil)
|
||||
h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).DoAndReturn(
|
||||
func(_ context.Context, snap ports.RuntimeSnapshotUpdate) error {
|
||||
assert.Equal(t, runtime.StatusGenerationInProgress, snap.RuntimeStatus,
|
||||
"status must reflect the unchanged record after fallback")
|
||||
assert.Equal(t, "oom", snap.EngineHealthSummary)
|
||||
return nil
|
||||
},
|
||||
)
|
||||
|
||||
advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, "container_oom", h.now))
|
||||
assert.True(t, advance)
|
||||
}
|
||||
|
||||
// TestHandleMessage_DebouncesUnchangedSummary — two consecutive
|
||||
// probe_failed events for the same game yield exactly one snapshot
|
||||
// publication.
|
||||
func TestHandleMessage_DebouncesUnchangedSummary(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
gameID := "game-001"
|
||||
|
||||
// First event: store update + reload + publish.
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(runningRecord(gameID, "healthy"), nil)
|
||||
h.store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).Return(nil)
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(runningRecord(gameID, "healthy"), "probe_failed"), nil)
|
||||
h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).Return(nil)
|
||||
|
||||
// Second event: store update happens, but no second Get and no
|
||||
// publication since the summary is unchanged.
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(runningRecord(gameID, "probe_failed"), "probe_failed"), nil)
|
||||
h.store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).Return(nil)
|
||||
|
||||
ctx := context.Background()
|
||||
require.True(t, h.worker.HandleMessage(ctx, xMessage("0-1", gameID, "probe_failed", h.now)))
|
||||
require.True(t, h.worker.HandleMessage(ctx, xMessage("0-2", gameID, "probe_failed", h.now)))
|
||||
}
|
||||
|
||||
// TestHandleMessage_OrphanGameID — Get returns ErrNotFound, no further
|
||||
// store calls, no publish, offset advances.
|
||||
func TestHandleMessage_OrphanGameID(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
gameID := "missing-001"
|
||||
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(runtime.RuntimeRecord{}, runtime.ErrNotFound)
|
||||
|
||||
advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, "probe_failed", h.now))
|
||||
assert.True(t, advance)
|
||||
}
|
||||
|
||||
// TestHandleMessage_UnknownEventType — unrecognised event type yields
|
||||
// no store calls and no publication, but offset advances.
|
||||
func TestHandleMessage_UnknownEventType(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
|
||||
advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", "game-001", "future_event", h.now))
|
||||
assert.True(t, advance)
|
||||
}
|
||||
|
||||
// TestHandleMessage_MalformedOccurredAtMS — malformed wire payload is
|
||||
// logged + skipped without store calls.
|
||||
func TestHandleMessage_MalformedOccurredAtMS(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
|
||||
msg := redis.XMessage{
|
||||
ID: "0-1",
|
||||
Values: map[string]any{
|
||||
"game_id": "game-001",
|
||||
"event_type": "probe_failed",
|
||||
"occurred_at_ms": "not-a-number",
|
||||
},
|
||||
}
|
||||
advance := h.worker.HandleMessage(context.Background(), msg)
|
||||
assert.True(t, advance)
|
||||
}
|
||||
|
||||
// TestHandleMessage_MissingFields — missing required wire field is
|
||||
// logged + skipped.
|
||||
func TestHandleMessage_MissingFields(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
msg redis.XMessage
|
||||
}{
|
||||
{"missing game_id", redis.XMessage{ID: "0-1", Values: map[string]any{"event_type": "probe_failed", "occurred_at_ms": "1"}}},
|
||||
{"missing event_type", redis.XMessage{ID: "0-1", Values: map[string]any{"game_id": "g", "occurred_at_ms": "1"}}},
|
||||
{"missing occurred_at_ms", redis.XMessage{ID: "0-1", Values: map[string]any{"game_id": "g", "event_type": "probe_failed"}}},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
advance := h.worker.HandleMessage(context.Background(), tc.msg)
|
||||
assert.True(t, advance)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleMessage_PublishErrorAdvancesOffset — a publisher error is
|
||||
// logged and absorbed; the offset still advances so a transient hiccup
|
||||
// does not stall the consumer.
|
||||
func TestHandleMessage_PublishErrorAdvancesOffset(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
gameID := "game-001"
|
||||
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(runningRecord(gameID, "healthy"), nil)
|
||||
h.store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).Return(nil)
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(runningRecord(gameID, "healthy"), "probe_failed"), nil)
|
||||
h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).Return(errors.New("redis down"))
|
||||
|
||||
advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, "probe_failed", h.now))
|
||||
assert.True(t, advance)
|
||||
}
|
||||
|
||||
// TestHandleMessage_AllEventTypes_RouteSummaries asserts the event-type
|
||||
// → summary mapping for the four non-CAS event types, plus that
|
||||
// container_started is non-CAS too. The CAS variants are covered by
|
||||
// dedicated tests above.
|
||||
func TestHandleMessage_AllEventTypes_RouteSummaries(t *testing.T) {
|
||||
type expectation struct {
|
||||
eventType string
|
||||
wantSummary string
|
||||
wantsCASCall bool
|
||||
}
|
||||
cases := []expectation{
|
||||
{"container_started", "healthy", false},
|
||||
{"probe_failed", "probe_failed", false},
|
||||
{"inspect_unhealthy", "inspect_unhealthy", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.eventType, func(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
gameID := "game-001"
|
||||
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(runningRecord(gameID, ""), nil)
|
||||
h.store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).DoAndReturn(
|
||||
func(_ context.Context, input ports.UpdateEngineHealthInput) error {
|
||||
assert.Equal(t, tc.wantSummary, input.EngineHealthSummary)
|
||||
return nil
|
||||
},
|
||||
)
|
||||
h.store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(runningRecord(gameID, ""), tc.wantSummary), nil)
|
||||
h.publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).Return(nil)
|
||||
|
||||
advance := h.worker.HandleMessage(context.Background(), xMessage("0-1", gameID, tc.eventType, h.now))
|
||||
assert.True(t, advance)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestRun_LoadsOffsetAndAdvances drives a real XREAD loop against a
|
||||
// miniredis instance. After XADD-ing one entry and observing the loop
|
||||
// exit on context cancellation, the persisted offset must equal the
|
||||
// consumed entry's ID.
|
||||
func TestRun_LoadsOffsetAndAdvances(t *testing.T) {
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
store := mocks.NewMockRuntimeRecordStore(ctrl)
|
||||
publisher := mocks.NewMockLobbyEventsPublisher(ctrl)
|
||||
telem := newTestTelemetry(t)
|
||||
|
||||
gameID := "game-001"
|
||||
rec := runningRecord(gameID, "healthy")
|
||||
|
||||
var (
|
||||
mu sync.Mutex
|
||||
offset string
|
||||
offsetSet bool
|
||||
)
|
||||
offsetStore := mocks.NewMockStreamOffsetStore(ctrl)
|
||||
offsetStore.EXPECT().Load(gomock.Any(), testLabel).Return("", false, nil)
|
||||
offsetStore.EXPECT().Save(gomock.Any(), testLabel, gomock.Any()).DoAndReturn(
|
||||
func(_ context.Context, _ string, entryID string) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
offset = entryID
|
||||
offsetSet = true
|
||||
return nil
|
||||
},
|
||||
).MinTimes(1)
|
||||
|
||||
store.EXPECT().Get(gomock.Any(), gameID).Return(rec, nil)
|
||||
store.EXPECT().UpdateEngineHealth(gomock.Any(), gomock.Any()).Return(nil)
|
||||
store.EXPECT().Get(gomock.Any(), gameID).Return(withSummary(rec, "probe_failed"), nil)
|
||||
publisher.EXPECT().PublishSnapshotUpdate(gomock.Any(), gomock.Any()).Return(nil)
|
||||
|
||||
worker, err := healtheventsconsumer.NewWorker(healtheventsconsumer.Dependencies{
|
||||
Client: client,
|
||||
Stream: testStream,
|
||||
StreamLabel: testLabel,
|
||||
BlockTimeout: 100 * time.Millisecond,
|
||||
OffsetStore: offsetStore,
|
||||
RuntimeRecords: store,
|
||||
LobbyEvents: publisher,
|
||||
Telemetry: telem,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
occurredMS := strconv.FormatInt(time.Date(2026, time.May, 1, 12, 0, 0, 0, time.UTC).UnixMilli(), 10)
|
||||
entryID, err := client.XAdd(context.Background(), &redis.XAddArgs{
|
||||
Stream: testStream,
|
||||
Values: map[string]any{
|
||||
"game_id": gameID,
|
||||
"event_type": "probe_failed",
|
||||
"occurred_at_ms": occurredMS,
|
||||
"details": "{}",
|
||||
},
|
||||
}).Result()
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- worker.Run(ctx) }()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
set := offsetSet
|
||||
mu.Unlock()
|
||||
if set {
|
||||
break
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
|
||||
cancel()
|
||||
select {
|
||||
case err := <-done:
|
||||
assert.True(t, errors.Is(err, context.Canceled), "run must exit with context.Canceled, got %v", err)
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("worker did not exit within deadline")
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
require.True(t, offsetSet, "offset must be persisted at least once")
|
||||
assert.Equal(t, entryID, offset)
|
||||
}
|
||||
|
||||
// TestRun_ContextCancel — Run returns context.Canceled on cancel even
|
||||
// when no stream entry is available.
|
||||
func TestRun_ContextCancel(t *testing.T) {
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
store := mocks.NewMockRuntimeRecordStore(ctrl)
|
||||
publisher := mocks.NewMockLobbyEventsPublisher(ctrl)
|
||||
offsetStore := mocks.NewMockStreamOffsetStore(ctrl)
|
||||
offsetStore.EXPECT().Load(gomock.Any(), testLabel).Return("0-0", true, nil)
|
||||
|
||||
worker, err := healtheventsconsumer.NewWorker(healtheventsconsumer.Dependencies{
|
||||
Client: client,
|
||||
Stream: testStream,
|
||||
StreamLabel: testLabel,
|
||||
BlockTimeout: 50 * time.Millisecond,
|
||||
OffsetStore: offsetStore,
|
||||
RuntimeRecords: store,
|
||||
LobbyEvents: publisher,
|
||||
Telemetry: newTestTelemetry(t),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- worker.Run(ctx) }()
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
cancel()
|
||||
select {
|
||||
case err := <-done:
|
||||
assert.True(t, errors.Is(err, context.Canceled), "want context.Canceled, got %v", err)
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("worker did not exit within deadline")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRun_FailsOnOffsetLoadError covers the bootstrap failure: a load
|
||||
// error is fatal and surfaces from Run.
|
||||
func TestRun_FailsOnOffsetLoadError(t *testing.T) {
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
offsetStore := mocks.NewMockStreamOffsetStore(ctrl)
|
||||
offsetStore.EXPECT().Load(gomock.Any(), testLabel).Return("", false, errors.New("redis down"))
|
||||
|
||||
worker, err := healtheventsconsumer.NewWorker(healtheventsconsumer.Dependencies{
|
||||
Client: client,
|
||||
Stream: testStream,
|
||||
StreamLabel: testLabel,
|
||||
BlockTimeout: 50 * time.Millisecond,
|
||||
OffsetStore: offsetStore,
|
||||
RuntimeRecords: mocks.NewMockRuntimeRecordStore(ctrl),
|
||||
LobbyEvents: mocks.NewMockLobbyEventsPublisher(ctrl),
|
||||
Telemetry: newTestTelemetry(t),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
err = worker.Run(context.Background())
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "load offset")
|
||||
}
|
||||
|
||||
// TestShutdown_Noop confirms Shutdown returns nil for a non-nil ctx
|
||||
// and rejects a nil one.
|
||||
func TestShutdown_Noop(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
h := newHarness(t, ctrl)
|
||||
require.NoError(t, h.worker.Shutdown(context.Background()))
|
||||
|
||||
//nolint:staticcheck // Deliberate nil context to verify guard.
|
||||
require.Error(t, h.worker.Shutdown(nil))
|
||||
}
|
||||