// Package healtheventsconsumer implements the worker that consumes
// `runtime:health_events` from Runtime Manager and propagates engine
// health observations into Game Master state.
//
// On every consumed entry the worker:
//
//  1. Updates `runtime_records.engine_health` per game with a short
//     summary string (`healthy`, `probe_failed`, `inspect_unhealthy`,
//     `exited`, `oom`, `disappeared`).
//  2. For terminal container events (`container_exited`,
//     `container_oom`, `container_disappeared`) attempts a
//     compare-and-swap `running → engine_unreachable`. For
//     `probe_recovered` attempts the symmetric recovery CAS
//     `engine_unreachable → running`. Both transitions are pre-declared
//     in `domain/runtime/transitions.go`. CAS conflicts (record not in
//     the expected source status) fall back to a health-only update so
//     the summary stays current even when another flow (turn
//     generation, admin op) holds the status.
//  3. Publishes a `runtime_snapshot_update` on `gm:lobby_events` only
//     when the status transitioned or when the engine-health summary
//     differs from the previously emitted one for the same game. The
//     last-emitted summary is tracked in process memory; on restart
//     the cache is empty and the first event per game produces one
//     snapshot.
//
// The XREAD loop, offset handling, and shutdown semantics mirror the
// Lobby `gmevents` consumer at `lobby/internal/worker/gmevents`.
package healtheventsconsumer

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"strings"
	"sync"
	"time"

	"galaxy/gamemaster/internal/domain/runtime"
	"galaxy/gamemaster/internal/logging"
	"galaxy/gamemaster/internal/ports"
	"galaxy/gamemaster/internal/telemetry"

	"github.com/redis/go-redis/v9"
)

// Wire field names on the `runtime:health_events` Redis Stream entry,
// fixed by `rtmanager/api/runtime-health-asyncapi.yaml`. Renaming any
// of them breaks the contract.
const (
	fieldGameID       = "game_id"
	fieldEventType    = "event_type"
	fieldOccurredAtMS = "occurred_at_ms"
)

// RTM event-type values per
// `rtmanager/internal/domain/health/snapshot.go`. Stage 18 maps all
// seven; container_started and probe_recovered are additions beyond
// the PLAN's enumeration.
const (
	eventTypeContainerStarted     = "container_started"
	eventTypeProbeRecovered       = "probe_recovered"
	eventTypeProbeFailed          = "probe_failed"
	eventTypeInspectUnhealthy     = "inspect_unhealthy"
	eventTypeContainerExited      = "container_exited"
	eventTypeContainerOOM         = "container_oom"
	eventTypeContainerDisappeared = "container_disappeared"
)

// engine_health summary strings written to `runtime_records.engine_health`.
const (
	summaryHealthy          = "healthy"
	summaryProbeFailed      = "probe_failed"
	summaryInspectUnhealthy = "inspect_unhealthy"
	summaryExited           = "exited"
	summaryOOM              = "oom"
	summaryDisappeared      = "disappeared"
)

// snapshotEventType is the discriminator written by
// `LobbyEventsPublisher.PublishSnapshotUpdate` and recorded on the
// `gamemaster.lobby_events.published` counter.
const snapshotEventType = "runtime_snapshot_update"
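
// As an illustrative sketch only (the concrete values are invented, not taken
// from a real deployment), a snapshot emitted after a container_oom event on a
// running game would carry roughly:
//
//	ports.RuntimeSnapshotUpdate{
//		GameID:              "game-1", // hypothetical id
//		CurrentTurn:         7,        // whatever the refreshed record holds
//		RuntimeStatus:       runtime.StatusEngineUnreachable,
//		EngineHealthSummary: summaryOOM,
//		PlayerTurnStats:     nil, // never populated by this worker
//		OccurredAt:          time.Now().UTC(),
//	}
//
// See HandleMessage for the fields that are actually copied from the
// refreshed runtime record.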

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// Client provides XREAD access to the runtime:health_events stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker
	// (typically `runtime:health_events`).
	Stream string

	// StreamLabel identifies the consumer in the stream-offset store.
	// Defaults to `health_events` when empty.
	StreamLabel string

	// BlockTimeout bounds the blocking XREAD window. Required positive.
	BlockTimeout time.Duration

	// OffsetStore persists the last successfully processed entry id.
	OffsetStore ports.StreamOffsetStore

	// RuntimeRecords is mutated on every observation.
	RuntimeRecords ports.RuntimeRecordStore

	// LobbyEvents publishes the debounced `runtime_snapshot_update`
	// messages that propagate health summary changes to Game Lobby.
	LobbyEvents ports.LobbyEventsPublisher

	// Telemetry receives one consumed-event count per processed entry
	// and one published-event count per emitted snapshot. Required.
	Telemetry *telemetry.Runtime

	// Clock supplies the wall-clock used for store updates and for
	// `RuntimeSnapshotUpdate.OccurredAt`. Defaults to `time.Now` when
	// nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// defaultStreamLabel is used when Dependencies.StreamLabel is empty.
const defaultStreamLabel = "health_events"

// Worker drives the runtime:health_events processing loop.
type Worker struct {
	client         *redis.Client
	stream         string
	streamLabel    string
	blockTimeout   time.Duration
	offsetStore    ports.StreamOffsetStore
	runtimeRecords ports.RuntimeRecordStore
	lobbyEvents    ports.LobbyEventsPublisher
	telemetry      *telemetry.Runtime
	clock          func() time.Time
	logger         *slog.Logger

	mu                 sync.RWMutex
	lastEmittedSummary map[string]string
}

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.Client == nil:
		return nil, errors.New("new health events consumer: nil redis client")
	case strings.TrimSpace(deps.Stream) == "":
		return nil, errors.New("new health events consumer: stream must not be empty")
	case deps.BlockTimeout <= 0:
		return nil, errors.New("new health events consumer: block timeout must be positive")
	case deps.OffsetStore == nil:
		return nil, errors.New("new health events consumer: nil offset store")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new health events consumer: nil runtime records store")
	case deps.LobbyEvents == nil:
		return nil, errors.New("new health events consumer: nil lobby events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new health events consumer: nil telemetry runtime")
	}

	streamLabel := strings.TrimSpace(deps.StreamLabel)
	if streamLabel == "" {
		streamLabel = defaultStreamLabel
	}
	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		client:             deps.Client,
		stream:             deps.Stream,
		streamLabel:        streamLabel,
		blockTimeout:       deps.BlockTimeout,
		offsetStore:        deps.OffsetStore,
		runtimeRecords:     deps.RuntimeRecords,
		lobbyEvents:        deps.LobbyEvents,
		telemetry:          deps.Telemetry,
		clock:              clock,
		logger:             logger.With("worker", "gamemaster.healtheventsconsumer", "stream", deps.Stream),
		lastEmittedSummary: make(map[string]string),
	}, nil
}
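
// A minimal wiring sketch, shown only as orientation; every lowercase helper
// and variable below (newRedisOffsetStore, newRuntimeRecordStore,
// newLobbyPublisher, telemetryRuntime, redisClient, pool, ctx) is a
// hypothetical placeholder, not a constructor defined in this repository.
//
//	worker, err := healtheventsconsumer.NewWorker(healtheventsconsumer.Dependencies{
//		Client:         redisClient,
//		Stream:         "runtime:health_events",
//		BlockTimeout:   5 * time.Second,
//		OffsetStore:    newRedisOffsetStore(redisClient),
//		RuntimeRecords: newRuntimeRecordStore(pool),
//		LobbyEvents:    newLobbyPublisher(redisClient),
//		Telemetry:      telemetryRuntime,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()
//
// StreamLabel, Clock, and Logger fall back to their documented defaults when
// left zero-valued.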

// Run drives the XREAD loop until ctx is cancelled. The offset advances
// only after a successful HandleMessage call. The loop exits on context
// cancellation or a fatal Redis error.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run health events consumer: nil worker")
	}
	if ctx == nil {
		return errors.New("run health events consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := worker.offsetStore.Load(ctx, worker.streamLabel)
	if err != nil {
		return fmt.Errorf("run health events consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	worker.logger.Info("health events consumer started",
		"block_timeout", worker.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer worker.logger.Info("health events consumer stopped")

	for {
		streams, err := worker.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{worker.stream, lastID},
			Count:   1,
			Block:   worker.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					if !worker.HandleMessage(ctx, message) {
						continue
					}
					if err := worker.offsetStore.Save(ctx, worker.streamLabel, message.ID); err != nil {
						return fmt.Errorf("run health events consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) ||
			errors.Is(err, context.DeadlineExceeded) ||
			errors.Is(err, redis.ErrClosed)):
			return ctx.Err()
		case errors.Is(err, context.Canceled),
			errors.Is(err, context.DeadlineExceeded),
			errors.Is(err, redis.ErrClosed):
			return fmt.Errorf("run health events consumer: %w", err)
		default:
			return fmt.Errorf("run health events consumer: %w", err)
		}
	}
}

// Shutdown is a no-op; the worker relies on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown health events consumer: nil context")
	}
	return nil
}
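
// Because HandleMessage is exported, a test can feed the worker a hand-built
// entry without any Redis connection. A rough sketch, assuming a constructed
// *Worker named worker and made-up id values:
//
//	msg := redis.XMessage{
//		ID: "1700000000000-0",
//		Values: map[string]any{
//			fieldGameID:       "game-1",
//			fieldEventType:    eventTypeContainerExited,
//			fieldOccurredAtMS: "1700000000000",
//		},
//	}
//	advanced := worker.HandleMessage(context.Background(), msg)
//	// advanced is true: container_exited maps to summaryExited plus the
//	// running → engine_unreachable CAS, and the offset may move on.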

// HandleMessage processes one Redis Stream entry and reports whether
// the offset is allowed to advance. Decode errors and orphan game ids
// return true so the offset advances past the entry; only fatal store
// or publisher failures return false (currently never — every error is
// logged and absorbed, the offset always advances after the entry has
// been observed).
//
// Exported so tests can drive the worker deterministically without
// spinning up a real XREAD loop.
func (worker *Worker) HandleMessage(ctx context.Context, message redis.XMessage) bool {
	if worker == nil {
		return false
	}

	event, err := decodeEvent(message)
	if err != nil {
		worker.logger.WarnContext(ctx, "decode runtime health event",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	plan, ok := planFor(event.EventType)
	if !ok {
		worker.logger.WarnContext(ctx, "unknown runtime health event type",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"event_type", event.EventType,
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	now := worker.clock().UTC()

	current, err := worker.runtimeRecords.Get(ctx, event.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			worker.logger.WarnContext(ctx, "runtime health event for unknown game",
				"stream_entry_id", message.ID,
				"game_id", event.GameID,
				"event_type", event.EventType,
			)
			worker.telemetry.RecordHealthEventConsumed(ctx)
			return true
		}
		worker.logger.WarnContext(ctx, "load runtime record for health event",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	statusChanged := worker.applyMutation(ctx, message.ID, current, plan, now)

	if !worker.shouldPublish(event.GameID, plan.summary, statusChanged) {
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	refreshed, err := worker.runtimeRecords.Get(ctx, event.GameID)
	if err != nil {
		worker.logger.WarnContext(ctx, "reload runtime record for snapshot",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	snapshot := ports.RuntimeSnapshotUpdate{
		GameID:              refreshed.GameID,
		CurrentTurn:         refreshed.CurrentTurn,
		RuntimeStatus:       refreshed.Status,
		EngineHealthSummary: refreshed.EngineHealth,
		PlayerTurnStats:     nil,
		OccurredAt:          now,
	}
	if err := worker.lobbyEvents.PublishSnapshotUpdate(ctx, snapshot); err != nil {
		logArgs := []any{
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		}
		logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
		worker.logger.WarnContext(ctx, "publish runtime snapshot update", logArgs...)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	worker.telemetry.RecordLobbyEventPublished(ctx, snapshotEventType)
	worker.rememberSummary(event.GameID, plan.summary)
	worker.telemetry.RecordHealthEventConsumed(ctx)
	return true
}
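
// One concrete trace of the CAS fallback, purely illustrative:
//
//	event_type = probe_recovered        // plan: healthy, CAS engine_unreachable → running
//	record.Status = running             // another flow already recovered the game
//	UpdateStatus(...)                   // returns runtime.ErrConflict
//	UpdateEngineHealth(summary=healthy) // health-only fallback, status untouched
//	applyMutation returns false         // snapshot only if "healthy" differs from last emitted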

// applyMutation applies the plan to the runtime record. When plan.transition
// is set, the worker first attempts a CAS UpdateStatus from the expected
// source status; on conflict or invalid-transition it falls back to a
// health-only UpdateEngineHealth. When plan.transition is nil only
// UpdateEngineHealth runs. Returns true when the status was actually
// transitioned.
func (worker *Worker) applyMutation(
	ctx context.Context,
	entryID string,
	current runtime.RuntimeRecord,
	plan eventPlan,
	now time.Time,
) bool {
	if plan.transition != nil {
		summary := plan.summary
		err := worker.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
			GameID:              current.GameID,
			ExpectedFrom:        plan.transition.from,
			To:                  plan.transition.to,
			Now:                 now,
			EngineHealthSummary: &summary,
		})
		switch {
		case err == nil:
			worker.logger.InfoContext(ctx, "runtime status transitioned by health event",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"from_status", string(plan.transition.from),
				"to_status", string(plan.transition.to),
				"engine_health", plan.summary,
			)
			return true
		case errors.Is(err, runtime.ErrConflict), errors.Is(err, runtime.ErrInvalidTransition):
			worker.logger.DebugContext(ctx, "runtime status CAS conflict, falling back to health-only update",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"current_status", string(current.Status),
				"expected_from", string(plan.transition.from),
				"engine_health", plan.summary,
			)
		default:
			worker.logger.WarnContext(ctx, "update runtime status from health event",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"err", err.Error(),
			)
			return false
		}
	}

	if err := worker.runtimeRecords.UpdateEngineHealth(ctx, ports.UpdateEngineHealthInput{
		GameID:              current.GameID,
		EngineHealthSummary: plan.summary,
		Now:                 now,
	}); err != nil && !errors.Is(err, runtime.ErrNotFound) {
		worker.logger.WarnContext(ctx, "update runtime engine health",
			"stream_entry_id", entryID,
			"game_id", current.GameID,
			"err", err.Error(),
		)
	}
	return false
}

// shouldPublish returns whether a snapshot must be emitted: either the
// status changed in this iteration, or the engine_health summary
// differs from the last summary published for this game.
func (worker *Worker) shouldPublish(gameID, summary string, statusChanged bool) bool {
	if statusChanged {
		return true
	}
	worker.mu.RLock()
	last, ok := worker.lastEmittedSummary[gameID]
	worker.mu.RUnlock()
	if !ok {
		return true
	}
	return last != summary
}

// rememberSummary stores the latest published summary for gameID.
func (worker *Worker) rememberSummary(gameID, summary string) {
	worker.mu.Lock()
	worker.lastEmittedSummary[gameID] = summary
	worker.mu.Unlock()
}

// healthEvent stores the decoded XADD entry shared across handlers.
type healthEvent struct {
	GameID     string
	EventType  string
	OccurredAt time.Time
}

// decodeEvent parses a Redis Stream message into a healthEvent. Missing
// or malformed required fields produce an error.
func decodeEvent(message redis.XMessage) (healthEvent, error) {
	gameID := optionalString(message.Values, fieldGameID)
	if strings.TrimSpace(gameID) == "" {
		return healthEvent{}, errors.New("missing game_id")
	}
	eventType := optionalString(message.Values, fieldEventType)
	if strings.TrimSpace(eventType) == "" {
		return healthEvent{}, errors.New("missing event_type")
	}
	occurredAtMSRaw := optionalString(message.Values, fieldOccurredAtMS)
	if strings.TrimSpace(occurredAtMSRaw) == "" {
		return healthEvent{}, errors.New("missing occurred_at_ms")
	}
	occurredAtMS, err := strconv.ParseInt(occurredAtMSRaw, 10, 64)
	if err != nil {
		return healthEvent{}, fmt.Errorf("invalid occurred_at_ms: %w", err)
	}
	if occurredAtMS <= 0 {
		return healthEvent{}, errors.New("invalid occurred_at_ms: must be positive")
	}

	return healthEvent{
		GameID:     gameID,
		EventType:  eventType,
		OccurredAt: time.UnixMilli(occurredAtMS).UTC(),
	}, nil
}
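
// For reference, a well-formed stream entry as decodeEvent expects it. The
// concrete values are illustrative only; the producer side is fixed by
// `rtmanager/api/runtime-health-asyncapi.yaml`.
//
//	XADD runtime:health_events * \
//		game_id game-1 \
//		event_type probe_failed \
//		occurred_at_ms 1700000000000
//
// decodeEvent rejects the entry when any of the three fields is missing or
// empty, or when occurred_at_ms is not a positive base-10 integer.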

// transitionPlan encodes one allowed CAS pair. nil-transition events
// only update the summary.
type transitionPlan struct {
	from runtime.Status
	to   runtime.Status
}

// eventPlan is the decoded reaction to one event_type.
type eventPlan struct {
	summary    string
	transition *transitionPlan
}

// planFor returns the eventPlan registered for eventType. The boolean
// reports whether the type is recognised.
func planFor(eventType string) (eventPlan, bool) {
	switch eventType {
	case eventTypeContainerStarted:
		return eventPlan{summary: summaryHealthy}, true
	case eventTypeProbeRecovered:
		return eventPlan{
			summary: summaryHealthy,
			transition: &transitionPlan{
				from: runtime.StatusEngineUnreachable,
				to:   runtime.StatusRunning,
			},
		}, true
	case eventTypeProbeFailed:
		return eventPlan{summary: summaryProbeFailed}, true
	case eventTypeInspectUnhealthy:
		return eventPlan{summary: summaryInspectUnhealthy}, true
	case eventTypeContainerExited:
		return eventPlan{
			summary: summaryExited,
			transition: &transitionPlan{
				from: runtime.StatusRunning,
				to:   runtime.StatusEngineUnreachable,
			},
		}, true
	case eventTypeContainerOOM:
		return eventPlan{
			summary: summaryOOM,
			transition: &transitionPlan{
				from: runtime.StatusRunning,
				to:   runtime.StatusEngineUnreachable,
			},
		}, true
	case eventTypeContainerDisappeared:
		return eventPlan{
			summary: summaryDisappeared,
			transition: &transitionPlan{
				from: runtime.StatusRunning,
				to:   runtime.StatusEngineUnreachable,
			},
		}, true
	default:
		return eventPlan{}, false
	}
}

// optionalString returns the string form of values[key], accepting string
// and []byte payloads; a missing key or any other type yields "".
func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}