// Package healtheventsconsumer implements the worker that consumes
// `runtime:health_events` from Runtime Manager and propagates engine
// health observations into Game Master state.
//
// On every consumed entry the worker:
//
//  1. Updates `runtime_records.engine_health` per game with a short
//     summary string (`healthy`, `probe_failed`, `inspect_unhealthy`,
//     `exited`, `oom`, `disappeared`).
//  2. For terminal container events (`container_exited`,
//     `container_oom`, `container_disappeared`) attempts a
//     compare-and-swap `running → engine_unreachable`. For
//     `probe_recovered` attempts the symmetric recovery CAS
//     `engine_unreachable → running`. Both transitions are pre-declared
//     in `domain/runtime/transitions.go`. CAS conflicts (record not in
//     the expected source status) fall back to a health-only update so
//     the summary stays current even when another flow (turn
//     generation, admin op) holds the status.
//  3. Publishes a `runtime_snapshot_update` on `gm:lobby_events` only
//     when the status transitioned or when the engine-health summary
//     differs from the previously emitted one for the same game. The
//     last-emitted summary is tracked in process memory; on restart
//     the cache is empty and the first event per game produces one
//     snapshot.
//
// The XREAD loop, offset handling, and shutdown semantics mirror the
// Lobby `gmevents` consumer at `lobby/internal/worker/gmevents`.
package healtheventsconsumer

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"strconv"
	"strings"
	"sync"
	"time"

	"galaxy/gamemaster/internal/domain/runtime"
	"galaxy/gamemaster/internal/logging"
	"galaxy/gamemaster/internal/ports"
	"galaxy/gamemaster/internal/telemetry"

	"github.com/redis/go-redis/v9"
)

// Wire field names on the `runtime:health_events` Redis Stream entry,
// fixed by `rtmanager/api/runtime-health-asyncapi.yaml`. Renaming any
// of them breaks the contract.
const (
	fieldGameID       = "game_id"
	fieldEventType    = "event_type"
	fieldOccurredAtMS = "occurred_at_ms"
)
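
// An entry on the stream carries exactly these wire fields. As an
// illustrative sketch only (the game id and timestamp are made up, not
// taken from the Runtime Manager implementation), a producer-side write
// could look like:
//
//	XADD runtime:health_events * game_id g-42 event_type container_oom occurred_at_ms 1700000000000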

// RTM event-type values per
// `rtmanager/internal/domain/health/snapshot.go`. Stage 18 maps all
// seven (the PLAN enumerates six; container_started and
// probe_recovered are added here).
const (
	eventTypeContainerStarted     = "container_started"
	eventTypeProbeRecovered       = "probe_recovered"
	eventTypeProbeFailed          = "probe_failed"
	eventTypeInspectUnhealthy     = "inspect_unhealthy"
	eventTypeContainerExited      = "container_exited"
	eventTypeContainerOOM         = "container_oom"
	eventTypeContainerDisappeared = "container_disappeared"
)

// engine_health summary strings written to `runtime_records.engine_health`.
const (
	summaryHealthy          = "healthy"
	summaryProbeFailed      = "probe_failed"
	summaryInspectUnhealthy = "inspect_unhealthy"
	summaryExited           = "exited"
	summaryOOM              = "oom"
	summaryDisappeared      = "disappeared"
)

// snapshotEventType is the discriminator written by
// `LobbyEventsPublisher.PublishSnapshotUpdate` and recorded on the
// `gamemaster.lobby_events.published` counter.
const snapshotEventType = "runtime_snapshot_update"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// Client provides XREAD access to the runtime:health_events stream.
	Client *redis.Client

	// Stream stores the Redis Streams key consumed by the worker
	// (typically `runtime:health_events`).
	Stream string

	// StreamLabel identifies the consumer in the stream-offset store.
	// Defaults to `health_events` when empty.
	StreamLabel string

	// BlockTimeout bounds the blocking XREAD window. Required positive.
	BlockTimeout time.Duration

	// OffsetStore persists the last successfully processed entry id.
	OffsetStore ports.StreamOffsetStore

	// RuntimeRecords is mutated on every observation.
	RuntimeRecords ports.RuntimeRecordStore

	// LobbyEvents publishes the debounced `runtime_snapshot_update`
	// messages that propagate health summary changes to Game Lobby.
	LobbyEvents ports.LobbyEventsPublisher

	// Telemetry receives one consumed-event count per processed entry
	// and one published-event count per emitted snapshot. Required.
	Telemetry *telemetry.Runtime

	// Clock supplies the wall-clock used for store updates and for
	// `RuntimeSnapshotUpdate.OccurredAt`. Defaults to `time.Now` when
	// nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// defaultStreamLabel is used when Dependencies.StreamLabel is empty.
const defaultStreamLabel = "health_events"

// Worker drives the runtime:health_events processing loop.
type Worker struct {
	client         *redis.Client
	stream         string
	streamLabel    string
	blockTimeout   time.Duration
	offsetStore    ports.StreamOffsetStore
	runtimeRecords ports.RuntimeRecordStore
	lobbyEvents    ports.LobbyEventsPublisher
	telemetry      *telemetry.Runtime
	clock          func() time.Time
	logger         *slog.Logger

	mu                 sync.RWMutex
	lastEmittedSummary map[string]string
}

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.Client == nil:
		return nil, errors.New("new health events consumer: nil redis client")
	case strings.TrimSpace(deps.Stream) == "":
		return nil, errors.New("new health events consumer: stream must not be empty")
	case deps.BlockTimeout <= 0:
		return nil, errors.New("new health events consumer: block timeout must be positive")
	case deps.OffsetStore == nil:
		return nil, errors.New("new health events consumer: nil offset store")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new health events consumer: nil runtime records store")
	case deps.LobbyEvents == nil:
		return nil, errors.New("new health events consumer: nil lobby events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new health events consumer: nil telemetry runtime")
	}

	streamLabel := strings.TrimSpace(deps.StreamLabel)
	if streamLabel == "" {
		streamLabel = defaultStreamLabel
	}
	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		client:             deps.Client,
		stream:             deps.Stream,
		streamLabel:        streamLabel,
		blockTimeout:       deps.BlockTimeout,
		offsetStore:        deps.OffsetStore,
		runtimeRecords:     deps.RuntimeRecords,
		lobbyEvents:        deps.LobbyEvents,
		telemetry:          deps.Telemetry,
		clock:              clock,
		logger:             logger.With("worker", "gamemaster.healtheventsconsumer", "stream", deps.Stream),
		lastEmittedSummary: make(map[string]string),
	}, nil
}
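
// NewWorker rejects any missing required dependency, so wiring the worker is
// a single literal. A minimal construction sketch, assuming the composition
// root already holds the named collaborators (redisClient, offsets, records,
// lobbyPublisher, and telemetryRuntime are placeholders, and the block
// timeout value is illustrative):
//
//	worker, err := NewWorker(Dependencies{
//		Client:         redisClient,
//		Stream:         "runtime:health_events",
//		BlockTimeout:   5 * time.Second,
//		OffsetStore:    offsets,
//		RuntimeRecords: records,
//		LobbyEvents:    lobbyPublisher,
//		Telemetry:      telemetryRuntime,
//	})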

// Run drives the XREAD loop until ctx is cancelled. The offset advances
// only after a successful HandleMessage call. The loop exits on context
// cancellation or a fatal Redis error.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run health events consumer: nil worker")
	}
	if ctx == nil {
		return errors.New("run health events consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	lastID, found, err := worker.offsetStore.Load(ctx, worker.streamLabel)
	if err != nil {
		return fmt.Errorf("run health events consumer: load offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	worker.logger.Info("health events consumer started",
		"block_timeout", worker.blockTimeout.String(),
		"start_entry_id", lastID,
	)
	defer worker.logger.Info("health events consumer stopped")

	for {
		streams, err := worker.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{worker.stream, lastID},
			Count:   1,
			Block:   worker.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					if !worker.HandleMessage(ctx, message) {
						continue
					}
					if err := worker.offsetStore.Save(ctx, worker.streamLabel, message.ID); err != nil {
						return fmt.Errorf("run health events consumer: save offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			return ctx.Err()
		case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed):
			return fmt.Errorf("run health events consumer: %w", err)
		default:
			return fmt.Errorf("run health events consumer: %w", err)
		}
	}
}
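
// Run blocks until the context ends, so callers typically start it on its own
// goroutine and cancel the context during shutdown. A minimal sketch of that
// wiring (the supervisor shape and error handling are assumptions, not part
// of this package):
//
//	ctx, cancel := context.WithCancel(context.Background())
//	go func() {
//		if err := worker.Run(ctx); err != nil && !errors.Is(err, context.Canceled) {
//			slog.Error("health events consumer exited", "err", err)
//		}
//	}()
//	// ... later, during shutdown:
//	cancel()
//	_ = worker.Shutdown(context.Background())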

// Shutdown is a no-op; the worker relies on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown health events consumer: nil context")
	}
	return nil
}

// HandleMessage processes one Redis Stream entry and reports whether
// the offset is allowed to advance. Decode errors and orphan game ids
// return true so the offset advances past the entry; only fatal store
// or publisher failures return false (currently never — every error is
// logged and absorbed, the offset always advances after the entry has
// been observed).
//
// Exported so tests can drive the worker deterministically without
// spinning up a real XREAD loop.
func (worker *Worker) HandleMessage(ctx context.Context, message redis.XMessage) bool {
	if worker == nil {
		return false
	}

	event, err := decodeEvent(message)
	if err != nil {
		worker.logger.WarnContext(ctx, "decode runtime health event",
			"stream_entry_id", message.ID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	plan, ok := planFor(event.EventType)
	if !ok {
		worker.logger.WarnContext(ctx, "unknown runtime health event type",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"event_type", event.EventType,
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	now := worker.clock().UTC()

	current, err := worker.runtimeRecords.Get(ctx, event.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			worker.logger.WarnContext(ctx, "runtime health event for unknown game",
				"stream_entry_id", message.ID,
				"game_id", event.GameID,
				"event_type", event.EventType,
			)
			worker.telemetry.RecordHealthEventConsumed(ctx)
			return true
		}
		worker.logger.WarnContext(ctx, "load runtime record for health event",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	statusChanged := worker.applyMutation(ctx, message.ID, current, plan, now)

	if !worker.shouldPublish(event.GameID, plan.summary, statusChanged) {
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	refreshed, err := worker.runtimeRecords.Get(ctx, event.GameID)
	if err != nil {
		worker.logger.WarnContext(ctx, "reload runtime record for snapshot",
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}

	snapshot := ports.RuntimeSnapshotUpdate{
		GameID:              refreshed.GameID,
		CurrentTurn:         refreshed.CurrentTurn,
		RuntimeStatus:       refreshed.Status,
		EngineHealthSummary: refreshed.EngineHealth,
		PlayerTurnStats:     nil,
		OccurredAt:          now,
	}
	if err := worker.lobbyEvents.PublishSnapshotUpdate(ctx, snapshot); err != nil {
		logArgs := []any{
			"stream_entry_id", message.ID,
			"game_id", event.GameID,
			"err", err.Error(),
		}
		logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
		worker.logger.WarnContext(ctx, "publish runtime snapshot update", logArgs...)
		worker.telemetry.RecordHealthEventConsumed(ctx)
		return true
	}
	worker.telemetry.RecordLobbyEventPublished(ctx, snapshotEventType)
	worker.rememberSummary(event.GameID, plan.summary)
	worker.telemetry.RecordHealthEventConsumed(ctx)
	return true
}
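
// Because HandleMessage is exported, a test can feed a synthetic entry
// without Redis. A minimal sketch (the entry id and field values are made
// up, and the worker is assumed to be built with in-memory fakes):
//
//	ok := worker.HandleMessage(ctx, redis.XMessage{
//		ID: "1700000000000-0",
//		Values: map[string]any{
//			"game_id":        "g-42",
//			"event_type":     "container_exited",
//			"occurred_at_ms": "1700000000000",
//		},
//	})
//	// ok == true: the offset may advance past this entry.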

// applyMutation applies the plan to the runtime record. When plan.transition
// is set, the worker first attempts a CAS UpdateStatus from the expected
// source status; on conflict or invalid-transition it falls back to a
// health-only UpdateEngineHealth. When plan.transition is nil only
// UpdateEngineHealth runs. Returns true when the status was actually
// transitioned.
func (worker *Worker) applyMutation(
	ctx context.Context,
	entryID string,
	current runtime.RuntimeRecord,
	plan eventPlan,
	now time.Time,
) bool {
	if plan.transition != nil {
		summary := plan.summary
		err := worker.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
			GameID:              current.GameID,
			ExpectedFrom:        plan.transition.from,
			To:                  plan.transition.to,
			Now:                 now,
			EngineHealthSummary: &summary,
		})
		switch {
		case err == nil:
			worker.logger.InfoContext(ctx, "runtime status transitioned by health event",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"from_status", string(plan.transition.from),
				"to_status", string(plan.transition.to),
				"engine_health", plan.summary,
			)
			return true
		case errors.Is(err, runtime.ErrConflict), errors.Is(err, runtime.ErrInvalidTransition):
			worker.logger.DebugContext(ctx, "runtime status CAS conflict, falling back to health-only update",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"current_status", string(current.Status),
				"expected_from", string(plan.transition.from),
				"engine_health", plan.summary,
			)
		default:
			worker.logger.WarnContext(ctx, "update runtime status from health event",
				"stream_entry_id", entryID,
				"game_id", current.GameID,
				"err", err.Error(),
			)
			return false
		}
	}

	if err := worker.runtimeRecords.UpdateEngineHealth(ctx, ports.UpdateEngineHealthInput{
		GameID:              current.GameID,
		EngineHealthSummary: plan.summary,
		Now:                 now,
	}); err != nil && !errors.Is(err, runtime.ErrNotFound) {
		worker.logger.WarnContext(ctx, "update runtime engine health",
			"stream_entry_id", entryID,
			"game_id", current.GameID,
			"err", err.Error(),
		)
	}
	return false
}

// shouldPublish returns whether a snapshot must be emitted: either the
// status changed in this iteration, or the engine_health summary
// differs from the last summary published for this game.
func (worker *Worker) shouldPublish(gameID, summary string, statusChanged bool) bool {
	if statusChanged {
		return true
	}
	worker.mu.RLock()
	last, ok := worker.lastEmittedSummary[gameID]
	worker.mu.RUnlock()
	if !ok {
		return true
	}
	return last != summary
}
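
// Worked debounce example for one game, assuming an empty cache and a record
// that starts in running (the event order is illustrative): the first
// probe_failed publishes because no summary is cached yet; a second
// probe_failed is skipped because the summary is unchanged and no transition
// happened; container_exited publishes because the CAS to engine_unreachable
// succeeds; probe_recovered then publishes because the recovery CAS back to
// running succeeds.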

// rememberSummary stores the latest published summary for gameID.
func (worker *Worker) rememberSummary(gameID, summary string) {
	worker.mu.Lock()
	worker.lastEmittedSummary[gameID] = summary
	worker.mu.Unlock()
}

// healthEvent stores the decoded XADD entry shared across handlers.
type healthEvent struct {
	GameID     string
	EventType  string
	OccurredAt time.Time
}

// decodeEvent parses a Redis Stream message into a healthEvent. Missing
// or malformed required fields produce an error.
func decodeEvent(message redis.XMessage) (healthEvent, error) {
	gameID := optionalString(message.Values, fieldGameID)
	if strings.TrimSpace(gameID) == "" {
		return healthEvent{}, errors.New("missing game_id")
	}
	eventType := optionalString(message.Values, fieldEventType)
	if strings.TrimSpace(eventType) == "" {
		return healthEvent{}, errors.New("missing event_type")
	}
	occurredAtMSRaw := optionalString(message.Values, fieldOccurredAtMS)
	if strings.TrimSpace(occurredAtMSRaw) == "" {
		return healthEvent{}, errors.New("missing occurred_at_ms")
	}
	occurredAtMS, err := strconv.ParseInt(occurredAtMSRaw, 10, 64)
	if err != nil {
		return healthEvent{}, fmt.Errorf("invalid occurred_at_ms: %w", err)
	}
	if occurredAtMS <= 0 {
		return healthEvent{}, errors.New("invalid occurred_at_ms: must be positive")
	}
	return healthEvent{
		GameID:     gameID,
		EventType:  eventType,
		OccurredAt: time.UnixMilli(occurredAtMS).UTC(),
	}, nil
}

// transitionPlan encodes one allowed CAS pair. nil-transition events
// only update the summary.
type transitionPlan struct {
	from runtime.Status
	to   runtime.Status
}

// eventPlan is the decoded reaction to one event_type.
type eventPlan struct {
	summary    string
	transition *transitionPlan
}

// planFor returns the eventPlan registered for eventType. The boolean
// reports whether the type is recognised.
func planFor(eventType string) (eventPlan, bool) {
	switch eventType {
	case eventTypeContainerStarted:
		return eventPlan{summary: summaryHealthy}, true
	case eventTypeProbeRecovered:
		return eventPlan{
			summary: summaryHealthy,
			transition: &transitionPlan{
				from: runtime.StatusEngineUnreachable,
				to:   runtime.StatusRunning,
			},
		}, true
	case eventTypeProbeFailed:
		return eventPlan{summary: summaryProbeFailed}, true
	case eventTypeInspectUnhealthy:
		return eventPlan{summary: summaryInspectUnhealthy}, true
	case eventTypeContainerExited:
		return eventPlan{
			summary: summaryExited,
			transition: &transitionPlan{
				from: runtime.StatusRunning,
				to:   runtime.StatusEngineUnreachable,
			},
		}, true
	case eventTypeContainerOOM:
		return eventPlan{
			summary: summaryOOM,
			transition: &transitionPlan{
				from: runtime.StatusRunning,
				to:   runtime.StatusEngineUnreachable,
			},
		}, true
	case eventTypeContainerDisappeared:
		return eventPlan{
			summary: summaryDisappeared,
			transition: &transitionPlan{
				from: runtime.StatusRunning,
				to:   runtime.StatusEngineUnreachable,
			},
		}, true
	default:
		return eventPlan{}, false
	}
}

func optionalString(values map[string]any, key string) string {
	raw, ok := values[key]
	if !ok {
		return ""
	}
	switch typed := raw.(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	default:
		return ""
	}
}