// Package dockerinspect runs the periodic Docker inspect described in // `rtmanager/README.md §Health Monitoring`. // // On every tick the worker lists `runtime_records.status=running`, // inspects each container, and emits `inspect_unhealthy` when any of // the following holds: // // - `RestartCount` increased between observations (delta detection // requires a prior observation; the first inspect of a record only // records the baseline); // - `State.Status != "running"`; // - `State.Health.Status == "unhealthy"` (only meaningful when the // image declares a Docker HEALTHCHECK). // // `ErrContainerNotFound` is left to the reconciler — the inspect // worker logs and skips so that `container_disappeared` emission // stays single-sourced (Docker events listener + reconciler). // // Per-game state is pruned at the start of every tick against the // freshly-read running list, so a stopped or removed game never // carries a stale baseline into a new lifecycle. package dockerinspect import ( "context" "encoding/json" "errors" "log/slog" "sync" "time" "galaxy/rtmanager/internal/domain/health" "galaxy/rtmanager/internal/domain/runtime" "galaxy/rtmanager/internal/logging" "galaxy/rtmanager/internal/ports" "galaxy/rtmanager/internal/telemetry" ) // dockerStateRunning is the verbatim Docker `State.Status` value the // worker treats as healthy. const dockerStateRunning = "running" // dockerHealthUnhealthy is the verbatim Docker `State.Health.Status` // value the worker treats as unhealthy. const dockerHealthUnhealthy = "unhealthy" // Dependencies groups the collaborators required by Worker. type Dependencies struct { // Docker provides the InspectContainer surface. Docker ports.DockerClient // RuntimeRecords lists running games on every tick. RuntimeRecords ports.RuntimeRecordStore // HealthEvents emits `inspect_unhealthy` entries. HealthEvents ports.HealthEventPublisher // Telemetry records one health-event counter per emission. Telemetry *telemetry.Runtime // Interval bounds the tick period. Interval time.Duration // Clock supplies the wall-clock used for emission timestamps. // Defaults to `time.Now` when nil. Clock func() time.Time // Logger receives structured worker-level events. Defaults to // `slog.Default()` when nil. Logger *slog.Logger } // Worker drives the periodic inspect loop. type Worker struct { docker ports.DockerClient runtimeRecords ports.RuntimeRecordStore healthEvents ports.HealthEventPublisher telemetry *telemetry.Runtime interval time.Duration clock func() time.Time logger *slog.Logger mu sync.Mutex states map[string]*inspectState } // inspectState stores the per-game baseline. Owned by Worker and // protected by Worker.mu. type inspectState struct { lastRestartCount int seen bool } // NewWorker constructs one Worker from deps. func NewWorker(deps Dependencies) (*Worker, error) { switch { case deps.Docker == nil: return nil, errors.New("new docker inspect worker: nil docker client") case deps.RuntimeRecords == nil: return nil, errors.New("new docker inspect worker: nil runtime records store") case deps.HealthEvents == nil: return nil, errors.New("new docker inspect worker: nil health events publisher") case deps.Telemetry == nil: return nil, errors.New("new docker inspect worker: nil telemetry runtime") case deps.Interval <= 0: return nil, errors.New("new docker inspect worker: interval must be positive") } clock := deps.Clock if clock == nil { clock = time.Now } logger := deps.Logger if logger == nil { logger = slog.Default() } return &Worker{ docker: deps.Docker, runtimeRecords: deps.RuntimeRecords, healthEvents: deps.HealthEvents, telemetry: deps.Telemetry, interval: deps.Interval, clock: clock, logger: logger.With("worker", "rtmanager.dockerinspect"), states: map[string]*inspectState{}, }, nil } // Run drives the inspect loop until ctx is cancelled. Per-tick errors // are absorbed; the loop only exits on context cancellation. func (worker *Worker) Run(ctx context.Context) error { if worker == nil { return errors.New("run docker inspect worker: nil worker") } if ctx == nil { return errors.New("run docker inspect worker: nil context") } if err := ctx.Err(); err != nil { return err } worker.logger.Info("docker inspect worker started", "interval", worker.interval.String(), ) defer worker.logger.Info("docker inspect worker stopped") ticker := time.NewTicker(worker.interval) defer ticker.Stop() for { select { case <-ctx.Done(): return ctx.Err() case <-ticker.C: worker.tick(ctx) } } } // Shutdown is a no-op; Run terminates on context cancellation. func (worker *Worker) Shutdown(ctx context.Context) error { if ctx == nil { return errors.New("shutdown docker inspect worker: nil context") } return nil } // Tick performs one inspect pass. Exported so tests can drive the // worker deterministically without spinning a real ticker. func (worker *Worker) Tick(ctx context.Context) { worker.tick(ctx) } // tick performs one full pass: list running records, prune state for // stopped games, then inspect every running container sequentially. // Inspect calls are cheap; sequential execution avoids fan-out against // the Docker daemon. func (worker *Worker) tick(ctx context.Context) { if err := ctx.Err(); err != nil { return } records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning) if err != nil { worker.logger.WarnContext(ctx, "list running records", "err", err.Error(), ) return } worker.pruneStates(records) for _, record := range records { if err := ctx.Err(); err != nil { return } worker.inspectOne(ctx, record) } } // pruneStates removes per-game baselines for games no longer in the // running list. func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) { worker.mu.Lock() defer worker.mu.Unlock() if len(worker.states) == 0 { return } running := make(map[string]struct{}, len(records)) for _, record := range records { running[record.GameID] = struct{}{} } for gameID := range worker.states { if _, ok := running[gameID]; !ok { delete(worker.states, gameID) } } } // inspectOne issues one InspectContainer call and emits // `inspect_unhealthy` when the observation crosses any of the three // trigger conditions. The first observation of a record only seeds the // baseline; deltas need at least two ticks. func (worker *Worker) inspectOne(ctx context.Context, record runtime.RuntimeRecord) { inspect, err := worker.docker.InspectContainer(ctx, record.CurrentContainerID) if err != nil { if errors.Is(err, ports.ErrContainerNotFound) { worker.logger.DebugContext(ctx, "inspect skipped: container missing", "game_id", record.GameID, "container_id", record.CurrentContainerID, ) return } worker.logger.WarnContext(ctx, "inspect failed", "game_id", record.GameID, "container_id", record.CurrentContainerID, "err", err.Error(), ) return } worker.mu.Lock() state, ok := worker.states[record.GameID] if !ok { state = &inspectState{} worker.states[record.GameID] = state } prev := *state state.lastRestartCount = inspect.RestartCount state.seen = true worker.mu.Unlock() emit := false switch { case prev.seen && inspect.RestartCount > prev.lastRestartCount: emit = true case inspect.Status != dockerStateRunning: emit = true case inspect.Health == dockerHealthUnhealthy: emit = true } if !emit { return } worker.publish(ctx, ports.HealthEventEnvelope{ GameID: record.GameID, ContainerID: record.CurrentContainerID, EventType: health.EventTypeInspectUnhealthy, OccurredAt: worker.clock().UTC(), Details: inspectUnhealthyDetails(inspect.RestartCount, inspect.Status, inspect.Health), }) } // publish emits one envelope through the configured publisher, updates // the telemetry counter, and logs the outcome. Failures degrade to a // warning log per `rtmanager/README.md §Notification Contracts`. func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) { if err := worker.healthEvents.Publish(ctx, envelope); err != nil { worker.logger.ErrorContext(ctx, "publish health event", "game_id", envelope.GameID, "container_id", envelope.ContainerID, "event_type", string(envelope.EventType), "err", err.Error(), ) return } worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType)) logArgs := []any{ "game_id", envelope.GameID, "container_id", envelope.ContainerID, "event_type", string(envelope.EventType), } logArgs = append(logArgs, logging.ContextAttrs(ctx)...) worker.logger.InfoContext(ctx, "inspect event published", logArgs...) } // inspectUnhealthyDetails builds the JSON payload required by the // `inspect_unhealthy` AsyncAPI variant. All three fields are required // even when their value is the zero value. func inspectUnhealthyDetails(restartCount int, state, health string) json.RawMessage { payload := struct { RestartCount int `json:"restart_count"` State string `json:"state"` Health string `json:"health"` }{ RestartCount: restartCount, State: state, Health: health, } encoded, _ := json.Marshal(payload) return encoded }