Files
galaxy-game/rtmanager/internal/worker/dockerinspect/worker.go
T
2026-04-28 20:39:18 +02:00

319 lines
9.2 KiB
Go

// Package dockerinspect runs the periodic Docker inspect described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// inspects each container, and emits `inspect_unhealthy` when any of
// the following holds:
//
// - `RestartCount` increased between observations (delta detection
// requires a prior observation; the first inspect of a record only
// records the baseline);
// - `State.Status != "running"`;
// - `State.Health.Status == "unhealthy"` (only meaningful when the
// image declares a Docker HEALTHCHECK).
//
// `ErrContainerNotFound` is left to the reconciler — the inspect
// worker logs and skips so that `container_disappeared` emission
// stays single-sourced (Docker events listener + reconciler).
//
// Per-game state is pruned at the start of every tick against the
// freshly-read running list, so a stopped or removed game never
// carries a stale baseline into a new lifecycle.
package dockerinspect
import (
"context"
"encoding/json"
"errors"
"log/slog"
"sync"
"time"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/telemetry"
)
// Verbatim Docker status strings the worker compares observations
// against.
const (
	// dockerStateRunning is the Docker `State.Status` value the
	// worker treats as healthy; any other status triggers emission.
	dockerStateRunning = "running"

	// dockerHealthUnhealthy is the Docker `State.Health.Status`
	// value the worker treats as unhealthy (only meaningful when the
	// image declares a HEALTHCHECK).
	dockerHealthUnhealthy = "unhealthy"
)
// Dependencies groups the collaborators required by Worker. All
// fields except Clock and Logger are mandatory; NewWorker returns an
// error when any mandatory field is nil or Interval is non-positive.
type Dependencies struct {
	// Docker provides the InspectContainer surface.
	Docker ports.DockerClient
	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore
	// HealthEvents emits `inspect_unhealthy` entries.
	HealthEvents ports.HealthEventPublisher
	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime
	// Interval bounds the tick period; must be positive.
	Interval time.Duration
	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time
	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}
// Worker drives the periodic inspect loop. Construct via NewWorker;
// the zero value is unusable (nil collaborators).
type Worker struct {
	docker         ports.DockerClient         // inspect surface
	runtimeRecords ports.RuntimeRecordStore   // source of running games per tick
	healthEvents   ports.HealthEventPublisher // sink for inspect_unhealthy envelopes
	telemetry      *telemetry.Runtime         // per-emission counter
	interval       time.Duration              // tick period, validated > 0
	clock          func() time.Time           // emission timestamp source
	logger         *slog.Logger               // pre-tagged with worker name
	// mu guards states; tick runs sequentially but Tick may be
	// driven concurrently by tests.
	mu     sync.Mutex
	states map[string]*inspectState // per-game restart-count baselines, keyed by game ID
}
// inspectState stores the per-game baseline used for RestartCount
// delta detection. Owned by Worker and protected by Worker.mu.
type inspectState struct {
	// lastRestartCount is the RestartCount observed on the most
	// recent inspect of this game's container.
	lastRestartCount int
	// seen is false until the first inspect completes; the first
	// observation only seeds the baseline and can never trigger a
	// restart-count delta.
	seen bool
}
// NewWorker constructs one Worker from deps. Every collaborator
// except Clock and Logger is mandatory; Interval must be positive.
// Clock defaults to time.Now and Logger to slog.Default.
func NewWorker(deps Dependencies) (*Worker, error) {
	if deps.Docker == nil {
		return nil, errors.New("new docker inspect worker: nil docker client")
	}
	if deps.RuntimeRecords == nil {
		return nil, errors.New("new docker inspect worker: nil runtime records store")
	}
	if deps.HealthEvents == nil {
		return nil, errors.New("new docker inspect worker: nil health events publisher")
	}
	if deps.Telemetry == nil {
		return nil, errors.New("new docker inspect worker: nil telemetry runtime")
	}
	if deps.Interval <= 0 {
		return nil, errors.New("new docker inspect worker: interval must be positive")
	}

	wallClock := deps.Clock
	if wallClock == nil {
		wallClock = time.Now
	}
	baseLogger := deps.Logger
	if baseLogger == nil {
		baseLogger = slog.Default()
	}

	worker := &Worker{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		healthEvents:   deps.HealthEvents,
		telemetry:      deps.Telemetry,
		interval:       deps.Interval,
		clock:          wallClock,
		logger:         baseLogger.With("worker", "rtmanager.dockerinspect"),
		states:         map[string]*inspectState{},
	}
	return worker, nil
}
// Run drives the inspect loop until ctx is cancelled. Per-tick errors
// are absorbed inside tick; the only way out of the loop is context
// cancellation, whose cause is returned.
func (worker *Worker) Run(ctx context.Context) error {
	switch {
	case worker == nil:
		return errors.New("run docker inspect worker: nil worker")
	case ctx == nil:
		return errors.New("run docker inspect worker: nil context")
	}
	if cause := ctx.Err(); cause != nil {
		return cause
	}

	worker.logger.Info("docker inspect worker started",
		"interval", worker.interval.String(),
	)
	defer worker.logger.Info("docker inspect worker stopped")

	pulse := time.NewTicker(worker.interval)
	defer pulse.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-pulse.C:
			worker.tick(ctx)
		}
	}
}
// Shutdown is a no-op beyond argument validation; Run already
// terminates on context cancellation, so there is nothing to tear
// down here.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx != nil {
		return nil
	}
	return errors.New("shutdown docker inspect worker: nil context")
}
// Tick performs one inspect pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker; it is a
// thin wrapper over the same tick that Run's ticker invokes.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}
// tick performs one full pass: read the running record list, prune
// baselines for games that left the running set, then inspect each
// remaining container one at a time. Sequential on purpose — inspect
// calls are cheap and fan-out would hammer the Docker daemon.
func (worker *Worker) tick(ctx context.Context) {
	if ctx.Err() != nil {
		return
	}
	running, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}
	worker.pruneStates(running)
	for _, rec := range running {
		// Re-check cancellation between inspects so shutdown is not
		// delayed by a long record list.
		if ctx.Err() != nil {
			return
		}
		worker.inspectOne(ctx, rec)
	}
}
// pruneStates drops the baseline of every game absent from records,
// so a stopped or removed game never carries a stale baseline into a
// new lifecycle.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	keep := make(map[string]struct{}, len(records))
	for _, rec := range records {
		keep[rec.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, stillRunning := keep[gameID]; stillRunning {
			continue
		}
		delete(worker.states, gameID)
	}
}
// inspectOne issues one InspectContainer call and emits
// `inspect_unhealthy` when the observation crosses any trigger:
// restart-count delta (needs a prior baseline, so at least two
// ticks), non-running state, or an explicit unhealthy health status.
// ErrContainerNotFound is deliberately skipped so that
// container_disappeared emission stays single-sourced elsewhere.
func (worker *Worker) inspectOne(ctx context.Context, record runtime.RuntimeRecord) {
	observation, err := worker.docker.InspectContainer(ctx, record.CurrentContainerID)
	if err != nil {
		if errors.Is(err, ports.ErrContainerNotFound) {
			worker.logger.DebugContext(ctx, "inspect skipped: container missing",
				"game_id", record.GameID,
				"container_id", record.CurrentContainerID,
			)
			return
		}
		worker.logger.WarnContext(ctx, "inspect failed",
			"game_id", record.GameID,
			"container_id", record.CurrentContainerID,
			"err", err.Error(),
		)
		return
	}

	// Swap in the fresh baseline under the lock, keeping a copy of
	// the previous one for delta detection.
	worker.mu.Lock()
	entry := worker.states[record.GameID]
	if entry == nil {
		entry = &inspectState{}
		worker.states[record.GameID] = entry
	}
	previous := *entry
	entry.lastRestartCount = observation.RestartCount
	entry.seen = true
	worker.mu.Unlock()

	restartDelta := previous.seen && observation.RestartCount > previous.lastRestartCount
	notRunning := observation.Status != dockerStateRunning
	unhealthy := observation.Health == dockerHealthUnhealthy
	if !restartDelta && !notRunning && !unhealthy {
		return
	}

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeInspectUnhealthy,
		OccurredAt:  worker.clock().UTC(),
		Details:     inspectUnhealthyDetails(observation.RestartCount, observation.Status, observation.Health),
	})
}
// publish emits one envelope through the configured publisher,
// updates the telemetry counter, and logs the outcome. Publish
// failures are absorbed and degrade to a warning log per
// `rtmanager/README.md §Notification Contracts`; the telemetry
// counter is only recorded for successful emissions.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	// Shared correlation attrs so success and failure logs carry the
	// same identifiers plus any request-scoped context attributes
	// (previously the failure path omitted the context attrs).
	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)

	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		// Warn, not Error: the documented contract degrades publish
		// failures to a warning rather than escalating.
		worker.logger.WarnContext(ctx, "publish health event",
			append(logArgs, "err", err.Error())...,
		)
		return
	}
	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
	worker.logger.InfoContext(ctx, "inspect event published", logArgs...)
}
// inspectUnhealthyDetails builds the JSON payload required by the
// `inspect_unhealthy` AsyncAPI variant. All three fields are required
// even when their value is the zero value, hence no `omitempty` tags.
//
// The string parameters carry the container's `State.Status` and
// `State.Health.Status` values; they were renamed so the second no
// longer shadows the imported `health` package.
func inspectUnhealthyDetails(restartCount int, containerState, healthStatus string) json.RawMessage {
	payload := struct {
		RestartCount int    `json:"restart_count"`
		State        string `json:"state"`
		Health       string `json:"health"`
	}{
		RestartCount: restartCount,
		State:        containerState,
		Health:       healthStatus,
	}
	// Marshal cannot fail for a plain struct of an int and two
	// strings, so the error is deliberately discarded.
	encoded, _ := json.Marshal(payload)
	return encoded
}