// Package dockerinspect runs the periodic Docker inspect described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// inspects each container, and emits `inspect_unhealthy` when any of
// the following holds:
//
//   - `RestartCount` increased between observations (delta detection
//     requires a prior observation; the first inspect of a record only
//     records the baseline);
//   - `State.Status != "running"`;
//   - `State.Health.Status == "unhealthy"` (only meaningful when the
//     image declares a Docker HEALTHCHECK).
//
// `ErrContainerNotFound` is left to the reconciler — the inspect
// worker logs and skips so that `container_disappeared` emission
// stays single-sourced (Docker events listener + reconciler).
//
// Per-game state is pruned at the start of every tick against the
// freshly-read running list, so a stopped or removed game never
// carries a stale baseline into a new lifecycle.
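//
// A minimal wiring sketch (dockerClient, recordStore, publisher,
// telemetryRuntime, and ctx are assumed to come from the caller's
// composition root; the interval shown is illustrative):
//
//	worker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
//		Docker:         dockerClient,
//		RuntimeRecords: recordStore,
//		HealthEvents:   publisher,
//		Telemetry:      telemetryRuntime,
//		Interval:       30 * time.Second,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()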
package dockerinspect

import (
	"context"
	"encoding/json"
	"errors"
	"log/slog"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// dockerStateRunning is the verbatim Docker `State.Status` value the
// worker treats as healthy.
const dockerStateRunning = "running"

// dockerHealthUnhealthy is the verbatim Docker `State.Health.Status`
// value the worker treats as unhealthy.
const dockerHealthUnhealthy = "unhealthy"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// Docker provides the InspectContainer surface.
	Docker ports.DockerClient

	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `inspect_unhealthy` entries.
	HealthEvents ports.HealthEventPublisher

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic inspect loop.
type Worker struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	telemetry      *telemetry.Runtime

	interval time.Duration

	clock  func() time.Time
	logger *slog.Logger

	mu     sync.Mutex
	states map[string]*inspectState
}

// inspectState stores the per-game baseline. Owned by Worker and
// protected by Worker.mu.
type inspectState struct {
	lastRestartCount int
	seen             bool
}

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new docker inspect worker: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new docker inspect worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new docker inspect worker: nil health events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new docker inspect worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new docker inspect worker: interval must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		healthEvents:   deps.HealthEvents,
		telemetry:      deps.Telemetry,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.dockerinspect"),
		states:         map[string]*inspectState{},
	}, nil
}

// Run drives the inspect loop until ctx is cancelled. Per-tick errors
// are absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run docker inspect worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run docker inspect worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("docker inspect worker started",
		"interval", worker.interval.String(),
	)
	defer worker.logger.Info("docker inspect worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown docker inspect worker: nil context")
	}
	return nil
}

// Tick performs one inspect pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker.
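//
// A test might drive two deterministic passes like this (a sketch; the
// fake dependencies and fixedTime are assumptions of the example, not
// helpers this package provides):
//
//	worker, _ := NewWorker(Dependencies{
//		Docker:         fakeDocker,
//		RuntimeRecords: fakeStore,
//		HealthEvents:   fakePublisher,
//		Telemetry:      testTelemetry,
//		Interval:       time.Second,
//		Clock:          func() time.Time { return fixedTime },
//	})
//	worker.Tick(ctx) // first pass only seeds RestartCount baselines
//	worker.Tick(ctx) // second pass can observe a restart delta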
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then inspect every running container sequentially.
// Inspect calls are cheap; sequential execution avoids fan-out against
// the Docker daemon.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)

	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		worker.inspectOne(ctx, record)
	}
}

// pruneStates removes per-game baselines for games no longer in the
// running list.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// inspectOne issues one InspectContainer call and emits
// `inspect_unhealthy` when the observation crosses any of the three
// trigger conditions. The first observation of a record only seeds the
// baseline; deltas need at least two ticks.
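//
// Restart-delta detection across ticks, assuming the container stays
// in state "running" with no failing HEALTHCHECK:
//
//	tick 1: RestartCount=2, no baseline  -> seed only, no emission
//	tick 2: RestartCount=3 > baseline 2  -> emit inspect_unhealthy
//	tick 3: RestartCount=3 == baseline 3 -> no emission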
func (worker *Worker) inspectOne(ctx context.Context, record runtime.RuntimeRecord) {
	inspect, err := worker.docker.InspectContainer(ctx, record.CurrentContainerID)
	if err != nil {
		if errors.Is(err, ports.ErrContainerNotFound) {
			worker.logger.DebugContext(ctx, "inspect skipped: container missing",
				"game_id", record.GameID,
				"container_id", record.CurrentContainerID,
			)
			return
		}
		worker.logger.WarnContext(ctx, "inspect failed",
			"game_id", record.GameID,
			"container_id", record.CurrentContainerID,
			"err", err.Error(),
		)
		return
	}

	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		state = &inspectState{}
		worker.states[record.GameID] = state
	}
	prev := *state
	state.lastRestartCount = inspect.RestartCount
	state.seen = true
	worker.mu.Unlock()

	emit := false
	switch {
	case prev.seen && inspect.RestartCount > prev.lastRestartCount:
		emit = true
	case inspect.Status != dockerStateRunning:
		emit = true
	case inspect.Health == dockerHealthUnhealthy:
		emit = true
	}
	if !emit {
		return
	}

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeInspectUnhealthy,
		OccurredAt:  worker.clock().UTC(),
		Details:     inspectUnhealthyDetails(inspect.RestartCount, inspect.Status, inspect.Health),
	})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Publish failures are
// logged and otherwise absorbed per `rtmanager/README.md §Notification
// Contracts`; they never abort the surrounding tick.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "inspect event published", logArgs...)
}

// inspectUnhealthyDetails builds the JSON payload required by the
// `inspect_unhealthy` AsyncAPI variant. All three fields are required
// even when their value is the zero value.
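//
// A container that exited after three restarts would, for example,
// serialize as:
//
//	{"restart_count":3,"state":"exited","health":""}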
func inspectUnhealthyDetails(restartCount int, state, health string) json.RawMessage {
	payload := struct {
		RestartCount int    `json:"restart_count"`
		State        string `json:"state"`
		Health       string `json:"health"`
	}{
		RestartCount: restartCount,
		State:        state,
		Health:       health,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}