// Package dockerinspect runs the periodic Docker inspect described in
// `rtmanager/README.md §Health Monitoring`.
//
// On every tick the worker lists `runtime_records.status=running`,
// inspects each container, and emits `inspect_unhealthy` when any of
// the following holds:
//
//   - `RestartCount` increased between observations (delta detection
//     requires a prior observation; the first inspect of a record only
//     records the baseline);
//   - `State.Status != "running"`;
//   - `State.Health.Status == "unhealthy"` (only meaningful when the
//     image declares a Docker HEALTHCHECK).
//
// `ErrContainerNotFound` is left to the reconciler — the inspect
// worker logs and skips so that `container_disappeared` emission
// stays single-sourced (Docker events listener + reconciler).
//
// Per-game state is pruned at the start of every tick against the
// freshly-read running list, so a stopped or removed game never
// carries a stale baseline into a new lifecycle.
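//
// A minimal wiring sketch (dockerClient, recordStore, publisher,
// telemetryRuntime, and ctx are assumed to come from the caller's
// composition root; the interval shown is illustrative):
//
//	worker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
//		Docker:         dockerClient,
//		RuntimeRecords: recordStore,
//		HealthEvents:   publisher,
//		Telemetry:      telemetryRuntime,
//		Interval:       30 * time.Second,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()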
package dockerinspect

import (
	"context"
	"encoding/json"
	"errors"
	"log/slog"
	"sync"
	"time"

	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/telemetry"
)

// dockerStateRunning is the verbatim Docker `State.Status` value the
// worker treats as healthy.
const dockerStateRunning = "running"

// dockerHealthUnhealthy is the verbatim Docker `State.Health.Status`
// value the worker treats as unhealthy.
const dockerHealthUnhealthy = "unhealthy"

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// Docker provides the InspectContainer surface.
	Docker ports.DockerClient

	// RuntimeRecords lists running games on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// HealthEvents emits `inspect_unhealthy` entries.
	HealthEvents ports.HealthEventPublisher

	// Telemetry records one health-event counter per emission.
	Telemetry *telemetry.Runtime

	// Interval bounds the tick period.
	Interval time.Duration

	// Clock supplies the wall-clock used for emission timestamps.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic inspect loop.
type Worker struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	healthEvents   ports.HealthEventPublisher
	telemetry      *telemetry.Runtime

	interval time.Duration

	clock  func() time.Time
	logger *slog.Logger

	mu     sync.Mutex
	states map[string]*inspectState
}

// inspectState stores the per-game baseline. Owned by Worker and
// protected by Worker.mu.
type inspectState struct {
	lastRestartCount int
	seen             bool
}

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new docker inspect worker: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new docker inspect worker: nil runtime records store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new docker inspect worker: nil health events publisher")
	case deps.Telemetry == nil:
		return nil, errors.New("new docker inspect worker: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new docker inspect worker: interval must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		healthEvents:   deps.HealthEvents,
		telemetry:      deps.Telemetry,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.dockerinspect"),
		states:         map[string]*inspectState{},
	}, nil
}

// Run drives the inspect loop until ctx is cancelled. Per-tick errors
// are absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run docker inspect worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run docker inspect worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("docker inspect worker started",
		"interval", worker.interval.String(),
	)
	defer worker.logger.Info("docker inspect worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown docker inspect worker: nil context")
	}
	return nil
}

// Tick performs one inspect pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker.
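//
// A test might drive two deterministic passes like this (a sketch; the
// fake dependencies and fixedTime are assumptions of the example, not
// helpers this package provides):
//
//	worker, _ := NewWorker(Dependencies{
//		Docker:         fakeDocker,
//		RuntimeRecords: fakeStore,
//		HealthEvents:   fakePublisher,
//		Telemetry:      testTelemetry,
//		Interval:       time.Second,
//		Clock:          func() time.Time { return fixedTime },
//	})
//	worker.Tick(ctx) // first pass only seeds RestartCount baselines
//	worker.Tick(ctx) // second pass can observe a restart delta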
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick performs one full pass: list running records, prune state for
// stopped games, then inspect every running container sequentially.
// Inspect calls are cheap; sequential execution avoids fan-out against
// the Docker daemon.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		worker.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	worker.pruneStates(records)

	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		worker.inspectOne(ctx, record)
	}
}

// pruneStates removes per-game baselines for games no longer in the
// running list.
func (worker *Worker) pruneStates(records []runtime.RuntimeRecord) {
	worker.mu.Lock()
	defer worker.mu.Unlock()
	if len(worker.states) == 0 {
		return
	}
	running := make(map[string]struct{}, len(records))
	for _, record := range records {
		running[record.GameID] = struct{}{}
	}
	for gameID := range worker.states {
		if _, ok := running[gameID]; !ok {
			delete(worker.states, gameID)
		}
	}
}

// inspectOne issues one InspectContainer call and emits
// `inspect_unhealthy` when the observation crosses any of the three
// trigger conditions. The first observation of a record only seeds the
// baseline; deltas need at least two ticks.
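//
// Restart-delta detection across ticks, assuming the container stays
// in state "running" with no failing HEALTHCHECK:
//
//	tick 1: RestartCount=2, no baseline  -> seed only, no emission
//	tick 2: RestartCount=3 > baseline 2  -> emit inspect_unhealthy
//	tick 3: RestartCount=3 == baseline 3 -> no emission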
func (worker *Worker) inspectOne(ctx context.Context, record runtime.RuntimeRecord) {
	inspect, err := worker.docker.InspectContainer(ctx, record.CurrentContainerID)
	if err != nil {
		if errors.Is(err, ports.ErrContainerNotFound) {
			worker.logger.DebugContext(ctx, "inspect skipped: container missing",
				"game_id", record.GameID,
				"container_id", record.CurrentContainerID,
			)
			return
		}
		worker.logger.WarnContext(ctx, "inspect failed",
			"game_id", record.GameID,
			"container_id", record.CurrentContainerID,
			"err", err.Error(),
		)
		return
	}

	worker.mu.Lock()
	state, ok := worker.states[record.GameID]
	if !ok {
		state = &inspectState{}
		worker.states[record.GameID] = state
	}
	prev := *state
	state.lastRestartCount = inspect.RestartCount
	state.seen = true
	worker.mu.Unlock()

	emit := false
	switch {
	case prev.seen && inspect.RestartCount > prev.lastRestartCount:
		emit = true
	case inspect.Status != dockerStateRunning:
		emit = true
	case inspect.Health == dockerHealthUnhealthy:
		emit = true
	}
	if !emit {
		return
	}

	worker.publish(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeInspectUnhealthy,
		OccurredAt:  worker.clock().UTC(),
		Details:     inspectUnhealthyDetails(inspect.RestartCount, inspect.Status, inspect.Health),
	})
}

// publish emits one envelope through the configured publisher, updates
// the telemetry counter, and logs the outcome. Publish failures are
// logged and otherwise absorbed per `rtmanager/README.md §Notification
// Contracts`; they never abort the surrounding tick.
func (worker *Worker) publish(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := worker.healthEvents.Publish(ctx, envelope); err != nil {
		worker.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}

	worker.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))

	logArgs := []any{
		"game_id", envelope.GameID,
		"container_id", envelope.ContainerID,
		"event_type", string(envelope.EventType),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	worker.logger.InfoContext(ctx, "inspect event published", logArgs...)
}

// inspectUnhealthyDetails builds the JSON payload required by the
// `inspect_unhealthy` AsyncAPI variant. All three fields are required
// even when their value is the zero value.
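//
// A container that exited after three restarts would, for example,
// serialize as:
//
//	{"restart_count":3,"state":"exited","health":""}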
func inspectUnhealthyDetails(restartCount int, state, health string) json.RawMessage {
	payload := struct {
		RestartCount int    `json:"restart_count"`
		State        string `json:"state"`
		Health       string `json:"health"`
	}{
		RestartCount: restartCount,
		State:        state,
		Health:       health,
	}
	encoded, _ := json.Marshal(payload)
	return encoded
}