feat: runtime manager
@@ -0,0 +1,678 @@
// Package reconcile implements the drift reconciliation worker
// described in `rtmanager/README.md §Reconciliation`. The reconciler
// is the single authority that brings `runtime_records` into agreement
// with the Docker daemon's view of `com.galaxy.owner=rtmanager`
// containers.
//
// Three drift kinds are handled:
//
//   - Adopt — a running container labelled `com.galaxy.owner=rtmanager`
//     has no matching `runtime_records` row. The reconciler inserts a
//     `status=running` record (`op_kind=reconcile_adopt`).
//   - Dispose — a `status=running` row whose `current_container_id` is
//     no longer reported by Docker. The reconciler updates the row to
//     `status=removed`, publishes `runtime:health_events`
//     `container_disappeared`, and appends `reconcile_dispose`.
//   - Observed exited — a `status=running` row whose container exists
//     but reports `State.Status=exited`. The reconciler transitions
//     the row to `status=stopped` and publishes `container_exited`
//     with the observed exit code. No `operation_log` entry is written
//     because `OpKind` does not include a value for this transition;
//     it is reflected in `rtmanager.reconcile_drift{kind=observed_exited}`
//     instead.
//
// All write decisions for a given `game_id` are guarded by the per-game
// Redis lease; the read pass that lists Docker containers and PG
// records is lockless.
//
// The reconciler runs once synchronously at process start
// (`ReconcileNow`) before any other worker is allowed to start, and
// then periodically via `Run` as an `app.Component`. Design rationale
// is captured in `rtmanager/docs/workers.md`.
package reconcile

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strconv"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// dockerStateRunning is the verbatim Docker `State.Status` value the
// reconciler treats as "the container is alive".
const dockerStateRunning = "running"

// dockerStateExited is the verbatim Docker `State.Status` value the
// reconciler treats as "the container has terminated".
const dockerStateExited = "exited"

// driftKindAdopt / driftKindDispose / driftKindObservedExited match the
// `kind` label vocabulary on `rtmanager.reconcile_drift`.
const (
	driftKindAdopt          = "adopt"
	driftKindDispose        = "dispose"
	driftKindObservedExited = "observed_exited"
)

// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even if the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second

// Dependencies groups the collaborators required by Reconciler.
type Dependencies struct {
	Docker         ports.DockerClient
	RuntimeRecords ports.RuntimeRecordStore
	OperationLogs  ports.OperationLogStore
	HealthEvents   ports.HealthEventPublisher
	Leases         ports.GameLeaseStore

	Telemetry *telemetry.Runtime

	DockerCfg    config.DockerConfig
	ContainerCfg config.ContainerConfig
	Coordination config.CoordinationConfig

	// Interval bounds the periodic tick. ReconcileNow ignores it.
	Interval time.Duration

	Clock    func() time.Time
	Logger   *slog.Logger
	NewToken func() string
}

// Reconciler drives both the synchronous initial pass and the periodic
// drift reconciliation loop.
type Reconciler struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	healthEvents   ports.HealthEventPublisher
	leases         ports.GameLeaseStore

	telemetry *telemetry.Runtime

	dockerNetwork string
	stateRoot     string
	leaseTTL      time.Duration

	interval time.Duration

	clock    func() time.Time
	logger   *slog.Logger
	newToken func() string
}

// NewReconciler constructs one Reconciler from deps.
func NewReconciler(deps Dependencies) (*Reconciler, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new reconciler: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new reconciler: nil runtime records store")
	case deps.OperationLogs == nil:
		return nil, errors.New("new reconciler: nil operation log store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new reconciler: nil health events publisher")
	case deps.Leases == nil:
		return nil, errors.New("new reconciler: nil lease store")
	case deps.Telemetry == nil:
		return nil, errors.New("new reconciler: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new reconciler: interval must be positive")
	}
	if err := deps.DockerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: docker config: %w", err)
	}
	if err := deps.ContainerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: container config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Reconciler{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		healthEvents:   deps.HealthEvents,
		leases:         deps.Leases,
		telemetry:      deps.Telemetry,
		dockerNetwork:  deps.DockerCfg.Network,
		stateRoot:      deps.ContainerCfg.GameStateRoot,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.reconcile"),
		newToken:       newToken,
	}, nil
}

// ReconcileNow performs one full reconciliation pass synchronously.
// It is intended for the startup path described in
// `rtmanager/README.md §Startup dependencies` (step 6). Per-game
// errors are absorbed into telemetry and logs; only ctx errors are
// surfaced to the caller so a cancelled startup aborts immediately.
func (reconciler *Reconciler) ReconcileNow(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("reconcile now: nil reconciler")
	}
	if ctx == nil {
		return errors.New("reconcile now: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	reconciler.tick(ctx)
	return ctx.Err()
}

// Run drives the periodic reconciliation loop. It does not perform an
// immediate first pass — `ReconcileNow` covers that path; the first
// tick fires after `Interval`. Run terminates on context cancellation.
func (reconciler *Reconciler) Run(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("run reconciler: nil reconciler")
	}
	if ctx == nil {
		return errors.New("run reconciler: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	reconciler.logger.Info("reconciler started",
		"interval", reconciler.interval.String(),
	)
	defer reconciler.logger.Info("reconciler stopped")

	ticker := time.NewTicker(reconciler.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			reconciler.tick(ctx)
		}
	}
}
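
// A typical caller-side startup sequence looks roughly like this (a
// minimal sketch; the surrounding wiring and error handling belong to
// the caller, not this package):
//
//	r, err := reconcile.NewReconciler(deps)
//	if err != nil {
//		return err
//	}
//	// Synchronous first pass: must finish before other workers start.
//	if err := r.ReconcileNow(ctx); err != nil {
//		return err
//	}
//	// Periodic loop thereafter, typically registered as an app.Component.
//	go func() { _ = r.Run(ctx) }()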

// Shutdown is a no-op; Run terminates on context cancellation.
func (reconciler *Reconciler) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown reconciler: nil context")
	}
	return nil
}

// Tick performs one reconciliation pass. Exported so tests can drive
// the reconciler deterministically without spinning a real ticker.
func (reconciler *Reconciler) Tick(ctx context.Context) {
	reconciler.tick(ctx)
}

// tick executes one full pass: list Docker containers + PG records,
// resolve drift, and apply lease-guarded mutations for each affected
// game.
func (reconciler *Reconciler) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	containers, err := reconciler.docker.List(ctx, ports.ListFilter{
		Labels: map[string]string{startruntime.LabelOwner: startruntime.LabelOwnerValue},
	})
	if err != nil {
		reconciler.logger.WarnContext(ctx, "list owned containers",
			"err", err.Error(),
		)
		return
	}

	records, err := reconciler.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	containerByGame := make(map[string]ports.ContainerSummary, len(containers))
	for _, summary := range containers {
		gameID := summary.Labels[startruntime.LabelGameID]
		if gameID == "" {
			continue
		}
		containerByGame[gameID] = summary
	}

	recordByGame := make(map[string]runtime.RuntimeRecord, len(records))
	for _, record := range records {
		recordByGame[record.GameID] = record
}
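
	// First pass: adopt owned running containers that have no
	// `status=running` record.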
	for gameID, summary := range containerByGame {
		if err := ctx.Err(); err != nil {
			return
		}
		if _, ok := recordByGame[gameID]; ok {
			continue
		}
		if summary.Status != dockerStateRunning {
			continue
		}
		reconciler.adoptOne(ctx, gameID, summary)
}
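
	// Second pass: dispose running records whose container vanished,
	// and mark records whose own container reports `exited` as stopped.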
	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		summary, ok := containerByGame[record.GameID]
		if !ok {
			reconciler.disposeOne(ctx, record)
			continue
}
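		// A different container already owns the game (a restart is
		// in flight); the reconciler stays out of the way.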
		if summary.ID != record.CurrentContainerID {
			continue
		}
		if summary.Status == dockerStateExited {
			reconciler.observedExitedOne(ctx, record, summary)
		}
	}
}

// adoptOne installs a `runtime_records` row for an unrecorded running
// container under the per-game lease.
func (reconciler *Reconciler) adoptOne(ctx context.Context, gameID string, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, gameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "adopt: acquire lease",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "adopt: lease busy, skipping",
			"game_id", gameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, gameID, token)

	if _, err := reconciler.runtimeRecords.Get(ctx, gameID); err == nil {
		reconciler.logger.InfoContext(ctx, "adopt: record appeared concurrently, skipping",
			"game_id", gameID,
		)
		return
	} else if !errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.WarnContext(ctx, "adopt: read record",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}

	startedAt := reconciler.resolveStartedAt(ctx, summary)
	imageRef := summary.Labels[startruntime.LabelEngineImageRef]
	if imageRef == "" {
		imageRef = summary.ImageRef
	}

	now := reconciler.clock().UTC()
	createdAt := now
	if startedAt.Before(createdAt) {
		createdAt = startedAt
	}
	record := runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: summary.ID,
		CurrentImageRef:    imageRef,
		EngineEndpoint:     reconciler.engineEndpoint(gameID),
		StatePath:          filepath.Join(reconciler.stateRoot, gameID),
		DockerNetwork:      reconciler.dockerNetwork,
		StartedAt:          &startedAt,
		LastOpAt:           now,
		CreatedAt:          createdAt,
	}
	if err := reconciler.runtimeRecords.Upsert(ctx, record); err != nil {
		reconciler.logger.ErrorContext(ctx, "adopt: upsert record",
			"game_id", gameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      gameID,
		OpKind:      operation.OpKindReconcileAdopt,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    imageRef,
		ContainerID: summary.ID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindAdopt)

	logArgs := []any{
		"game_id", gameID,
		"container_id", summary.ID,
		"image_ref", imageRef,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler adopted unrecorded container", logArgs...)
}

// disposeOne transitions a `running` record whose container is missing
// in Docker to `removed` and publishes `container_disappeared`.
func (reconciler *Reconciler) disposeOne(ctx context.Context, record runtime.RuntimeRecord) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "dispose: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "dispose: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
defer reconciler.releaseLease(ctx, record.GameID, token)
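
	// Re-read under the lease: the list pass was lockless, so the
	// record may have changed since it was observed.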
	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "dispose: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != record.CurrentContainerID {
		reconciler.logger.InfoContext(ctx, "dispose: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: record.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "dispose: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "dispose: update status",
			"game_id", record.GameID,
			"container_id", record.CurrentContainerID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeContainerDisappeared,
		OccurredAt:  now,
		Details:     containerDisappearedDetails(),
	})

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      record.GameID,
		OpKind:      operation.OpKindReconcileDispose,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    record.CurrentImageRef,
		ContainerID: record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindDispose)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", record.CurrentContainerID,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler disposed missing container", logArgs...)
}

// observedExitedOne transitions a `running` record whose container is
// reported as `exited` to `stopped` and publishes `container_exited`
// with the observed exit code. No `operation_log` entry is written;
// see decision record §6.
func (reconciler *Reconciler) observedExitedOne(ctx context.Context, record runtime.RuntimeRecord, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "observed_exited: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, record.GameID, token)

	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "observed_exited: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != summary.ID {
		reconciler.logger.InfoContext(ctx, "observed_exited: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: inspect container",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: summary.ID,
		To:                  runtime.StatusStopped,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "observed_exited: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "observed_exited: update status",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: summary.ID,
		EventType:   health.EventTypeContainerExited,
		OccurredAt:  now,
		Details:     containerExitedDetails(inspect.ExitCode, inspect.OOMKilled),
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindObservedExited)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", summary.ID,
		"exit_code", inspect.ExitCode,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler observed exited container", logArgs...)
}

// resolveStartedAt prefers the `com.galaxy.started_at_ms` label written
// by the start service. When the label is absent or unparseable, it
// falls back to a full inspect of the container; if inspect also fails
// or returns a zero StartedAt, the current clock is used so the record
// still validates.
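// For example, a (hypothetical) label value of "1745838000000" parses
// via time.UnixMilli to 2025-04-28T11:00:00Z.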
func (reconciler *Reconciler) resolveStartedAt(ctx context.Context, summary ports.ContainerSummary) time.Time {
	if raw, ok := summary.Labels[startruntime.LabelStartedAtMs]; ok && raw != "" {
		if ms, err := strconv.ParseInt(raw, 10, 64); err == nil && ms > 0 {
			return time.UnixMilli(ms).UTC()
		}
	}
	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err == nil && !inspect.StartedAt.IsZero() {
		return inspect.StartedAt.UTC()
	}
	return reconciler.clock().UTC()
}

// engineEndpoint mirrors the URL shape produced by the docker adapter
// (`internal/adapters/docker/client.go::Run`).
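// For example, game "game-a" resolves to
// "http://galaxy-game-game-a:8080" (the exact shape the package tests
// assert against).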
func (reconciler *Reconciler) engineEndpoint(gameID string) string {
	return fmt.Sprintf("http://%s%s:8080", startruntime.HostnamePrefix, gameID)
}

// releaseLease releases the per-game lease in a fresh background
// context so a canceled tick context does not leave the lease pinned
// for its TTL.
func (reconciler *Reconciler) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()
	if err := reconciler.leases.Release(cleanupCtx, gameID, token); err != nil {
		reconciler.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (reconciler *Reconciler) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := reconciler.operationLogs.Append(ctx, entry); err != nil {
		reconciler.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"err", err.Error(),
		)
	}
}

// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (reconciler *Reconciler) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := reconciler.healthEvents.Publish(ctx, envelope); err != nil {
		reconciler.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}
	reconciler.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
}

// containerExitedDetails matches the JSON shape produced by the events
// listener so consumers see a single contracted payload regardless of
// the source.
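// Example payload: {"exit_code":137,"oom":false}.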
func containerExitedDetails(exitCode int, oom bool) json.RawMessage {
	payload := struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}{ExitCode: exitCode, OOM: oom}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// containerDisappearedDetails returns the canonical empty-object
// payload required by the `container_disappeared` AsyncAPI variant.
func containerDisappearedDetails() json.RawMessage {
	return json.RawMessage(`{}`)
}

func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}
@@ -0,0 +1,740 @@
package reconcile_test

import (
	"context"
	"encoding/json"
	"errors"
	"io"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"galaxy/rtmanager/internal/adapters/docker/mocks"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/reconcile"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/mock/gomock"
)

func silentLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, nil))
}

// --- fake doubles -----------------------------------------------------

type fakeRuntimeRecords struct {
	mu sync.Mutex

	stored          map[string]runtime.RuntimeRecord
	getErr          error
	upsertErr       error
	updateStatusErr error
	listErr         error

	upserts []runtime.RuntimeRecord
	updates []ports.UpdateStatusInput
}

func newFakeRuntimeRecords() *fakeRuntimeRecords {
	return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
}

func (s *fakeRuntimeRecords) Set(records ...runtime.RuntimeRecord) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, record := range records {
		s.stored[record.GameID] = record
	}
}

func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.getErr != nil {
		return runtime.RuntimeRecord{}, s.getErr
	}
	record, ok := s.stored[gameID]
	if !ok {
		return runtime.RuntimeRecord{}, runtime.ErrNotFound
	}
	return record, nil
}

func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.upsertErr != nil {
		return s.upsertErr
	}
	s.upserts = append(s.upserts, record)
	s.stored[record.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.updates = append(s.updates, input)
	if s.updateStatusErr != nil {
		return s.updateStatusErr
	}
	record, ok := s.stored[input.GameID]
	if !ok {
		return runtime.ErrNotFound
	}
	if record.Status != input.ExpectedFrom {
		return runtime.ErrConflict
	}
	if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
		return runtime.ErrConflict
	}
	record.Status = input.To
	record.LastOpAt = input.Now
	switch input.To {
	case runtime.StatusStopped:
		t := input.Now
		record.StoppedAt = &t
	case runtime.StatusRemoved:
		t := input.Now
		record.RemovedAt = &t
		record.CurrentContainerID = ""
	}
	s.stored[input.GameID] = record
	return nil
}

func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
	return nil, errors.New("not used in reconciler tests")
}

func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.listErr != nil {
		return nil, s.listErr
	}
	var out []runtime.RuntimeRecord
	for _, record := range s.stored {
		if record.Status == status {
			out = append(out, record)
		}
	}
	return out, nil
}

func (s *fakeRuntimeRecords) Upserts() []runtime.RuntimeRecord {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]runtime.RuntimeRecord, len(s.upserts))
	copy(out, s.upserts)
	return out
}

func (s *fakeRuntimeRecords) Updates() []ports.UpdateStatusInput {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.UpdateStatusInput, len(s.updates))
	copy(out, s.updates)
	return out
}

type fakeOperationLogs struct {
	mu sync.Mutex

	appendErr error
	appends   []operation.OperationEntry
}

func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.appendErr != nil {
		return 0, s.appendErr
	}
	s.appends = append(s.appends, entry)
	return int64(len(s.appends)), nil
}

func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
	return nil, errors.New("not used in reconciler tests")
}

func (s *fakeOperationLogs) Appends() []operation.OperationEntry {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]operation.OperationEntry, len(s.appends))
	copy(out, s.appends)
	return out
}

type fakeHealthEvents struct {
	mu         sync.Mutex
	publishErr error
	published  []ports.HealthEventEnvelope
}

func (s *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.publishErr != nil {
		return s.publishErr
	}
	s.published = append(s.published, envelope)
	return nil
}

func (s *fakeHealthEvents) Published() []ports.HealthEventEnvelope {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ports.HealthEventEnvelope, len(s.published))
	copy(out, s.published)
	return out
}

type fakeLeases struct {
	mu sync.Mutex

	acquired   bool
	acquireErr error
	releaseErr error

	acquires []string
	releases []string
}

func (l *fakeLeases) TryAcquire(_ context.Context, gameID, token string, _ time.Duration) (bool, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.acquires = append(l.acquires, gameID+":"+token)
	if l.acquireErr != nil {
		return false, l.acquireErr
	}
	return l.acquired, nil
}

func (l *fakeLeases) Release(_ context.Context, gameID, token string) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.releases = append(l.releases, gameID+":"+token)
	return l.releaseErr
}

func (l *fakeLeases) Acquires() []string {
	l.mu.Lock()
	defer l.mu.Unlock()
	out := make([]string, len(l.acquires))
	copy(out, l.acquires)
	return out
}

func (l *fakeLeases) Releases() []string {
	l.mu.Lock()
	defer l.mu.Unlock()
	out := make([]string, len(l.releases))
	copy(out, l.releases)
	return out
}

// --- harness ----------------------------------------------------------

type harness struct {
	docker        *mocks.MockDockerClient
	records       *fakeRuntimeRecords
	operationLogs *fakeOperationLogs
	healthEvents  *fakeHealthEvents
	leases        *fakeLeases

	telemetry *telemetry.Runtime

	now time.Time
}

func newHarness(t *testing.T) *harness {
	t.Helper()
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	return &harness{
		docker:        mocks.NewMockDockerClient(ctrl),
		records:       newFakeRuntimeRecords(),
		operationLogs: &fakeOperationLogs{},
		healthEvents:  &fakeHealthEvents{},
		leases:        &fakeLeases{acquired: true},
		telemetry:     telemetryRuntime,
		now:           time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
}

func (h *harness) build(t *testing.T) *reconcile.Reconciler {
	t.Helper()
	r, err := reconcile.NewReconciler(reconcile.Dependencies{
		Docker:         h.docker,
		RuntimeRecords: h.records,
		OperationLogs:  h.operationLogs,
		HealthEvents:   h.healthEvents,
		Leases:         h.leases,
		Telemetry:      h.telemetry,
		DockerCfg: config.DockerConfig{
			Host:       "unix:///var/run/docker.sock",
			Network:    "galaxy-net",
			LogDriver:  "json-file",
			PullPolicy: config.ImagePullPolicyIfMissing,
		},
		ContainerCfg: config.ContainerConfig{
			DefaultCPUQuota:      1.0,
			DefaultMemory:        "512m",
			DefaultPIDsLimit:     512,
			StopTimeout:          30 * time.Second,
			Retention:            30 * 24 * time.Hour,
			EngineStateMountPath: "/var/lib/galaxy-game",
			EngineStateEnvName:   "GAME_STATE_PATH",
			GameStateDirMode:     0o750,
			GameStateRoot:        "/var/lib/galaxy/games",
		},
		Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
		Interval:     50 * time.Millisecond,
		Clock:        func() time.Time { return h.now },
		Logger:       silentLogger(),
		NewToken:     func() string { return "token-A" },
	})
	require.NoError(t, err)
	return r
}

// runningRecord builds a baseline runtime record in `running` state.
func runningRecord(gameID, containerID string, startedAt time.Time) runtime.RuntimeRecord {
	return runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: containerID,
		CurrentImageRef:    "galaxy/game:1.0.0",
		EngineEndpoint:     "http://galaxy-game-" + gameID + ":8080",
		StatePath:          "/var/lib/galaxy/games/" + gameID,
		DockerNetwork:      "galaxy-net",
		StartedAt:          &startedAt,
		LastOpAt:           startedAt,
		CreatedAt:          startedAt,
	}
}

func ownedSummary(gameID, containerID, imageRef, status string, startedAtMs int64) ports.ContainerSummary {
	labels := map[string]string{
		startruntime.LabelOwner:          startruntime.LabelOwnerValue,
		startruntime.LabelKind:           startruntime.LabelKindValue,
		startruntime.LabelGameID:         gameID,
		startruntime.LabelEngineImageRef: imageRef,
	}
	if startedAtMs > 0 {
		labels[startruntime.LabelStartedAtMs] = strconv.FormatInt(startedAtMs, 10)
	}
	return ports.ContainerSummary{
		ID:        containerID,
		ImageRef:  imageRef,
		Hostname:  "galaxy-game-" + gameID,
		Labels:    labels,
		Status:    status,
		StartedAt: time.UnixMilli(startedAtMs).UTC(),
	}
}

// --- constructor ------------------------------------------------------

func TestNewReconcilerRejectsMissingDeps(t *testing.T) {
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)
	telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
	require.NoError(t, err)

	dockerCfg := config.DockerConfig{
		Host:       "unix:///var/run/docker.sock",
		Network:    "galaxy-net",
		LogDriver:  "json-file",
		PullPolicy: config.ImagePullPolicyIfMissing,
	}
	containerCfg := config.ContainerConfig{
		DefaultCPUQuota:      1.0,
		DefaultMemory:        "512m",
		DefaultPIDsLimit:     512,
		StopTimeout:          30 * time.Second,
		Retention:            30 * 24 * time.Hour,
		EngineStateMountPath: "/var/lib/galaxy-game",
		EngineStateEnvName:   "GAME_STATE_PATH",
		GameStateDirMode:     0o750,
		GameStateRoot:        "/var/lib/galaxy/games",
	}
	coord := config.CoordinationConfig{GameLeaseTTL: time.Minute}

	base := reconcile.Dependencies{
		Docker:         mocks.NewMockDockerClient(ctrl),
		RuntimeRecords: newFakeRuntimeRecords(),
		OperationLogs:  &fakeOperationLogs{},
		HealthEvents:   &fakeHealthEvents{},
		Leases:         &fakeLeases{acquired: true},
		Telemetry:      telemetryRuntime,
		DockerCfg:      dockerCfg,
		ContainerCfg:   containerCfg,
		Coordination:   coord,
		Interval:       time.Second,
	}

	defectives := []reconcile.Dependencies{
		{},
		{Docker: base.Docker},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases},
		{Docker: base.Docker, RuntimeRecords: base.RuntimeRecords, OperationLogs: base.OperationLogs, HealthEvents: base.HealthEvents, Leases: base.Leases, Telemetry: base.Telemetry},
	}
	for index, deps := range defectives {
		_, err := reconcile.NewReconciler(deps)
		require.Errorf(t, err, "case %d should fail", index)
	}

	_, err = reconcile.NewReconciler(base)
	require.NoError(t, err)
}

// --- adopt ------------------------------------------------------------

func TestReconcileAdoptInsertsRecord(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 30, 0, 0, time.UTC)
	summary := ownedSummary("game-a", "ctr-game-a", "galaxy/game:1.2.3", "running", startedAt.UnixMilli())

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	upserts := h.records.Upserts()
	require.Len(t, upserts, 1)
	got := upserts[0]
	assert.Equal(t, "game-a", got.GameID)
	assert.Equal(t, runtime.StatusRunning, got.Status)
	assert.Equal(t, "ctr-game-a", got.CurrentContainerID)
	assert.Equal(t, "galaxy/game:1.2.3", got.CurrentImageRef)
	assert.Equal(t, "http://galaxy-game-game-a:8080", got.EngineEndpoint)
	assert.Equal(t, "/var/lib/galaxy/games/game-a", got.StatePath)
	assert.Equal(t, "galaxy-net", got.DockerNetwork)
	require.NotNil(t, got.StartedAt)
	assert.True(t, got.StartedAt.Equal(startedAt))

	appends := h.operationLogs.Appends()
	require.Len(t, appends, 1)
	assert.Equal(t, operation.OpKindReconcileAdopt, appends[0].OpKind)
	assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource)
	assert.Equal(t, operation.OutcomeSuccess, appends[0].Outcome)
	assert.Equal(t, "ctr-game-a", appends[0].ContainerID)

	assert.Equal(t, []string{"game-a:token-A"}, h.leases.Acquires())
	assert.Equal(t, []string{"game-a:token-A"}, h.leases.Releases())
	assert.Empty(t, h.healthEvents.Published(), "adopt does not publish health events")
}

func TestReconcileAdoptFallsBackToInspectStartedAtWhenLabelMissing(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	summary := ownedSummary("game-b", "ctr-game-b", "galaxy/game:1.0.0", "running", 0)
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	inspectStarted := time.Date(2026, 4, 28, 10, 0, 0, 0, time.UTC)
	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-b").Return(ports.ContainerInspect{
		ID:        "ctr-game-b",
		StartedAt: inspectStarted,
		Status:    "running",
	}, nil)

	r.Tick(context.Background())

	upserts := h.records.Upserts()
	require.Len(t, upserts, 1)
	require.NotNil(t, upserts[0].StartedAt)
	assert.True(t, upserts[0].StartedAt.Equal(inspectStarted))
}

func TestReconcileAdoptSkipsWhenRecordAppearsConcurrently(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-c", "ctr-game-c", startedAt))

	// Docker reports the same game running, but the record already
	// exists (the start service won the race). The list pass sees the
	// record, so the adopt path is never entered.
	summary := ownedSummary("game-c", "ctr-game-c", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Upserts())
	assert.Empty(t, h.operationLogs.Appends())
	assert.Empty(t, h.leases.Acquires(), "no mutation -> no lease acquired")
}

func TestReconcileAdoptSkipsNonRunningContainer(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	summary := ownedSummary("game-d", "ctr-game-d", "galaxy/game:1.0.0", "exited", time.Now().UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Upserts(), "exited container without record is not adopted")
	assert.Empty(t, h.leases.Acquires())
}

// --- dispose ----------------------------------------------------------

func TestReconcileDisposeMarksRemoved(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-e", "ctr-game-e", startedAt))

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	r.Tick(context.Background())

	updates := h.records.Updates()
	require.Len(t, updates, 1)
	assert.Equal(t, "game-e", updates[0].GameID)
	assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom)
	assert.Equal(t, "ctr-game-e", updates[0].ExpectedContainerID)
	assert.Equal(t, runtime.StatusRemoved, updates[0].To)

	published := h.healthEvents.Published()
	require.Len(t, published, 1)
	assert.Equal(t, health.EventTypeContainerDisappeared, published[0].EventType)
	assert.Equal(t, "game-e", published[0].GameID)
	assert.Equal(t, "ctr-game-e", published[0].ContainerID)
	assert.JSONEq(t, `{}`, string(published[0].Details))

	appends := h.operationLogs.Appends()
	require.Len(t, appends, 1)
	assert.Equal(t, operation.OpKindReconcileDispose, appends[0].OpKind)
	assert.Equal(t, operation.OpSourceAutoReconcile, appends[0].OpSource)
}

func TestReconcileDisposeSkipsOnCASConflict(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-f", "ctr-game-f", startedAt))
	h.records.updateStatusErr = runtime.ErrConflict

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.healthEvents.Published(), "no health event when CAS lost")
	assert.Empty(t, h.operationLogs.Appends(), "no operation_log entry when CAS lost")
}

func TestReconcileDisposeSkipsWhenStateChangedAfterReread(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	// Running record observed by ListByStatus, but Get under the lease
	// returns a record whose status has changed.
	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	listed := runningRecord("game-g", "ctr-game-g", startedAt)
	h.records.Set(listed)

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	// Mutate the stored record to simulate a concurrent stop completing
	// between the list pass and the lease re-read. The fake's Get
	// observes the mutated state.
	h.records.mu.Lock()
	stoppedAt := startedAt.Add(time.Minute)
	listed.Status = runtime.StatusStopped
	listed.StoppedAt = &stoppedAt
	h.records.stored["game-g"] = listed
	h.records.mu.Unlock()

	r.Tick(context.Background())

	assert.Empty(t, h.records.Updates(), "re-read sees status != running -> skip")
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.operationLogs.Appends())
}

// --- observed_exited --------------------------------------------------

func TestReconcileObservedExitedMarksStopped(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-h", "ctr-game-h", startedAt))

	summary := ownedSummary("game-h", "ctr-game-h", "galaxy/game:1.0.0", "exited", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)
	h.docker.EXPECT().InspectContainer(gomock.Any(), "ctr-game-h").Return(ports.ContainerInspect{
		ID:        "ctr-game-h",
		Status:    "exited",
		ExitCode:  137,
		OOMKilled: false,
	}, nil)

	r.Tick(context.Background())

	updates := h.records.Updates()
	require.Len(t, updates, 1)
	assert.Equal(t, runtime.StatusRunning, updates[0].ExpectedFrom)
	assert.Equal(t, "ctr-game-h", updates[0].ExpectedContainerID)
	assert.Equal(t, runtime.StatusStopped, updates[0].To)

	published := h.healthEvents.Published()
	require.Len(t, published, 1)
	assert.Equal(t, health.EventTypeContainerExited, published[0].EventType)
	var details struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}
	require.NoError(t, json.Unmarshal(published[0].Details, &details))
	assert.Equal(t, 137, details.ExitCode)
	assert.False(t, details.OOM)

	assert.Empty(t, h.operationLogs.Appends(), "observed_exited writes no operation_log entry")
}

// --- no-op paths ------------------------------------------------------

func TestReconcileNoDriftIsNoop(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-i", "ctr-game-i", startedAt))

	summary := ownedSummary("game-i", "ctr-game-i", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Upserts())
	assert.Empty(t, h.records.Updates())
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.operationLogs.Appends())
	assert.Empty(t, h.leases.Acquires())
}

func TestReconcileSkipsWhenContainerIDMismatch(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-j", "ctr-old", startedAt))

	// Docker reports the new container id; a restart is in flight.
	summary := ownedSummary("game-j", "ctr-new", "galaxy/game:1.0.0", "running", startedAt.UnixMilli())
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return([]ports.ContainerSummary{summary}, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Updates(), "id mismatch -> reconciler stays out of the way")
	assert.Empty(t, h.healthEvents.Published())
}

// --- lease busy / errors ----------------------------------------------

func TestReconcileLeaseConflictSkipsGame(t *testing.T) {
	h := newHarness(t)
	h.leases.acquired = false
	r := h.build(t)

	startedAt := time.Date(2026, 4, 28, 11, 0, 0, 0, time.UTC)
	h.records.Set(runningRecord("game-k", "ctr-game-k", startedAt))

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	r.Tick(context.Background())

	assert.Empty(t, h.records.Updates(), "lease busy -> dispose skipped")
	assert.Empty(t, h.healthEvents.Published())
	assert.Empty(t, h.leases.Releases(), "release not called when acquire returned false")
}

func TestReconcileNowAbsorbsListError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, errors.New("docker daemon down"))

	require.NoError(t, r.ReconcileNow(context.Background()))
	assert.Empty(t, h.records.Updates())
	assert.Empty(t, h.records.Upserts())
}

func TestReconcileNowAbsorbsRecordsListError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	h.records.listErr = errors.New("pg down")

	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil)

	require.NoError(t, r.ReconcileNow(context.Background()))
}

func TestReconcileNowReturnsContextError(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	require.ErrorIs(t, r.ReconcileNow(ctx), context.Canceled)
}

// --- Run lifecycle ----------------------------------------------------

func TestRunRespectsContextCancel(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)

	// Guard against a slow scheduler: with a 50ms interval, Run may
	// tick once before cancel is observed; tolerate any list calls.
	h.docker.EXPECT().List(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan error, 1)
	go func() { done <- r.Run(ctx) }()

	cancel()
	select {
	case err := <-done:
		assert.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		t.Fatalf("Run did not exit after cancel")
	}
}

func TestShutdownIsNoOp(t *testing.T) {
	h := newHarness(t)
	r := h.build(t)
	require.NoError(t, r.Shutdown(context.Background()))
}

// --- compile-time safety ----------------------------------------------

var (
	_ ports.RuntimeRecordStore   = (*fakeRuntimeRecords)(nil)
	_ ports.OperationLogStore    = (*fakeOperationLogs)(nil)
	_ ports.HealthEventPublisher = (*fakeHealthEvents)(nil)
	_ ports.GameLeaseStore       = (*fakeLeases)(nil)
)