// Package reconcile implements the drift reconciliation worker
// described in `rtmanager/README.md §Reconciliation`. The reconciler
// is the single authority that brings `runtime_records` into agreement
// with the Docker daemon's view of `com.galaxy.owner=rtmanager`
// containers.
//
// Three drift kinds are handled:
//
//   - Adopt — a running container labelled `com.galaxy.owner=rtmanager`
//     has no matching `runtime_records` row. The reconciler inserts a
//     `status=running` record (`op_kind=reconcile_adopt`).
//   - Dispose — a `status=running` row whose `current_container_id` is
//     no longer reported by Docker. The reconciler updates the row to
//     `status=removed`, publishes `container_disappeared` on
//     `runtime:health_events`, and appends `reconcile_dispose`.
//   - Observed exited — a `status=running` row whose container exists
//     but reports `State.Status=exited`. The reconciler transitions
//     the row to `status=stopped` and publishes `container_exited`
//     with the observed exit code. No `operation_log` entry is written
//     because `OpKind` does not include a value for this transition;
//     it is reflected in `rtmanager.reconcile_drift{kind=observed_exited}`
//     instead.
//
// All write decisions for a given `game_id` are guarded by the per-game
// Redis lease; the read pass that lists Docker containers and PG
// records is lockless.
//
// The reconciler runs once synchronously at process start
// (`ReconcileNow`) before any other worker is allowed to start, and
// then periodically via `Run` as an `app.Component`. Design rationale
// is captured in `rtmanager/docs/workers.md`.
package reconcile

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"path/filepath"
	"strconv"
	"time"

	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/logging"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/telemetry"
)

// dockerStateRunning is the verbatim Docker `State.Status` value the
// reconciler treats as "the container is alive".
const dockerStateRunning = "running"

// dockerStateExited is the verbatim Docker `State.Status` value the
// reconciler treats as "the container has terminated".
const dockerStateExited = "exited"

// driftKindAdopt / driftKindDispose / driftKindObservedExited match the
// `kind` label vocabulary on `rtmanager.reconcile_drift`.
const (
	driftKindAdopt          = "adopt"
	driftKindDispose        = "dispose"
	driftKindObservedExited = "observed_exited"
)

// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even if the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second

// Dependencies groups the collaborators required by Reconciler.
type Dependencies struct {
	Docker         ports.DockerClient
	RuntimeRecords ports.RuntimeRecordStore
	OperationLogs  ports.OperationLogStore
	HealthEvents   ports.HealthEventPublisher
	Leases         ports.GameLeaseStore
	Telemetry      *telemetry.Runtime

	DockerCfg    config.DockerConfig
	ContainerCfg config.ContainerConfig
	Coordination config.CoordinationConfig

	// Interval bounds the periodic tick. ReconcileNow ignores it.
	Interval time.Duration

	Clock    func() time.Time
	Logger   *slog.Logger
	NewToken func() string
}
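// Illustrative wiring sketch, not part of this package's API: every
// identifier below other than the reconcile package's own (cfg,
// dockerClient, recordStore, app.Register, and friends) is an
// assumption for the example, not code that exists in this repository.
// The ordering, however, mirrors the startup contract described above:
// ReconcileNow runs once before any other worker, then Run is
// registered as an app.Component.
//
//	rec, err := reconcile.NewReconciler(reconcile.Dependencies{
//		Docker:         dockerClient,
//		RuntimeRecords: recordStore,
//		OperationLogs:  opLogStore,
//		HealthEvents:   healthPublisher,
//		Leases:         leaseStore,
//		Telemetry:      telemetryRuntime,
//		DockerCfg:      cfg.Docker,
//		ContainerCfg:   cfg.Container,
//		Coordination:   cfg.Coordination,
//		Interval:       30 * time.Second, // illustrative value
//	})
//	if err != nil {
//		return err
//	}
//	if err := rec.ReconcileNow(ctx); err != nil {
//		return err // a cancelled startup aborts immediately
//	}
//	app.Register(rec) // Run/Shutdown driven as an app.Component
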
// Reconciler drives both the synchronous initial pass and the periodic
// drift reconciliation loop.
type Reconciler struct {
	docker         ports.DockerClient
	runtimeRecords ports.RuntimeRecordStore
	operationLogs  ports.OperationLogStore
	healthEvents   ports.HealthEventPublisher
	leases         ports.GameLeaseStore
	telemetry      *telemetry.Runtime

	dockerNetwork string
	stateRoot     string
	leaseTTL      time.Duration
	interval      time.Duration

	clock    func() time.Time
	logger   *slog.Logger
	newToken func() string
}

// NewReconciler constructs one Reconciler from deps.
func NewReconciler(deps Dependencies) (*Reconciler, error) {
	switch {
	case deps.Docker == nil:
		return nil, errors.New("new reconciler: nil docker client")
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new reconciler: nil runtime records store")
	case deps.OperationLogs == nil:
		return nil, errors.New("new reconciler: nil operation log store")
	case deps.HealthEvents == nil:
		return nil, errors.New("new reconciler: nil health events publisher")
	case deps.Leases == nil:
		return nil, errors.New("new reconciler: nil lease store")
	case deps.Telemetry == nil:
		return nil, errors.New("new reconciler: nil telemetry runtime")
	case deps.Interval <= 0:
		return nil, errors.New("new reconciler: interval must be positive")
	}

	if err := deps.DockerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: docker config: %w", err)
	}
	if err := deps.ContainerCfg.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: container config: %w", err)
	}
	if err := deps.Coordination.Validate(); err != nil {
		return nil, fmt.Errorf("new reconciler: coordination config: %w", err)
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}
	newToken := deps.NewToken
	if newToken == nil {
		newToken = defaultTokenGenerator()
	}

	return &Reconciler{
		docker:         deps.Docker,
		runtimeRecords: deps.RuntimeRecords,
		operationLogs:  deps.OperationLogs,
		healthEvents:   deps.HealthEvents,
		leases:         deps.Leases,
		telemetry:      deps.Telemetry,
		dockerNetwork:  deps.DockerCfg.Network,
		stateRoot:      deps.ContainerCfg.GameStateRoot,
		leaseTTL:       deps.Coordination.GameLeaseTTL,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.reconcile"),
		newToken:       newToken,
	}, nil
}

// ReconcileNow performs one full reconciliation pass synchronously.
// It is intended for the startup path described in
// `rtmanager/README.md §Startup dependencies` (step 6). Per-game
// errors are absorbed into telemetry and logs; only ctx errors are
// surfaced to the caller so a cancelled startup aborts immediately.
func (reconciler *Reconciler) ReconcileNow(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("reconcile now: nil reconciler")
	}
	if ctx == nil {
		return errors.New("reconcile now: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	reconciler.tick(ctx)
	return ctx.Err()
}
// Run drives the periodic reconciliation loop. It does not perform an
// immediate first pass — `ReconcileNow` covers that path; the first
// tick fires after `Interval`. Run terminates on context cancellation.
func (reconciler *Reconciler) Run(ctx context.Context) error {
	if reconciler == nil {
		return errors.New("run reconciler: nil reconciler")
	}
	if ctx == nil {
		return errors.New("run reconciler: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	reconciler.logger.Info("reconciler started",
		"interval", reconciler.interval.String(),
	)
	defer reconciler.logger.Info("reconciler stopped")

	ticker := time.NewTicker(reconciler.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			reconciler.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (reconciler *Reconciler) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown reconciler: nil context")
	}
	return nil
}

// Tick performs one reconciliation pass. Exported so tests can drive
// the reconciler deterministically without spinning a real ticker.
func (reconciler *Reconciler) Tick(ctx context.Context) {
	reconciler.tick(ctx)
}

// tick executes one full pass: list Docker containers + PG records,
// resolve drift, and apply lease-guarded mutations for each affected
// game.
func (reconciler *Reconciler) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	containers, err := reconciler.docker.List(ctx, ports.ListFilter{
		Labels: map[string]string{startruntime.LabelOwner: startruntime.LabelOwnerValue},
	})
	if err != nil {
		reconciler.logger.WarnContext(ctx, "list owned containers",
			"err", err.Error(),
		)
		return
	}

	records, err := reconciler.runtimeRecords.ListByStatus(ctx, runtime.StatusRunning)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "list running records",
			"err", err.Error(),
		)
		return
	}

	containerByGame := make(map[string]ports.ContainerSummary, len(containers))
	for _, summary := range containers {
		gameID := summary.Labels[startruntime.LabelGameID]
		if gameID == "" {
			continue
		}
		containerByGame[gameID] = summary
	}

	recordByGame := make(map[string]runtime.RuntimeRecord, len(records))
	for _, record := range records {
		recordByGame[record.GameID] = record
	}

	for gameID, summary := range containerByGame {
		if err := ctx.Err(); err != nil {
			return
		}
		if _, ok := recordByGame[gameID]; ok {
			continue
		}
		if summary.Status != dockerStateRunning {
			continue
		}
		reconciler.adoptOne(ctx, gameID, summary)
	}

	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		summary, ok := containerByGame[record.GameID]
		if !ok {
			reconciler.disposeOne(ctx, record)
			continue
		}
		if summary.ID != record.CurrentContainerID {
			continue
		}
		if summary.Status == dockerStateExited {
			reconciler.observedExitedOne(ctx, record, summary)
		}
	}
}
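// Deterministic test sketch for Tick (illustrative; the fake
// dependencies are hypothetical in-memory implementations of the
// ports interfaces, not types in this repository). Injecting Clock
// and NewToken pins timestamps and lease tokens so assertions on
// runtime_records rows, operation_log entries, and health events stay
// stable across runs.
//
//	fixed := time.Date(2024, 1, 2, 3, 4, 5, 0, time.UTC)
//	deps := newFakeDependencies() // hypothetical test helper
//	deps.Clock = func() time.Time { return fixed }
//	deps.NewToken = func() string { return "test-token" }
//	rec, _ := reconcile.NewReconciler(deps)
//	rec.Tick(ctx) // one full pass, no ticker involved
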
// adoptOne installs a `runtime_records` row for an unrecorded running
// container under the per-game lease.
func (reconciler *Reconciler) adoptOne(ctx context.Context, gameID string, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, gameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "adopt: acquire lease",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "adopt: lease busy, skipping",
			"game_id", gameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, gameID, token)

	if _, err := reconciler.runtimeRecords.Get(ctx, gameID); err == nil {
		reconciler.logger.InfoContext(ctx, "adopt: record appeared concurrently, skipping",
			"game_id", gameID,
		)
		return
	} else if !errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.WarnContext(ctx, "adopt: read record",
			"game_id", gameID,
			"err", err.Error(),
		)
		return
	}

	startedAt := reconciler.resolveStartedAt(ctx, summary)
	imageRef := summary.Labels[startruntime.LabelEngineImageRef]
	if imageRef == "" {
		imageRef = summary.ImageRef
	}

	now := reconciler.clock().UTC()
	createdAt := now
	if startedAt.Before(createdAt) {
		createdAt = startedAt
	}

	record := runtime.RuntimeRecord{
		GameID:             gameID,
		Status:             runtime.StatusRunning,
		CurrentContainerID: summary.ID,
		CurrentImageRef:    imageRef,
		EngineEndpoint:     reconciler.engineEndpoint(gameID),
		StatePath:          filepath.Join(reconciler.stateRoot, gameID),
		DockerNetwork:      reconciler.dockerNetwork,
		StartedAt:          &startedAt,
		LastOpAt:           now,
		CreatedAt:          createdAt,
	}
	if err := reconciler.runtimeRecords.Upsert(ctx, record); err != nil {
		reconciler.logger.ErrorContext(ctx, "adopt: upsert record",
			"game_id", gameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      gameID,
		OpKind:      operation.OpKindReconcileAdopt,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    imageRef,
		ContainerID: summary.ID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindAdopt)

	logArgs := []any{
		"game_id", gameID,
		"container_id", summary.ID,
		"image_ref", imageRef,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler adopted unrecorded container", logArgs...)
}
// disposeOne transitions a `running` record whose container is missing
// in Docker to `removed` and publishes `container_disappeared`.
func (reconciler *Reconciler) disposeOne(ctx context.Context, record runtime.RuntimeRecord) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "dispose: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "dispose: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, record.GameID, token)

	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "dispose: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != record.CurrentContainerID {
		reconciler.logger.InfoContext(ctx, "dispose: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: record.CurrentContainerID,
		To:                  runtime.StatusRemoved,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "dispose: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "dispose: update status",
			"game_id", record.GameID,
			"container_id", record.CurrentContainerID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: record.CurrentContainerID,
		EventType:   health.EventTypeContainerDisappeared,
		OccurredAt:  now,
		Details:     containerDisappearedDetails(),
	})

	finishedAt := reconciler.clock().UTC()
	reconciler.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      record.GameID,
		OpKind:      operation.OpKindReconcileDispose,
		OpSource:    operation.OpSourceAutoReconcile,
		ImageRef:    record.CurrentImageRef,
		ContainerID: record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   now,
		FinishedAt:  &finishedAt,
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindDispose)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", record.CurrentContainerID,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler disposed missing container", logArgs...)
}
// observedExitedOne transitions a `running` record whose container is
// reported as `exited` to `stopped` and publishes `container_exited`
// with the observed exit code. No `operation_log` entry is written;
// see decision record §6.
func (reconciler *Reconciler) observedExitedOne(ctx context.Context, record runtime.RuntimeRecord, summary ports.ContainerSummary) {
	token := reconciler.newToken()
	acquired, err := reconciler.leases.TryAcquire(ctx, record.GameID, token, reconciler.leaseTTL)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: acquire lease",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if !acquired {
		reconciler.logger.InfoContext(ctx, "observed_exited: lease busy, skipping",
			"game_id", record.GameID,
		)
		return
	}
	defer reconciler.releaseLease(ctx, record.GameID, token)

	current, err := reconciler.runtimeRecords.Get(ctx, record.GameID)
	if err != nil {
		if errors.Is(err, runtime.ErrNotFound) {
			return
		}
		reconciler.logger.WarnContext(ctx, "observed_exited: read record",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if current.Status != runtime.StatusRunning || current.CurrentContainerID != summary.ID {
		reconciler.logger.InfoContext(ctx, "observed_exited: state changed, skipping",
			"game_id", record.GameID,
		)
		return
	}

	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err != nil {
		reconciler.logger.WarnContext(ctx, "observed_exited: inspect container",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	now := reconciler.clock().UTC()
	err = reconciler.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        runtime.StatusRunning,
		ExpectedContainerID: summary.ID,
		To:                  runtime.StatusStopped,
		Now:                 now,
	})
	if errors.Is(err, runtime.ErrConflict) || errors.Is(err, runtime.ErrNotFound) {
		reconciler.logger.InfoContext(ctx, "observed_exited: CAS lost, skipping",
			"game_id", record.GameID,
			"err", err.Error(),
		)
		return
	}
	if err != nil {
		reconciler.logger.ErrorContext(ctx, "observed_exited: update status",
			"game_id", record.GameID,
			"container_id", summary.ID,
			"err", err.Error(),
		)
		return
	}

	reconciler.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
		GameID:      record.GameID,
		ContainerID: summary.ID,
		EventType:   health.EventTypeContainerExited,
		OccurredAt:  now,
		Details:     containerExitedDetails(inspect.ExitCode, inspect.OOMKilled),
	})
	reconciler.telemetry.RecordReconcileDrift(ctx, driftKindObservedExited)

	logArgs := []any{
		"game_id", record.GameID,
		"container_id", summary.ID,
		"exit_code", inspect.ExitCode,
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	reconciler.logger.InfoContext(ctx, "reconciler observed exited container", logArgs...)
}

// resolveStartedAt prefers the `com.galaxy.started_at_ms` label written
// by the start service. When the label is absent or unparseable, it
// falls back to a full inspect of the container; if inspect also fails
// or returns a zero StartedAt, the current clock is used so the record
// still validates.
func (reconciler *Reconciler) resolveStartedAt(ctx context.Context, summary ports.ContainerSummary) time.Time {
	if raw, ok := summary.Labels[startruntime.LabelStartedAtMs]; ok && raw != "" {
		if ms, err := strconv.ParseInt(raw, 10, 64); err == nil && ms > 0 {
			return time.UnixMilli(ms).UTC()
		}
	}

	inspect, err := reconciler.docker.InspectContainer(ctx, summary.ID)
	if err == nil && !inspect.StartedAt.IsZero() {
		return inspect.StartedAt.UTC()
	}
	return reconciler.clock().UTC()
}
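// Worked example for resolveStartedAt (label value is illustrative): a
// container carrying com.galaxy.started_at_ms=1700000000000 resolves
// without a Docker round trip, since time.UnixMilli(1700000000000) is
// 2023-11-14T22:13:20Z. A missing or malformed label falls back to
// InspectContainer, and only then to the injected clock.
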
// engineEndpoint mirrors the URL shape produced by the docker adapter
// (`internal/adapters/docker/client.go::Run`).
func (reconciler *Reconciler) engineEndpoint(gameID string) string {
	return fmt.Sprintf("http://%s%s:8080", startruntime.HostnamePrefix, gameID)
}

// releaseLease releases the per-game lease in a fresh background
// context so a canceled tick context does not leave the lease pinned
// for its TTL.
func (reconciler *Reconciler) releaseLease(ctx context.Context, gameID, token string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
	defer cancel()

	if err := reconciler.leases.Release(cleanupCtx, gameID, token); err != nil {
		reconciler.logger.WarnContext(ctx, "release game lease",
			"game_id", gameID,
			"err", err.Error(),
		)
	}
}

// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (reconciler *Reconciler) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
	if _, err := reconciler.operationLogs.Append(ctx, entry); err != nil {
		reconciler.logger.ErrorContext(ctx, "append operation log",
			"game_id", entry.GameID,
			"op_kind", string(entry.OpKind),
			"err", err.Error(),
		)
	}
}

// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (reconciler *Reconciler) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
	if err := reconciler.healthEvents.Publish(ctx, envelope); err != nil {
		reconciler.logger.ErrorContext(ctx, "publish health event",
			"game_id", envelope.GameID,
			"container_id", envelope.ContainerID,
			"event_type", string(envelope.EventType),
			"err", err.Error(),
		)
		return
	}
	reconciler.telemetry.RecordHealthEvent(ctx, string(envelope.EventType))
}

// containerExitedDetails matches the JSON shape produced by the events
// listener so consumers see a single contracted payload regardless of
// the source.
func containerExitedDetails(exitCode int, oom bool) json.RawMessage {
	payload := struct {
		ExitCode int  `json:"exit_code"`
		OOM      bool `json:"oom"`
	}{ExitCode: exitCode, OOM: oom}
	encoded, _ := json.Marshal(payload)
	return encoded
}

// containerDisappearedDetails returns the canonical empty-object
// payload required by the `container_disappeared` AsyncAPI variant.
func containerDisappearedDetails() json.RawMessage {
	return json.RawMessage(`{}`)
}

// defaultTokenGenerator returns the lease-token source used when
// Dependencies.NewToken is nil: 32 bytes from crypto/rand, encoded as
// raw URL-safe base64. If the OS entropy source fails it degrades to a
// fixed fallback token rather than panicking.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			return "rtmanager-fallback-token"
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}