// Package containercleanup ships the periodic TTL-cleanup worker
// described in `rtmanager/README.md §Lifecycles → Cleanup`.
//
// On every tick the worker lists `runtime_records.status='stopped'`
// rows whose `last_op_at` is older than the configured retention
// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
// service owns the per-game lease, the Docker `Remove` call, the
// status transition, the telemetry counter, and the operation_log
// entry; this worker is intentionally tiny: a ticker plus a TTL
// filter.
//
// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
// failure on one game does not abort the rest of the pass.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package containercleanup

import (
	"context"
	"errors"
	"log/slog"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
)

// Cleaner is the narrow surface the worker uses to remove stopped
// containers. The production `*cleanupcontainer.Service` satisfies
// this interface verbatim; the package keeps the surface here so
// tests can substitute a fake without spinning up the full service.
type Cleaner interface {
	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
}

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists `status=stopped` records on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// Cleanup performs the actual container removal under the per-game
	// lease.
	Cleanup Cleaner

	// Retention is the TTL after which a stopped container becomes a
	// removal candidate. Mirrors `cfg.Container.Retention`.
	Retention time.Duration

	// Interval bounds the tick period. Mirrors
	// `cfg.Cleanup.CleanupInterval`.
	Interval time.Duration

	// Clock supplies the wall clock used to compute the TTL threshold.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic TTL-cleanup loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	cleanup        Cleaner
	retention      time.Duration
	interval       time.Duration
	clock          func() time.Time
	logger         *slog.Logger
}

// NewWorker constructs a Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new container cleanup worker: nil runtime records store")
	case deps.Cleanup == nil:
		return nil, errors.New("new container cleanup worker: nil cleanup service")
	case deps.Retention <= 0:
		return nil, errors.New("new container cleanup worker: retention must be positive")
	case deps.Interval <= 0:
		return nil, errors.New("new container cleanup worker: interval must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		runtimeRecords: deps.RuntimeRecords,
		cleanup:        deps.Cleanup,
		retention:      deps.Retention,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.containercleanup"),
	}, nil
}
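
// A minimal wiring sketch, assuming a caller that already holds a
// runtime-record store, a cleanup service, loaded config, and a
// context; store, svc, cfg, and ctx below are illustrative names,
// not part of this package:
//
//	worker, err := containercleanup.NewWorker(containercleanup.Dependencies{
//		RuntimeRecords: store,
//		Cleanup:        svc,
//		Retention:      cfg.Container.Retention,
//		Interval:       cfg.Cleanup.CleanupInterval,
//	})
//	if err != nil {
//		return err
//	}
//	go func() { _ = worker.Run(ctx) }()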

// Run drives the cleanup loop until ctx is cancelled. Per-tick errors
// are absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run container cleanup worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run container cleanup worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("container cleanup worker started",
		"interval", worker.interval.String(),
		"retention", worker.retention.String(),
	)
	defer worker.logger.Info("container cleanup worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown container cleanup worker: nil context")
	}
	return nil
}

// Tick performs one cleanup pass. Exported so tests can drive the
// worker deterministically without spinning a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}

// tick lists stopped records and delegates removal of expired ones to
// the cleanup service.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
	if err != nil {
		worker.logger.WarnContext(ctx, "list stopped records",
			"err", err.Error(),
		)
		return
	}

	threshold := worker.clock().Add(-worker.retention)
	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		if !record.LastOpAt.Before(threshold) {
			continue
		}

		result, err := worker.cleanup.Handle(ctx, cleanupcontainer.Input{
			GameID:   record.GameID,
			OpSource: operation.OpSourceAutoTTL,
		})
		if err != nil {
			worker.logger.ErrorContext(ctx, "cleanup handle returned error",
				"game_id", record.GameID,
				"err", err.Error(),
			)
			continue
		}
		if result.Outcome == operation.OutcomeFailure {
			worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
				"game_id", record.GameID,
				"error_code", result.ErrorCode,
				"error_message", result.ErrorMessage,
			)
			continue
		}

		worker.logger.InfoContext(ctx, "cleanup ttl removed container",
			"game_id", record.GameID,
			"error_code", result.ErrorCode,
		)
	}
}
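
// A deterministic test sketch for Tick, assuming hypothetical test
// doubles fakeStore (a ports.RuntimeRecordStore) and fakeCleaner (a
// Cleaner), plus a fixed time fixedNow; none of these names exist in
// this package:
//
//	worker, _ := containercleanup.NewWorker(containercleanup.Dependencies{
//		RuntimeRecords: fakeStore,
//		Cleanup:        fakeCleaner,
//		Retention:      24 * time.Hour,
//		Interval:       time.Minute,
//		Clock:          func() time.Time { return fixedNow },
//	})
//	worker.Tick(context.Background())
//
// Only records whose LastOpAt is before fixedNow minus the retention
// should reach fakeCleaner, each with OpSource set to
// operation.OpSourceAutoTTL.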