// Package containercleanup ships the periodic TTL-cleanup worker
// described in `rtmanager/README.md §Lifecycles → Cleanup`.
//
// On every tick the worker lists `runtime_records.status='stopped'`
// rows whose `last_op_at` is older than the configured retention
// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
// service owns the per-game lease, the Docker `Remove` call, the
// status transition, the telemetry counter, and the operation_log
// entry; this worker is intentionally tiny — a ticker plus a TTL
// filter.
//
// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
// failure on one game does not abort the rest of the pass.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package containercleanup

import (
	"context"
	"errors"
	"log/slog"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
)

// Cleaner is the narrow surface the worker uses to remove stopped
// containers. The production `*cleanupcontainer.Service` satisfies
// this interface verbatim; the package keeps the surface here so
// tests can substitute a fake without spinning the full service.
type Cleaner interface {
	// Handle removes the container identified by input under the
	// cleanup service's ownership (lease, status transition, logging —
	// per the package comment). The worker inspects Result.Outcome and
	// Result.ErrorCode/ErrorMessage only for logging; errors are
	// absorbed, never propagated.
	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
}

// Dependencies groups the collaborators required by Worker.
//
// RuntimeRecords, Cleanup, Retention, and Interval are mandatory;
// NewWorker rejects nil/non-positive values. Clock and Logger are
// optional and default to `time.Now` and `slog.Default()`.
type Dependencies struct {
	// RuntimeRecords lists `status=stopped` records on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// Cleanup performs the actual container removal under the per-game
	// lease.
	Cleanup Cleaner

	// Retention is the TTL after which a stopped container becomes a
	// removal candidate. Mirrors `cfg.Container.Retention`.
	Retention time.Duration

	// Interval bounds the tick period. Mirrors
	// `cfg.Cleanup.CleanupInterval`.
	Interval time.Duration

	// Clock supplies the wall-clock used to compute the TTL threshold.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic TTL-cleanup loop.
//
// Construct via NewWorker; the zero value is unusable (nil
// collaborators). Fields are set once at construction and never
// mutated afterwards.
type Worker struct {
	// runtimeRecords lists stopped records on each tick.
	runtimeRecords ports.RuntimeRecordStore
	// cleanup removes expired containers; see the Cleaner interface.
	cleanup Cleaner

	// retention is the TTL for stopped containers (always > 0).
	retention time.Duration
	// interval is the tick period (always > 0).
	interval time.Duration

	// clock computes the TTL threshold; never nil after NewWorker.
	clock func() time.Time
	// logger carries the `worker=rtmanager.containercleanup` attribute;
	// never nil after NewWorker.
	logger *slog.Logger
}

// NewWorker constructs one Worker from deps.
|
|
func NewWorker(deps Dependencies) (*Worker, error) {
|
|
switch {
|
|
case deps.RuntimeRecords == nil:
|
|
return nil, errors.New("new container cleanup worker: nil runtime records store")
|
|
case deps.Cleanup == nil:
|
|
return nil, errors.New("new container cleanup worker: nil cleanup service")
|
|
case deps.Retention <= 0:
|
|
return nil, errors.New("new container cleanup worker: retention must be positive")
|
|
case deps.Interval <= 0:
|
|
return nil, errors.New("new container cleanup worker: interval must be positive")
|
|
}
|
|
|
|
clock := deps.Clock
|
|
if clock == nil {
|
|
clock = time.Now
|
|
}
|
|
logger := deps.Logger
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
|
|
return &Worker{
|
|
runtimeRecords: deps.RuntimeRecords,
|
|
cleanup: deps.Cleanup,
|
|
retention: deps.Retention,
|
|
interval: deps.Interval,
|
|
clock: clock,
|
|
logger: logger.With("worker", "rtmanager.containercleanup"),
|
|
}, nil
|
|
}
|
|
|
|
// Run drives the cleanup loop until ctx is cancelled. Per-tick errors
|
|
// are absorbed; the loop only exits on context cancellation.
|
|
func (worker *Worker) Run(ctx context.Context) error {
|
|
if worker == nil {
|
|
return errors.New("run container cleanup worker: nil worker")
|
|
}
|
|
if ctx == nil {
|
|
return errors.New("run container cleanup worker: nil context")
|
|
}
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
worker.logger.Info("container cleanup worker started",
|
|
"interval", worker.interval.String(),
|
|
"retention", worker.retention.String(),
|
|
)
|
|
defer worker.logger.Info("container cleanup worker stopped")
|
|
|
|
ticker := time.NewTicker(worker.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case <-ticker.C:
|
|
worker.tick(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Shutdown is a no-op; Run terminates on context cancellation.
|
|
func (worker *Worker) Shutdown(ctx context.Context) error {
|
|
if ctx == nil {
|
|
return errors.New("shutdown container cleanup worker: nil context")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Tick performs one cleanup pass. Exported so tests can drive the
|
|
// worker deterministically without spinning a real ticker.
|
|
func (worker *Worker) Tick(ctx context.Context) {
|
|
worker.tick(ctx)
|
|
}
|
|
|
|
// tick lists stopped records and delegates removal of expired ones to
|
|
// the cleanup service.
|
|
func (worker *Worker) tick(ctx context.Context) {
|
|
if err := ctx.Err(); err != nil {
|
|
return
|
|
}
|
|
|
|
records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
|
|
if err != nil {
|
|
worker.logger.WarnContext(ctx, "list stopped records",
|
|
"err", err.Error(),
|
|
)
|
|
return
|
|
}
|
|
|
|
threshold := worker.clock().Add(-worker.retention)
|
|
for _, record := range records {
|
|
if err := ctx.Err(); err != nil {
|
|
return
|
|
}
|
|
if !record.LastOpAt.Before(threshold) {
|
|
continue
|
|
}
|
|
|
|
result, err := worker.cleanup.Handle(ctx, cleanupcontainer.Input{
|
|
GameID: record.GameID,
|
|
OpSource: operation.OpSourceAutoTTL,
|
|
})
|
|
if err != nil {
|
|
worker.logger.ErrorContext(ctx, "cleanup handle returned error",
|
|
"game_id", record.GameID,
|
|
"err", err.Error(),
|
|
)
|
|
continue
|
|
}
|
|
if result.Outcome == operation.OutcomeFailure {
|
|
worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
|
|
"game_id", record.GameID,
|
|
"error_code", result.ErrorCode,
|
|
"error_message", result.ErrorMessage,
|
|
)
|
|
continue
|
|
}
|
|
worker.logger.InfoContext(ctx, "cleanup ttl removed container",
|
|
"game_id", record.GameID,
|
|
"error_code", result.ErrorCode,
|
|
)
|
|
}
|
|
}
|