feat: runtime manager
@@ -0,0 +1,204 @@
// Package containercleanup ships the periodic TTL-cleanup worker
// described in `rtmanager/README.md §Lifecycles → Cleanup`.
//
// On every tick the worker lists `runtime_records.status='stopped'`
// rows whose `last_op_at` is older than the configured retention
// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
// service owns the per-game lease, the Docker `Remove` call, the
// status transition, the telemetry counter, and the operation_log
// entry; this worker is intentionally tiny: a ticker plus a TTL
// filter.
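//
// One pass reduces to the following sketch (names mirror the fields
// and calls defined below; this is a summary, not a verbatim excerpt
// of tick):
//
//	threshold := clock().Add(-retention)
//	recs, _ := store.ListByStatus(ctx, runtime.StatusStopped)
//	for _, rec := range recs {
//		if rec.LastOpAt.Before(threshold) {
//			_, _ = cleanup.Handle(ctx, cleanupcontainer.Input{
//				GameID:   rec.GameID,
//				OpSource: operation.OpSourceAutoTTL,
//			})
//		}
//	}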
//
// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
// failure on one game does not abort the rest of the pass.
//
// Design rationale is captured in `rtmanager/docs/workers.md`.
package containercleanup

import (
	"context"
	"errors"
	"log/slog"
	"time"

	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
)

// Cleaner is the narrow surface the worker uses to remove stopped
// containers. The production `*cleanupcontainer.Service` satisfies
// this interface verbatim; the package keeps the surface here so
// tests can substitute a fake without spinning up the full service.
type Cleaner interface {
	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
}
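
// The doc comment above asserts that `*cleanupcontainer.Service`
// implements Cleaner; this compile-time check makes the claim
// mechanical rather than aspirational.
var _ Cleaner = (*cleanupcontainer.Service)(nil)

// A test fake can be as small as the following sketch (illustrative
// only; the real fakes live in the package tests):
//
//	type fakeCleaner struct{ inputs []cleanupcontainer.Input }
//
//	func (f *fakeCleaner) Handle(ctx context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) {
//		f.inputs = append(f.inputs, in)
//		return cleanupcontainer.Result{}, nil
//	}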

// Dependencies groups the collaborators required by Worker.
type Dependencies struct {
	// RuntimeRecords lists `status=stopped` records on every tick.
	RuntimeRecords ports.RuntimeRecordStore

	// Cleanup performs the actual container removal under the
	// per-game lease.
	Cleanup Cleaner

	// Retention is the TTL after which a stopped container becomes a
	// removal candidate. Mirrors `cfg.Container.Retention`.
	Retention time.Duration

	// Interval bounds the tick period. Mirrors
	// `cfg.Cleanup.CleanupInterval`.
	Interval time.Duration

	// Clock supplies the wall-clock used to compute the TTL threshold.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time

	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}

// Worker drives the periodic TTL-cleanup loop.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	cleanup        Cleaner

	retention time.Duration
	interval  time.Duration

	clock  func() time.Time
	logger *slog.Logger
}

// NewWorker constructs one Worker from deps.
func NewWorker(deps Dependencies) (*Worker, error) {
	switch {
	case deps.RuntimeRecords == nil:
		return nil, errors.New("new container cleanup worker: nil runtime records store")
	case deps.Cleanup == nil:
		return nil, errors.New("new container cleanup worker: nil cleanup service")
	case deps.Retention <= 0:
		return nil, errors.New("new container cleanup worker: retention must be positive")
	case deps.Interval <= 0:
		return nil, errors.New("new container cleanup worker: interval must be positive")
	}

	clock := deps.Clock
	if clock == nil {
		clock = time.Now
	}
	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	return &Worker{
		runtimeRecords: deps.RuntimeRecords,
		cleanup:        deps.Cleanup,
		retention:      deps.Retention,
		interval:       deps.Interval,
		clock:          clock,
		logger:         logger.With("worker", "rtmanager.containercleanup"),
	}, nil
}
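
// Wiring, in sketch form (recordStore and cleanupService stand in for
// the real adapters; the durations are illustrative, not the shipped
// defaults):
//
//	worker, err := containercleanup.NewWorker(containercleanup.Dependencies{
//		RuntimeRecords: recordStore,
//		Cleanup:        cleanupService,
//		Retention:      72 * time.Hour,
//		Interval:       time.Hour,
//	})
//	if err != nil {
//		return fmt.Errorf("wire cleanup worker: %w", err)
//	}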

// Run drives the cleanup loop until ctx is cancelled. Per-tick errors
// are absorbed; the loop only exits on context cancellation.
func (worker *Worker) Run(ctx context.Context) error {
	if worker == nil {
		return errors.New("run container cleanup worker: nil worker")
	}
	if ctx == nil {
		return errors.New("run container cleanup worker: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("container cleanup worker started",
		"interval", worker.interval.String(),
		"retention", worker.retention.String(),
	)
	defer worker.logger.Info("container cleanup worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			worker.tick(ctx)
		}
	}
}
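
// Run blocks, so callers own the goroutine and the stop signal; one
// possible shutdown sequence, as a sketch (an errgroup would work just
// as well, it simply is not assumed here):
//
//	ctx, cancel := context.WithCancel(context.Background())
//	done := make(chan error, 1)
//	go func() { done <- worker.Run(ctx) }()
//	// ... later, on shutdown:
//	cancel()
//	<-done // Run returns ctx.Err() once the loop observes cancellation.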

// Shutdown is a no-op; Run terminates on context cancellation.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown container cleanup worker: nil context")
	}
	return nil
}

// Tick performs one cleanup pass. Exported so tests can drive the
// worker deterministically without spinning up a real ticker.
func (worker *Worker) Tick(ctx context.Context) {
	worker.tick(ctx)
}
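
// A deterministic test pass, as a sketch (fixed is whatever time.Time
// the test picks; fakeStore and fakeCleaner are stand-ins like the one
// sketched next to the Cleaner interface above):
//
//	w, _ := NewWorker(Dependencies{
//		RuntimeRecords: fakeStore,
//		Cleanup:        &fakeCleaner{},
//		Retention:      time.Hour,
//		Interval:       time.Minute,
//		Clock:          func() time.Time { return fixed },
//	})
//	w.Tick(context.Background())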

// tick lists stopped records and delegates removal of expired ones to
// the cleanup service.
func (worker *Worker) tick(ctx context.Context) {
	if err := ctx.Err(); err != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
	if err != nil {
		worker.logger.WarnContext(ctx, "list stopped records",
			"err", err.Error(),
		)
		return
	}

	threshold := worker.clock().Add(-worker.retention)
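	// Concretely: with a retention of 72h and a clock reading of
	// 2024-01-10T00:00:00Z (example values, not config defaults), the
	// threshold is 2024-01-07T00:00:00Z; only records whose LastOpAt
	// falls strictly before it proceed past the filter below.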
	for _, record := range records {
		if err := ctx.Err(); err != nil {
			return
		}
		if !record.LastOpAt.Before(threshold) {
			continue
		}

		result, err := worker.cleanup.Handle(ctx, cleanupcontainer.Input{
			GameID:   record.GameID,
			OpSource: operation.OpSourceAutoTTL,
		})
		if err != nil {
			worker.logger.ErrorContext(ctx, "cleanup handle returned error",
				"game_id", record.GameID,
				"err", err.Error(),
			)
			continue
		}
		if result.Outcome == operation.OutcomeFailure {
			worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
				"game_id", record.GameID,
				"error_code", result.ErrorCode,
				"error_message", result.ErrorMessage,
			)
			continue
		}
		worker.logger.InfoContext(ctx, "cleanup ttl removed container",
			"game_id", record.GameID,
			"outcome", result.Outcome,
		)
	}
}