feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,204 @@
// Package containercleanup ships the periodic TTL-cleanup worker
// described in `rtmanager/README.md §Lifecycles → Cleanup`.
//
// On every tick the worker lists `runtime_records.status='stopped'`
// rows whose `last_op_at` is older than the configured retention
// (`RTMANAGER_CONTAINER_RETENTION_DAYS`) and delegates removal to
// `cleanupcontainer.Service.Handle` with `op_source=auto_ttl`. The
// service owns the per-game lease, the Docker `Remove` call, the
// status transition, the telemetry counter, and the operation_log
// entry; this worker is intentionally tiny — a ticker plus a TTL
// filter.
//
// Idempotent outcomes (`replay_no_op`, `conflict`) are absorbed; a
// failure on one game does not abort the rest of the pass.
//
// Design rationale is captured in
// `rtmanager/docs/workers.md`.
package containercleanup
import (
"context"
"errors"
"log/slog"
"time"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/cleanupcontainer"
)
// Cleaner is the narrow surface the worker uses to remove stopped
// containers. The production `*cleanupcontainer.Service` satisfies
// this interface verbatim; the package keeps the surface here so
// tests can substitute a fake without spinning the full service.
// (Consumer-side interface, per Go convention: defined where it is
// used, not where it is implemented.)
type Cleaner interface {
	// Handle removes the container identified by input under the
	// per-game lease and reports the operation outcome. Idempotent
	// outcomes (replay/conflict) are returned in Result, not as errors.
	Handle(ctx context.Context, input cleanupcontainer.Input) (cleanupcontainer.Result, error)
}
// Dependencies groups the collaborators required by Worker. All
// fields except Clock and Logger are mandatory; NewWorker validates
// them and rejects zero/negative durations.
type Dependencies struct {
	// RuntimeRecords lists `status=stopped` records on every tick.
	RuntimeRecords ports.RuntimeRecordStore
	// Cleanup performs the actual container removal under the per-game
	// lease.
	Cleanup Cleaner
	// Retention is the TTL after which a stopped container becomes a
	// removal candidate. Mirrors `cfg.Container.Retention`. Must be > 0.
	Retention time.Duration
	// Interval bounds the tick period. Mirrors
	// `cfg.Cleanup.CleanupInterval`. Must be > 0.
	Interval time.Duration
	// Clock supplies the wall-clock used to compute the TTL threshold.
	// Defaults to `time.Now` when nil; injectable for deterministic tests.
	Clock func() time.Time
	// Logger receives structured worker-level events. Defaults to
	// `slog.Default()` when nil.
	Logger *slog.Logger
}
// Worker drives the periodic TTL-cleanup loop. Construct it with
// NewWorker; the zero value is not usable (nil collaborators).
// Fields mirror Dependencies after validation and defaulting.
type Worker struct {
	runtimeRecords ports.RuntimeRecordStore
	cleanup        Cleaner
	retention      time.Duration
	interval       time.Duration
	clock          func() time.Time
	logger         *slog.Logger
}
// NewWorker validates deps, applies the Clock/Logger defaults, and
// returns a ready-to-run Worker. It fails fast on a nil store, a nil
// cleanup service, or a non-positive retention or interval so that
// misconfiguration surfaces at wiring time rather than mid-loop.
func NewWorker(deps Dependencies) (*Worker, error) {
	if deps.RuntimeRecords == nil {
		return nil, errors.New("new container cleanup worker: nil runtime records store")
	}
	if deps.Cleanup == nil {
		return nil, errors.New("new container cleanup worker: nil cleanup service")
	}
	if deps.Retention <= 0 {
		return nil, errors.New("new container cleanup worker: retention must be positive")
	}
	if deps.Interval <= 0 {
		return nil, errors.New("new container cleanup worker: interval must be positive")
	}

	// Optional collaborators fall back to sane defaults.
	now := deps.Clock
	if now == nil {
		now = time.Now
	}
	log := deps.Logger
	if log == nil {
		log = slog.Default()
	}

	worker := &Worker{
		runtimeRecords: deps.RuntimeRecords,
		cleanup:        deps.Cleanup,
		retention:      deps.Retention,
		interval:       deps.Interval,
		clock:          now,
		logger:         log.With("worker", "rtmanager.containercleanup"),
	}
	return worker, nil
}
// Run executes the cleanup loop until ctx is cancelled and then
// returns the context's error. Failures inside a single pass are
// logged and absorbed by tick, so no tick can terminate the loop.
// Note the first pass happens one full interval after start, not
// immediately (time.Ticker semantics).
func (worker *Worker) Run(ctx context.Context) error {
	switch {
	case worker == nil:
		return errors.New("run container cleanup worker: nil worker")
	case ctx == nil:
		return errors.New("run container cleanup worker: nil context")
	}
	// Bail out before logging anything if the caller handed us an
	// already-cancelled context.
	if err := ctx.Err(); err != nil {
		return err
	}

	worker.logger.Info("container cleanup worker started",
		"interval", worker.interval.String(),
		"retention", worker.retention.String(),
	)
	defer worker.logger.Info("container cleanup worker stopped")

	ticker := time.NewTicker(worker.interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			worker.tick(ctx)
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// Shutdown satisfies the graceful-shutdown contract but does no work:
// the worker owns no resources outside Run's stack frame, and Run
// already exits on context cancellation. Only a nil ctx is rejected.
func (worker *Worker) Shutdown(ctx context.Context) error {
	if ctx != nil {
		return nil
	}
	return errors.New("shutdown container cleanup worker: nil context")
}
// Tick runs a single cleanup pass synchronously. It exists purely so
// tests can drive the worker deterministically instead of waiting on
// the real ticker inside Run.
func (worker *Worker) Tick(ctx context.Context) { worker.tick(ctx) }
// tick performs one cleanup pass: it lists all stopped runtime
// records, computes the retention cutoff once, and hands every
// expired record to the cleanup service with op_source=auto_ttl.
// Per-game failures are logged and skipped so one bad game never
// starves the rest of the pass; cancellation is re-checked between
// games so a long list does not outlive the context.
func (worker *Worker) tick(ctx context.Context) {
	if ctx.Err() != nil {
		return
	}

	records, err := worker.runtimeRecords.ListByStatus(ctx, runtime.StatusStopped)
	if err != nil {
		worker.logger.WarnContext(ctx, "list stopped records",
			"err", err.Error(),
		)
		return
	}

	// A record is expired when its last operation predates the cutoff.
	cutoff := worker.clock().Add(-worker.retention)
	for _, record := range records {
		if ctx.Err() != nil {
			return
		}
		if !record.LastOpAt.Before(cutoff) {
			continue // still within retention
		}

		input := cleanupcontainer.Input{
			GameID:   record.GameID,
			OpSource: operation.OpSourceAutoTTL,
		}
		result, err := worker.cleanup.Handle(ctx, input)
		switch {
		case err != nil:
			worker.logger.ErrorContext(ctx, "cleanup handle returned error",
				"game_id", record.GameID,
				"err", err.Error(),
			)
		case result.Outcome == operation.OutcomeFailure:
			// Failure outcomes are data, not errors: log and move on.
			worker.logger.InfoContext(ctx, "cleanup ttl pass: failure outcome",
				"game_id", record.GameID,
				"error_code", result.ErrorCode,
				"error_message", result.ErrorMessage,
			)
		default:
			worker.logger.InfoContext(ctx, "cleanup ttl removed container",
				"game_id", record.GameID,
				"error_code", result.ErrorCode,
			)
		}
	}
}