package app

import (
	"context"
	"database/sql"
	"errors"
	"fmt"
	"log/slog"
	"time"

	"galaxy/postgres"
	"galaxy/redisconn"
	"galaxy/rtmanager/internal/adapters/postgres/migrations"
	"galaxy/rtmanager/internal/api/internalhttp"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/telemetry"

	dockerclient "github.com/docker/docker/client"
	"github.com/redis/go-redis/v9"
)

// Runtime owns the runnable Runtime Manager process plus the cleanup
// functions that release runtime resources after shutdown.
type Runtime struct {
	cfg config.Config
	// app is the composed process handed to Run; built last in NewRuntime.
	app    *App
	wiring *wiring
	// internalServer is exposed via InternalServer, primarily for tests.
	internalServer *internalhttp.Server
	// cleanupFns is appended to in construction order and executed in
	// reverse by Close, so dependents shut down before their dependencies.
	cleanupFns []func() error
}

// NewRuntime constructs the runnable Runtime Manager process from cfg.
//
// PostgreSQL migrations apply strictly before the internal HTTP listener
// becomes ready. The runtime opens one shared `*redis.Client`, one
// `*sql.DB`, one Docker SDK client, and one OpenTelemetry runtime; all
// are released in reverse construction order on shutdown.
//
// On any construction failure, everything already opened is torn down via
// cleanupOnError before the error is returned; a nil logger falls back to
// slog.Default().
func NewRuntime(ctx context.Context, cfg config.Config, logger *slog.Logger) (*Runtime, error) {
	if ctx == nil {
		return nil, errors.New("new rtmanager runtime: nil context")
	}
	if err := cfg.Validate(); err != nil {
		return nil, fmt.Errorf("new rtmanager runtime: %w", err)
	}
	if logger == nil {
		logger = slog.Default()
	}
	runtime := &Runtime{
		cfg: cfg,
	}
	// cleanupOnError runs the cleanup stack accumulated so far and folds any
	// cleanup failure into the construction error, so partial construction
	// never leaks connections.
	cleanupOnError := func(err error) (*Runtime, error) {
		if cleanupErr := runtime.Close(); cleanupErr != nil {
			return nil, fmt.Errorf("%w; cleanup: %w", err, cleanupErr)
		}
		return nil, err
	}
	// Telemetry comes up first so every later dependency can be instrumented
	// with its tracer/meter providers.
	telemetryRuntime, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
		ServiceName:          cfg.Telemetry.ServiceName,
		TracesExporter:       cfg.Telemetry.TracesExporter,
		MetricsExporter:      cfg.Telemetry.MetricsExporter,
		TracesProtocol:       cfg.Telemetry.TracesProtocol,
		MetricsProtocol:      cfg.Telemetry.MetricsProtocol,
		StdoutTracesEnabled:  cfg.Telemetry.StdoutTracesEnabled,
		StdoutMetricsEnabled: cfg.Telemetry.StdoutMetricsEnabled,
	}, logger)
	if err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: telemetry: %w", err))
	}
	runtime.cleanupFns = append(runtime.cleanupFns, func() error {
		// Telemetry shutdown flushes exporters, so it gets its own bounded
		// context rather than the (likely canceled) process context.
		shutdownCtx, cancel := context.WithTimeout(context.Background(), cfg.ShutdownTimeout)
		defer cancel()
		return telemetryRuntime.Shutdown(shutdownCtx)
	})
	redisClient := newRedisClient(cfg.Redis)
	if err := instrumentRedisClient(redisClient, telemetryRuntime); err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
	}
	runtime.cleanupFns = append(runtime.cleanupFns, func() error {
		err := redisClient.Close()
		// A double close reports redis.ErrClosed; treat it as success so
		// Close stays idempotent.
		if errors.Is(err, redis.ErrClosed) {
			return nil
		}
		return err
	})
	if err := pingRedis(ctx, cfg.Redis, redisClient); err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
	}
	pgPool, err := postgres.OpenPrimary(ctx, cfg.Postgres.Conn,
		postgres.WithTracerProvider(telemetryRuntime.TracerProvider()),
		postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
	)
	if err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: open postgres: %w", err))
	}
	runtime.cleanupFns = append(runtime.cleanupFns, pgPool.Close)
	unregisterPGStats, err := postgres.InstrumentDBStats(pgPool,
		postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
	)
	if err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: instrument postgres: %w", err))
	}
	runtime.cleanupFns = append(runtime.cleanupFns, func() error {
		return unregisterPGStats()
	})
	if err := postgres.Ping(ctx, pgPool, cfg.Postgres.Conn.OperationTimeout); err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: ping postgres: %w", err))
	}
	// Migrations run before the internal HTTP listener is constructed, which
	// upholds the "migrations before ready" guarantee documented above.
	if err := postgres.RunMigrations(ctx, pgPool, migrations.FS(), "."); err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: run postgres migrations: %w", err))
	}
	dockerClient, err := newDockerClient(cfg.Docker)
	if err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
	}
	runtime.cleanupFns = append(runtime.cleanupFns, dockerClient.Close)
	// NOTE(review): the Docker ping borrows the *Postgres* operation timeout.
	// This looks like a copy-paste; if cfg.Docker carries its own timeout it
	// should be used here — confirm against config.Config.
	if err := pingDocker(ctx, dockerClient, cfg.Postgres.Conn.OperationTimeout); err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
	}
	wiring, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
	if err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: wiring: %w", err))
	}
	runtime.wiring = wiring
	runtime.cleanupFns = append(runtime.cleanupFns, wiring.close)
	if err := wiring.registerTelemetryGauges(); err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: register telemetry gauges: %w", err))
	}
	// One synchronous reconcile before serving, so the listener never reports
	// ready with stale runtime state.
	if err := wiring.reconciler.ReconcileNow(ctx); err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: initial reconcile: %w", err))
	}
	probe := newReadinessProbe(pgPool, redisClient, dockerClient, cfg)
	internalServer, err := internalhttp.NewServer(internalhttp.Config{
		Addr:              cfg.InternalHTTP.Addr,
		ReadHeaderTimeout: cfg.InternalHTTP.ReadHeaderTimeout,
		ReadTimeout:       cfg.InternalHTTP.ReadTimeout,
		WriteTimeout:      cfg.InternalHTTP.WriteTimeout,
		IdleTimeout:       cfg.InternalHTTP.IdleTimeout,
	}, internalhttp.Dependencies{
		Logger:           logger,
		Telemetry:        telemetryRuntime,
		Readiness:        probe,
		RuntimeRecords:   wiring.runtimeRecordStore,
		StartRuntime:     wiring.startRuntimeService,
		StopRuntime:      wiring.stopRuntimeService,
		RestartRuntime:   wiring.restartRuntimeService,
		PatchRuntime:     wiring.patchRuntimeService,
		CleanupContainer: wiring.cleanupContainerService,
	})
	if err != nil {
		return cleanupOnError(fmt.Errorf("new rtmanager runtime: internal HTTP server: %w", err))
	}
	runtime.internalServer = internalServer
	// The internal server is not appended to cleanupFns: its lifecycle is
	// owned by app.Run, presumably shut down there — verify in App.
	runtime.app = New(cfg, internalServer,
		wiring.startJobsConsumer,
		wiring.stopJobsConsumer,
		wiring.dockerEventsListener,
		wiring.healthProbeWorker,
		wiring.dockerInspectWorker,
		wiring.reconciler,
		wiring.containerCleanupWorker,
	)
	return runtime, nil
}

// InternalServer returns the internal HTTP server owned by runtime. It is
// primarily exposed for tests; production code should not depend on it.
func (runtime *Runtime) InternalServer() *internalhttp.Server { if runtime == nil { return nil } return runtime.internalServer } // Run serves the internal HTTP listener until ctx is canceled or one // component fails. func (runtime *Runtime) Run(ctx context.Context) error { if ctx == nil { return errors.New("run rtmanager runtime: nil context") } if runtime == nil { return errors.New("run rtmanager runtime: nil runtime") } if runtime.app == nil { return errors.New("run rtmanager runtime: nil app") } return runtime.app.Run(ctx) } // Close releases every runtime dependency in reverse construction order. // Close is safe to call multiple times. func (runtime *Runtime) Close() error { if runtime == nil { return nil } var joined error for index := len(runtime.cleanupFns) - 1; index >= 0; index-- { if err := runtime.cleanupFns[index](); err != nil { joined = errors.Join(joined, err) } } runtime.cleanupFns = nil return joined } // readinessProbe pings every steady-state dependency the listener // guards: PostgreSQL primary, Redis master, the Docker daemon, plus // the configured Docker network's existence. type readinessProbe struct { pgPool *sql.DB redisClient *redis.Client dockerClient *dockerclient.Client postgresTimeout time.Duration redisTimeout time.Duration dockerTimeout time.Duration } func newReadinessProbe(pgPool *sql.DB, redisClient *redis.Client, dockerClient *dockerclient.Client, cfg config.Config) *readinessProbe { return &readinessProbe{ pgPool: pgPool, redisClient: redisClient, dockerClient: dockerClient, postgresTimeout: cfg.Postgres.Conn.OperationTimeout, redisTimeout: cfg.Redis.Conn.OperationTimeout, dockerTimeout: cfg.Postgres.Conn.OperationTimeout, } } // Check pings PostgreSQL, Redis, and Docker. The first failing // dependency aborts the check so callers see a single, actionable // error. 
func (probe *readinessProbe) Check(ctx context.Context) error { if err := postgres.Ping(ctx, probe.pgPool, probe.postgresTimeout); err != nil { return err } if err := redisconn.Ping(ctx, probe.redisClient, probe.redisTimeout); err != nil { return err } return pingDocker(ctx, probe.dockerClient, probe.dockerTimeout) }