feat: runtime manager
This commit is contained in:
@@ -0,0 +1,262 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"galaxy/postgres"
|
||||
"galaxy/redisconn"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/migrations"
|
||||
"galaxy/rtmanager/internal/api/internalhttp"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
dockerclient "github.com/docker/docker/client"
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// Runtime owns the runnable Runtime Manager process plus the cleanup
|
||||
// functions that release runtime resources after shutdown.
|
||||
type Runtime struct {
|
||||
cfg config.Config
|
||||
|
||||
app *App
|
||||
|
||||
wiring *wiring
|
||||
|
||||
internalServer *internalhttp.Server
|
||||
|
||||
cleanupFns []func() error
|
||||
}
|
||||
|
||||
// NewRuntime constructs the runnable Runtime Manager process from cfg.
|
||||
//
|
||||
// PostgreSQL migrations apply strictly before the internal HTTP listener
|
||||
// becomes ready. The runtime opens one shared `*redis.Client`, one
|
||||
// `*sql.DB`, one Docker SDK client, and one OpenTelemetry runtime; all
|
||||
// are released in reverse construction order on shutdown.
|
||||
func NewRuntime(ctx context.Context, cfg config.Config, logger *slog.Logger) (*Runtime, error) {
|
||||
if ctx == nil {
|
||||
return nil, errors.New("new rtmanager runtime: nil context")
|
||||
}
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager runtime: %w", err)
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
runtime := &Runtime{
|
||||
cfg: cfg,
|
||||
}
|
||||
|
||||
cleanupOnError := func(err error) (*Runtime, error) {
|
||||
if cleanupErr := runtime.Close(); cleanupErr != nil {
|
||||
return nil, fmt.Errorf("%w; cleanup: %w", err, cleanupErr)
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
telemetryRuntime, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
|
||||
ServiceName: cfg.Telemetry.ServiceName,
|
||||
TracesExporter: cfg.Telemetry.TracesExporter,
|
||||
MetricsExporter: cfg.Telemetry.MetricsExporter,
|
||||
TracesProtocol: cfg.Telemetry.TracesProtocol,
|
||||
MetricsProtocol: cfg.Telemetry.MetricsProtocol,
|
||||
StdoutTracesEnabled: cfg.Telemetry.StdoutTracesEnabled,
|
||||
StdoutMetricsEnabled: cfg.Telemetry.StdoutMetricsEnabled,
|
||||
}, logger)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: telemetry: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), cfg.ShutdownTimeout)
|
||||
defer cancel()
|
||||
return telemetryRuntime.Shutdown(shutdownCtx)
|
||||
})
|
||||
|
||||
redisClient := newRedisClient(cfg.Redis)
|
||||
if err := instrumentRedisClient(redisClient, telemetryRuntime); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
|
||||
err := redisClient.Close()
|
||||
if errors.Is(err, redis.ErrClosed) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
})
|
||||
if err := pingRedis(ctx, cfg.Redis, redisClient); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
|
||||
pgPool, err := postgres.OpenPrimary(ctx, cfg.Postgres.Conn,
|
||||
postgres.WithTracerProvider(telemetryRuntime.TracerProvider()),
|
||||
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
|
||||
)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: open postgres: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, pgPool.Close)
|
||||
unregisterPGStats, err := postgres.InstrumentDBStats(pgPool,
|
||||
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
|
||||
)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: instrument postgres: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
|
||||
return unregisterPGStats()
|
||||
})
|
||||
if err := postgres.Ping(ctx, pgPool, cfg.Postgres.Conn.OperationTimeout); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: ping postgres: %w", err))
|
||||
}
|
||||
if err := postgres.RunMigrations(ctx, pgPool, migrations.FS(), "."); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: run postgres migrations: %w", err))
|
||||
}
|
||||
|
||||
dockerClient, err := newDockerClient(cfg.Docker)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, dockerClient.Close)
|
||||
if err := pingDocker(ctx, dockerClient, cfg.Postgres.Conn.OperationTimeout); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
|
||||
wiring, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: wiring: %w", err))
|
||||
}
|
||||
runtime.wiring = wiring
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, wiring.close)
|
||||
if err := wiring.registerTelemetryGauges(); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: register telemetry gauges: %w", err))
|
||||
}
|
||||
|
||||
if err := wiring.reconciler.ReconcileNow(ctx); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: initial reconcile: %w", err))
|
||||
}
|
||||
|
||||
probe := newReadinessProbe(pgPool, redisClient, dockerClient, cfg)
|
||||
|
||||
internalServer, err := internalhttp.NewServer(internalhttp.Config{
|
||||
Addr: cfg.InternalHTTP.Addr,
|
||||
ReadHeaderTimeout: cfg.InternalHTTP.ReadHeaderTimeout,
|
||||
ReadTimeout: cfg.InternalHTTP.ReadTimeout,
|
||||
WriteTimeout: cfg.InternalHTTP.WriteTimeout,
|
||||
IdleTimeout: cfg.InternalHTTP.IdleTimeout,
|
||||
}, internalhttp.Dependencies{
|
||||
Logger: logger,
|
||||
Telemetry: telemetryRuntime,
|
||||
Readiness: probe,
|
||||
RuntimeRecords: wiring.runtimeRecordStore,
|
||||
StartRuntime: wiring.startRuntimeService,
|
||||
StopRuntime: wiring.stopRuntimeService,
|
||||
RestartRuntime: wiring.restartRuntimeService,
|
||||
PatchRuntime: wiring.patchRuntimeService,
|
||||
CleanupContainer: wiring.cleanupContainerService,
|
||||
})
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: internal HTTP server: %w", err))
|
||||
}
|
||||
runtime.internalServer = internalServer
|
||||
|
||||
runtime.app = New(cfg,
|
||||
internalServer,
|
||||
wiring.startJobsConsumer,
|
||||
wiring.stopJobsConsumer,
|
||||
wiring.dockerEventsListener,
|
||||
wiring.healthProbeWorker,
|
||||
wiring.dockerInspectWorker,
|
||||
wiring.reconciler,
|
||||
wiring.containerCleanupWorker,
|
||||
)
|
||||
|
||||
return runtime, nil
|
||||
}
|
||||
|
||||
// InternalServer returns the internal HTTP server owned by runtime. It is
|
||||
// primarily exposed for tests; production code should not depend on it.
|
||||
func (runtime *Runtime) InternalServer() *internalhttp.Server {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return runtime.internalServer
|
||||
}
|
||||
|
||||
// Run serves the internal HTTP listener until ctx is canceled or one
|
||||
// component fails.
|
||||
func (runtime *Runtime) Run(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("run rtmanager runtime: nil context")
|
||||
}
|
||||
if runtime == nil {
|
||||
return errors.New("run rtmanager runtime: nil runtime")
|
||||
}
|
||||
if runtime.app == nil {
|
||||
return errors.New("run rtmanager runtime: nil app")
|
||||
}
|
||||
|
||||
return runtime.app.Run(ctx)
|
||||
}
|
||||
|
||||
// Close releases every runtime dependency in reverse construction order.
|
||||
// Close is safe to call multiple times.
|
||||
func (runtime *Runtime) Close() error {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var joined error
|
||||
for index := len(runtime.cleanupFns) - 1; index >= 0; index-- {
|
||||
if err := runtime.cleanupFns[index](); err != nil {
|
||||
joined = errors.Join(joined, err)
|
||||
}
|
||||
}
|
||||
runtime.cleanupFns = nil
|
||||
|
||||
return joined
|
||||
}
|
||||
|
||||
// readinessProbe pings every steady-state dependency the listener
|
||||
// guards: PostgreSQL primary, Redis master, the Docker daemon, plus
|
||||
// the configured Docker network's existence.
|
||||
type readinessProbe struct {
|
||||
pgPool *sql.DB
|
||||
redisClient *redis.Client
|
||||
dockerClient *dockerclient.Client
|
||||
|
||||
postgresTimeout time.Duration
|
||||
redisTimeout time.Duration
|
||||
dockerTimeout time.Duration
|
||||
}
|
||||
|
||||
func newReadinessProbe(pgPool *sql.DB, redisClient *redis.Client, dockerClient *dockerclient.Client, cfg config.Config) *readinessProbe {
|
||||
return &readinessProbe{
|
||||
pgPool: pgPool,
|
||||
redisClient: redisClient,
|
||||
dockerClient: dockerClient,
|
||||
postgresTimeout: cfg.Postgres.Conn.OperationTimeout,
|
||||
redisTimeout: cfg.Redis.Conn.OperationTimeout,
|
||||
dockerTimeout: cfg.Postgres.Conn.OperationTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
// Check pings PostgreSQL, Redis, and Docker. The first failing
|
||||
// dependency aborts the check so callers see a single, actionable
|
||||
// error.
|
||||
func (probe *readinessProbe) Check(ctx context.Context) error {
|
||||
if err := postgres.Ping(ctx, probe.pgPool, probe.postgresTimeout); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := redisconn.Ping(ctx, probe.redisClient, probe.redisTimeout); err != nil {
|
||||
return err
|
||||
}
|
||||
return pingDocker(ctx, probe.dockerClient, probe.dockerTimeout)
|
||||
}
|
||||
Reference in New Issue
Block a user