package app

import (
	"context"
	"database/sql"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"time"

	"galaxy/rtmanager/internal/adapters/docker"
	"galaxy/rtmanager/internal/adapters/healtheventspublisher"
	"galaxy/rtmanager/internal/adapters/jobresultspublisher"
	"galaxy/rtmanager/internal/adapters/lobbyclient"
	"galaxy/rtmanager/internal/adapters/notificationpublisher"
	"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
	"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
	"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
	"galaxy/rtmanager/internal/adapters/redisstate/gamelease"
	"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
	"galaxy/rtmanager/internal/config"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/cleanupcontainer"
	"galaxy/rtmanager/internal/service/patchruntime"
	"galaxy/rtmanager/internal/service/restartruntime"
	"galaxy/rtmanager/internal/service/startruntime"
	"galaxy/rtmanager/internal/service/stopruntime"
	"galaxy/rtmanager/internal/telemetry"
	"galaxy/rtmanager/internal/worker/containercleanup"
	"galaxy/rtmanager/internal/worker/dockerevents"
	"galaxy/rtmanager/internal/worker/dockerinspect"
	"galaxy/rtmanager/internal/worker/healthprobe"
	"galaxy/rtmanager/internal/worker/reconcile"
	"galaxy/rtmanager/internal/worker/startjobsconsumer"
	"galaxy/rtmanager/internal/worker/stopjobsconsumer"

	dockerclient "github.com/docker/docker/client"
	"github.com/redis/go-redis/v9"
	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)

// wiring owns the process-level singletons constructed once during
// `NewRuntime` and consumed by every worker and HTTP handler.
//
// The struct exposes typed accessors so callers can grab the store /
// adapter / service singletons without depending on internal fields.
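//
// One such accessor might look like this (an illustrative sketch; the
// real accessors live elsewhere in this package and may differ):
//
//	func (w *wiring) RuntimeRecordStore() *runtimerecordstore.Store {
//		return w.runtimeRecordStore
//	}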
type wiring struct {
	cfg config.Config

	redisClient  *redis.Client
	pgPool       *sql.DB
	dockerClient *dockerclient.Client

	clock func() time.Time

	logger    *slog.Logger
	telemetry *telemetry.Runtime

	// Persistence stores.
	runtimeRecordStore  *runtimerecordstore.Store
	operationLogStore   *operationlogstore.Store
	healthSnapshotStore *healthsnapshotstore.Store
	streamOffsetStore   *streamoffsets.Store
	gameLeaseStore      *gamelease.Store

	// External adapters.
	dockerAdapter         *docker.Client
	lobbyClient           *lobbyclient.Client
	notificationPublisher *notificationpublisher.Publisher
	healthEventsPublisher *healtheventspublisher.Publisher
	jobResultsPublisher   *jobresultspublisher.Publisher

	// Service layer.
	startRuntimeService     *startruntime.Service
	stopRuntimeService      *stopruntime.Service
	restartRuntimeService   *restartruntime.Service
	patchRuntimeService     *patchruntime.Service
	cleanupContainerService *cleanupcontainer.Service

	// Worker layer.
	startJobsConsumer      *startjobsconsumer.Consumer
	stopJobsConsumer       *stopjobsconsumer.Consumer
	dockerEventsListener   *dockerevents.Listener
	healthProbeWorker      *healthprobe.Worker
	dockerInspectWorker    *dockerinspect.Worker
	reconciler             *reconcile.Reconciler
	containerCleanupWorker *containercleanup.Worker

	// closers releases adapter-level resources at runtime shutdown.
	closers []func() error
}

// newWiring constructs the process-level dependency set, the persistence
// stores, the external adapters, the service layer, and the worker layer.
// It validates every required collaborator so callers can rely on them
// being non-nil.
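//
// Illustrative call site (a sketch; the real construction happens inside
// `NewRuntime` in internal/app/runtime.go and may differ):
//
//	w, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
//	if err != nil {
//		return nil, fmt.Errorf("build wiring: %w", err)
//	}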
func newWiring(
	cfg config.Config,
	redisClient *redis.Client,
	pgPool *sql.DB,
	dockerClient *dockerclient.Client,
	clock func() time.Time,
	logger *slog.Logger,
	telemetryRuntime *telemetry.Runtime,
) (*wiring, error) {
	if redisClient == nil {
		return nil, errors.New("new rtmanager wiring: nil redis client")
	}
	if pgPool == nil {
		return nil, errors.New("new rtmanager wiring: nil postgres pool")
	}
	if dockerClient == nil {
		return nil, errors.New("new rtmanager wiring: nil docker client")
	}
	if clock == nil {
		clock = time.Now
	}
	if logger == nil {
		logger = slog.Default()
	}
	if telemetryRuntime == nil {
		return nil, errors.New("new rtmanager wiring: nil telemetry runtime")
	}

	w := &wiring{
		cfg:          cfg,
		redisClient:  redisClient,
		pgPool:       pgPool,
		dockerClient: dockerClient,
		clock:        clock,
		logger:       logger,
		telemetry:    telemetryRuntime,
	}

	if err := w.buildPersistence(); err != nil {
		return nil, fmt.Errorf("new rtmanager wiring: %w", err)
	}
	if err := w.buildAdapters(); err != nil {
		_ = w.close()
		return nil, fmt.Errorf("new rtmanager wiring: %w", err)
	}
	if err := w.buildServices(); err != nil {
		_ = w.close()
		return nil, fmt.Errorf("new rtmanager wiring: %w", err)
	}
	if err := w.buildWorkers(); err != nil {
		_ = w.close()
		return nil, fmt.Errorf("new rtmanager wiring: %w", err)
	}
	return w, nil
}

func (w *wiring) buildPersistence() error {
	runtimeStore, err := runtimerecordstore.New(runtimerecordstore.Config{
		DB:               w.pgPool,
		OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
	})
	if err != nil {
		return fmt.Errorf("runtime record store: %w", err)
	}
	w.runtimeRecordStore = runtimeStore

	operationStore, err := operationlogstore.New(operationlogstore.Config{
		DB:               w.pgPool,
		OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
	})
	if err != nil {
		return fmt.Errorf("operation log store: %w", err)
	}
	w.operationLogStore = operationStore

	snapshotStore, err := healthsnapshotstore.New(healthsnapshotstore.Config{
		DB:               w.pgPool,
		OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
	})
	if err != nil {
		return fmt.Errorf("health snapshot store: %w", err)
	}
	w.healthSnapshotStore = snapshotStore

	offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: w.redisClient})
	if err != nil {
		return fmt.Errorf("stream offset store: %w", err)
	}
	w.streamOffsetStore = offsetStore

	leaseStore, err := gamelease.New(gamelease.Config{Client: w.redisClient})
	if err != nil {
		return fmt.Errorf("game lease store: %w", err)
	}
	w.gameLeaseStore = leaseStore

	return nil
}

func (w *wiring) buildAdapters() error {
	dockerAdapter, err := docker.NewClient(docker.Config{
		Docker:    w.dockerClient,
		LogDriver: w.cfg.Docker.LogDriver,
		LogOpts:   w.cfg.Docker.LogOpts,
		Clock:     w.clock,
	})
	if err != nil {
		return fmt.Errorf("docker adapter: %w", err)
	}
	w.dockerAdapter = dockerAdapter

	lobby, err := lobbyclient.NewClient(lobbyclient.Config{
		BaseURL:        w.cfg.Lobby.BaseURL,
		RequestTimeout: w.cfg.Lobby.Timeout,
	})
	if err != nil {
		return fmt.Errorf("lobby client: %w", err)
	}
	w.lobbyClient = lobby
	w.closers = append(w.closers, lobby.Close)

	notificationPub, err := notificationpublisher.NewPublisher(notificationpublisher.Config{
		Client: w.redisClient,
		Stream: w.cfg.Streams.NotificationIntents,
	})
	if err != nil {
		return fmt.Errorf("notification publisher: %w", err)
	}
	w.notificationPublisher = notificationPub

	healthPub, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
		Client:    w.redisClient,
		Snapshots: w.healthSnapshotStore,
		Stream:    w.cfg.Streams.HealthEvents,
	})
	if err != nil {
		return fmt.Errorf("health events publisher: %w", err)
	}
	w.healthEventsPublisher = healthPub

	jobResultsPub, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
		Client: w.redisClient,
		Stream: w.cfg.Streams.JobResults,
	})
	if err != nil {
		return fmt.Errorf("job results publisher: %w", err)
	}
	w.jobResultsPublisher = jobResultsPub

	return nil
}

func (w *wiring) buildServices() error {
	startService, err := startruntime.NewService(startruntime.Dependencies{
		RuntimeRecords: w.runtimeRecordStore,
		OperationLogs:  w.operationLogStore,
		Docker:         w.dockerAdapter,
		Leases:         w.gameLeaseStore,
		HealthEvents:   w.healthEventsPublisher,
		Notifications:  w.notificationPublisher,
		Lobby:          w.lobbyClient,
		Container:      w.cfg.Container,
		DockerCfg:      w.cfg.Docker,
		Coordination:   w.cfg.Coordination,
		Telemetry:      w.telemetry,
		Logger:         w.logger,
		Clock:          w.clock,
	})
	if err != nil {
		return fmt.Errorf("start runtime service: %w", err)
	}
	w.startRuntimeService = startService

	stopService, err := stopruntime.NewService(stopruntime.Dependencies{
		RuntimeRecords: w.runtimeRecordStore,
		OperationLogs:  w.operationLogStore,
		Docker:         w.dockerAdapter,
		Leases:         w.gameLeaseStore,
		HealthEvents:   w.healthEventsPublisher,
		Container:      w.cfg.Container,
		Coordination:   w.cfg.Coordination,
		Telemetry:      w.telemetry,
		Logger:         w.logger,
		Clock:          w.clock,
	})
	if err != nil {
		return fmt.Errorf("stop runtime service: %w", err)
	}
	w.stopRuntimeService = stopService

	restartService, err := restartruntime.NewService(restartruntime.Dependencies{
		RuntimeRecords: w.runtimeRecordStore,
		OperationLogs:  w.operationLogStore,
		Docker:         w.dockerAdapter,
		Leases:         w.gameLeaseStore,
		StopService:    stopService,
		StartService:   startService,
		Coordination:   w.cfg.Coordination,
		Telemetry:      w.telemetry,
		Logger:         w.logger,
		Clock:          w.clock,
	})
	if err != nil {
		return fmt.Errorf("restart runtime service: %w", err)
	}
	w.restartRuntimeService = restartService

	patchService, err := patchruntime.NewService(patchruntime.Dependencies{
		RuntimeRecords: w.runtimeRecordStore,
		OperationLogs:  w.operationLogStore,
		Docker:         w.dockerAdapter,
		Leases:         w.gameLeaseStore,
		StopService:    stopService,
		StartService:   startService,
		Coordination:   w.cfg.Coordination,
		Telemetry:      w.telemetry,
		Logger:         w.logger,
		Clock:          w.clock,
	})
	if err != nil {
		return fmt.Errorf("patch runtime service: %w", err)
	}
	w.patchRuntimeService = patchService

	cleanupService, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
		RuntimeRecords: w.runtimeRecordStore,
		OperationLogs:  w.operationLogStore,
		Docker:         w.dockerAdapter,
		Leases:         w.gameLeaseStore,
		Coordination:   w.cfg.Coordination,
		Telemetry:      w.telemetry,
		Logger:         w.logger,
		Clock:          w.clock,
	})
	if err != nil {
		return fmt.Errorf("cleanup container service: %w", err)
	}
	w.cleanupContainerService = cleanupService

	return nil
}

// buildWorkers constructs the asynchronous Lobby ↔ RTM stream consumers
// and the background workers (Docker events listener, health probe,
// Docker inspect, reconciler, and container cleanup). They participate in
// the process lifecycle as `app.Component`s; `internal/app/runtime.go`
// passes them into `app.New` alongside the internal HTTP server.
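//
// An illustrative hand-off (a sketch; the exact component list and
// ordering live in internal/app/runtime.go and may differ):
//
//	components := []Component{
//		w.startJobsConsumer,
//		w.stopJobsConsumer,
//		w.dockerEventsListener,
//		w.healthProbeWorker,
//		w.dockerInspectWorker,
//		w.reconciler,
//		w.containerCleanupWorker,
//	}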
func (w *wiring) buildWorkers() error {
	startConsumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
		Client:       w.redisClient,
		Stream:       w.cfg.Streams.StartJobs,
		BlockTimeout: w.cfg.Streams.BlockTimeout,
		StartService: w.startRuntimeService,
		JobResults:   w.jobResultsPublisher,
		OffsetStore:  w.streamOffsetStore,
		Logger:       w.logger,
	})
	if err != nil {
		return fmt.Errorf("start jobs consumer: %w", err)
	}
	w.startJobsConsumer = startConsumer

	stopConsumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
		Client:       w.redisClient,
		Stream:       w.cfg.Streams.StopJobs,
		BlockTimeout: w.cfg.Streams.BlockTimeout,
		StopService:  w.stopRuntimeService,
		JobResults:   w.jobResultsPublisher,
		OffsetStore:  w.streamOffsetStore,
		Logger:       w.logger,
	})
	if err != nil {
		return fmt.Errorf("stop jobs consumer: %w", err)
	}
	w.stopJobsConsumer = stopConsumer

	eventsListener, err := dockerevents.NewListener(dockerevents.Dependencies{
		Docker:         w.dockerAdapter,
		RuntimeRecords: w.runtimeRecordStore,
		HealthEvents:   w.healthEventsPublisher,
		Telemetry:      w.telemetry,
		Clock:          w.clock,
		Logger:         w.logger,
	})
	if err != nil {
		return fmt.Errorf("docker events listener: %w", err)
	}
	w.dockerEventsListener = eventsListener

	probeHTTPClient, err := newProbeHTTPClient(w.telemetry)
	if err != nil {
		return fmt.Errorf("health probe http client: %w", err)
	}
	probeWorker, err := healthprobe.NewWorker(healthprobe.Dependencies{
		RuntimeRecords:    w.runtimeRecordStore,
		HealthEvents:      w.healthEventsPublisher,
		HTTPClient:        probeHTTPClient,
		Telemetry:         w.telemetry,
		Interval:          w.cfg.Health.ProbeInterval,
		ProbeTimeout:      w.cfg.Health.ProbeTimeout,
		FailuresThreshold: w.cfg.Health.ProbeFailuresThreshold,
		Clock:             w.clock,
		Logger:            w.logger,
	})
	if err != nil {
		return fmt.Errorf("health probe worker: %w", err)
	}
	w.healthProbeWorker = probeWorker

	inspectWorker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
		Docker:         w.dockerAdapter,
		RuntimeRecords: w.runtimeRecordStore,
		HealthEvents:   w.healthEventsPublisher,
		Telemetry:      w.telemetry,
		Interval:       w.cfg.Health.InspectInterval,
		Clock:          w.clock,
		Logger:         w.logger,
	})
	if err != nil {
		return fmt.Errorf("docker inspect worker: %w", err)
	}
	w.dockerInspectWorker = inspectWorker

	reconciler, err := reconcile.NewReconciler(reconcile.Dependencies{
		Docker:         w.dockerAdapter,
		RuntimeRecords: w.runtimeRecordStore,
		OperationLogs:  w.operationLogStore,
		HealthEvents:   w.healthEventsPublisher,
		Leases:         w.gameLeaseStore,
		Telemetry:      w.telemetry,
		DockerCfg:      w.cfg.Docker,
		ContainerCfg:   w.cfg.Container,
		Coordination:   w.cfg.Coordination,
		Interval:       w.cfg.Cleanup.ReconcileInterval,
		Clock:          w.clock,
		Logger:         w.logger,
	})
	if err != nil {
		return fmt.Errorf("reconciler: %w", err)
	}
	w.reconciler = reconciler

	cleanupWorker, err := containercleanup.NewWorker(containercleanup.Dependencies{
		RuntimeRecords: w.runtimeRecordStore,
		Cleanup:        w.cleanupContainerService,
		Retention:      w.cfg.Container.Retention,
		Interval:       w.cfg.Cleanup.CleanupInterval,
		Clock:          w.clock,
		Logger:         w.logger,
	})
	if err != nil {
		return fmt.Errorf("container cleanup worker: %w", err)
	}
	w.containerCleanupWorker = cleanupWorker

	return nil
}

// newProbeHTTPClient constructs the otelhttp-instrumented HTTP client
// the active health probe uses to call the engine's `/healthz` endpoint.
// It clones the default transport so caller-provided transports stay
// isolated from production wiring (mirroring the lobby internal client).
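//
// Illustrative use (a sketch; the health probe worker owns the real call
// sites, and the probe URL shown here is hypothetical):
//
//	client, err := newProbeHTTPClient(telemetryRuntime)
//	if err != nil {
//		return err
//	}
//	resp, err := client.Get("http://127.0.0.1:8080/healthz")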
func newProbeHTTPClient(telemetryRuntime *telemetry.Runtime) (*http.Client, error) {
	transport, ok := http.DefaultTransport.(*http.Transport)
	if !ok {
		return nil, errors.New("default http transport is not *http.Transport")
	}
	cloned := transport.Clone()
	instrumented := otelhttp.NewTransport(cloned,
		otelhttp.WithTracerProvider(telemetryRuntime.TracerProvider()),
		otelhttp.WithMeterProvider(telemetryRuntime.MeterProvider()),
	)
	return &http.Client{Transport: instrumented}, nil
}

// registerTelemetryGauges installs the runtime-records-by-status gauge
// callback so the telemetry runtime can observe the persistent store
// without holding a strong reference to the wiring.
func (w *wiring) registerTelemetryGauges() error {
	probe := newRuntimeRecordsProbe(w.runtimeRecordStore)
	return w.telemetry.RegisterGauges(telemetry.GaugeDependencies{
		RuntimeRecordsByStatus: probe,
		Logger:                 w.logger,
	})
}

// close releases adapter-level resources owned by the wiring layer, in
// reverse registration order. It returns the joined error of every closer;
// the caller is expected to invoke it once during process shutdown.
func (w *wiring) close() error {
	var joined error
	for index := len(w.closers) - 1; index >= 0; index-- {
		if err := w.closers[index](); err != nil {
			joined = errors.Join(joined, err)
		}
	}
	w.closers = nil
	return joined
}

// runtimeRecordsProbe adapts runtimerecordstore.Store to
// telemetry.RuntimeRecordsByStatusProbe by translating the typed status
// keys into the string keys the gauge expects.
type runtimeRecordsProbe struct {
	store *runtimerecordstore.Store
}

func newRuntimeRecordsProbe(store *runtimerecordstore.Store) *runtimeRecordsProbe {
	return &runtimeRecordsProbe{store: store}
}

func (p *runtimeRecordsProbe) CountByStatus(ctx context.Context) (map[string]int, error) {
	if p == nil || p.store == nil {
		return nil, errors.New("runtime records probe: nil store")
	}
	counts, err := p.store.CountByStatus(ctx)
	if err != nil {
		return nil, err
	}
	out := make(map[string]int, len(counts))
	for status, count := range counts {
		out[string(status)] = count
	}
	return out, nil
}

// Compile-time assertions that the constructed adapters satisfy the
// expected port surfaces; these prevent silent regressions when a
// port shape changes.
var (
	_ ports.RuntimeRecordStore          = (*runtimerecordstore.Store)(nil)
	_ ports.OperationLogStore           = (*operationlogstore.Store)(nil)
	_ ports.HealthSnapshotStore         = (*healthsnapshotstore.Store)(nil)
	_ ports.StreamOffsetStore           = (*streamoffsets.Store)(nil)
	_ ports.GameLeaseStore              = (*gamelease.Store)(nil)
	_ ports.DockerClient                = (*docker.Client)(nil)
	_ ports.LobbyInternalClient         = (*lobbyclient.Client)(nil)
	_ ports.NotificationIntentPublisher = (*notificationpublisher.Publisher)(nil)
	_ ports.HealthEventPublisher        = (*healtheventspublisher.Publisher)(nil)
	_ ports.JobResultPublisher          = (*jobresultspublisher.Publisher)(nil)

	_ Component                = (*reconcile.Reconciler)(nil)
	_ Component                = (*containercleanup.Worker)(nil)
	_ containercleanup.Cleaner = (*cleanupcontainer.Service)(nil)
)