feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
+632
View File
@@ -0,0 +1,632 @@
// Package config loads the Runtime Manager process configuration from
// environment variables.
package config
import (
"fmt"
"strings"
"time"
"galaxy/postgres"
"galaxy/redisconn"
"galaxy/rtmanager/internal/telemetry"
)
const (
// envPrefix is the prefix handed to postgres.LoadFromEnv and
// redisconn.LoadFromEnv when the shared connection settings are loaded.
envPrefix = "RTMANAGER"
// Process lifecycle and logging.
shutdownTimeoutEnvVar = "RTMANAGER_SHUTDOWN_TIMEOUT"
logLevelEnvVar = "RTMANAGER_LOG_LEVEL"
// Internal HTTP listener.
internalHTTPAddrEnvVar = "RTMANAGER_INTERNAL_HTTP_ADDR"
internalHTTPReadHeaderTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_HEADER_TIMEOUT"
internalHTTPReadTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_TIMEOUT"
internalHTTPWriteTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_WRITE_TIMEOUT"
internalHTTPIdleTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_IDLE_TIMEOUT"
// Docker client and image handling.
dockerHostEnvVar = "RTMANAGER_DOCKER_HOST"
dockerAPIVersionEnvVar = "RTMANAGER_DOCKER_API_VERSION"
dockerNetworkEnvVar = "RTMANAGER_DOCKER_NETWORK"
dockerLogDriverEnvVar = "RTMANAGER_DOCKER_LOG_DRIVER"
dockerLogOptsEnvVar = "RTMANAGER_DOCKER_LOG_OPTS"
imagePullPolicyEnvVar = "RTMANAGER_IMAGE_PULL_POLICY"
// Per-container resource defaults and lifecycle. The stop timeout and
// retention variables carry whole numbers of seconds and days respectively.
defaultCPUQuotaEnvVar = "RTMANAGER_DEFAULT_CPU_QUOTA"
defaultMemoryEnvVar = "RTMANAGER_DEFAULT_MEMORY"
defaultPIDsLimitEnvVar = "RTMANAGER_DEFAULT_PIDS_LIMIT"
containerStopTimeoutSecondsEnvVar = "RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS"
containerRetentionDaysEnvVar = "RTMANAGER_CONTAINER_RETENTION_DAYS"
// Engine and per-game state directory. RTMANAGER_GAME_STATE_ROOT has no
// default; LoadFromEnv fails when it is unset or blank.
engineStateMountPathEnvVar = "RTMANAGER_ENGINE_STATE_MOUNT_PATH"
engineStateEnvNameEnvVar = "RTMANAGER_ENGINE_STATE_ENV_NAME"
gameStateDirModeEnvVar = "RTMANAGER_GAME_STATE_DIR_MODE"
gameStateOwnerUIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_UID"
gameStateOwnerGIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_GID"
gameStateRootEnvVar = "RTMANAGER_GAME_STATE_ROOT"
// Redis stream names.
startJobsStreamEnvVar = "RTMANAGER_REDIS_START_JOBS_STREAM"
stopJobsStreamEnvVar = "RTMANAGER_REDIS_STOP_JOBS_STREAM"
jobResultsStreamEnvVar = "RTMANAGER_REDIS_JOB_RESULTS_STREAM"
healthEventsStreamEnvVar = "RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"
// NOTE(review): unlike its siblings this identifier has no `Var` suffix and
// the variable name has no `REDIS_` segment. Confirm against the README
// before renaming either one.
notificationIntentsStreamEnv = "RTMANAGER_NOTIFICATION_INTENTS_STREAM"
streamBlockTimeoutEnvVar = "RTMANAGER_STREAM_BLOCK_TIMEOUT"
// Health-monitoring workers.
inspectIntervalEnvVar = "RTMANAGER_INSPECT_INTERVAL"
probeIntervalEnvVar = "RTMANAGER_PROBE_INTERVAL"
probeTimeoutEnvVar = "RTMANAGER_PROBE_TIMEOUT"
probeFailuresThresholdEnvVar = "RTMANAGER_PROBE_FAILURES_THRESHOLD"
// Reconciler and cleanup workers.
reconcileIntervalEnvVar = "RTMANAGER_RECONCILE_INTERVAL"
cleanupIntervalEnvVar = "RTMANAGER_CLEANUP_INTERVAL"
// Per-game lease, expressed as a whole number of seconds.
gameLeaseTTLSecondsEnvVar = "RTMANAGER_GAME_LEASE_TTL_SECONDS"
// Lobby internal REST client. RTMANAGER_LOBBY_INTERNAL_BASE_URL has no
// default; LoadFromEnv fails when it is unset or blank.
lobbyInternalBaseURLEnvVar = "RTMANAGER_LOBBY_INTERNAL_BASE_URL"
lobbyInternalTimeoutEnvVar = "RTMANAGER_LOBBY_INTERNAL_TIMEOUT"
// OpenTelemetry. The OTEL_* names are read without the RTMANAGER prefix;
// only the two stdout toggles are prefixed.
otelServiceNameEnvVar = "OTEL_SERVICE_NAME"
otelTracesExporterEnvVar = "OTEL_TRACES_EXPORTER"
otelMetricsExporterEnvVar = "OTEL_METRICS_EXPORTER"
otelExporterOTLPProtocolEnvVar = "OTEL_EXPORTER_OTLP_PROTOCOL"
otelExporterOTLPTracesProtocolEnvVar = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"
otelExporterOTLPMetricsProtocolEnvVar = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"
otelStdoutTracesEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_TRACES_ENABLED"
otelStdoutMetricsEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_METRICS_ENABLED"
// Defaults used by DefaultConfig. LoadFromEnv keeps each of them when the
// matching environment variable is not set.
defaultShutdownTimeout = 30 * time.Second
defaultLogLevel = "info"
defaultInternalHTTPAddr = ":8096"
defaultReadHeaderTimeout = 2 * time.Second
defaultReadTimeout = 5 * time.Second
defaultWriteTimeout = 15 * time.Second
defaultIdleTimeout = 60 * time.Second
defaultDockerHost = "unix:///var/run/docker.sock"
defaultDockerNetwork = "galaxy-net"
defaultDockerLogDriver = "json-file"
defaultImagePullPolicy = ImagePullPolicyIfMissing
defaultCPUQuota = 1.0
defaultMemory = "512m"
defaultPIDsLimit = 512
defaultContainerStopTimeout = 30 * time.Second
// 30 days.
defaultContainerRetention = 30 * 24 * time.Hour
defaultEngineStateMountPath = "/var/lib/galaxy-game"
defaultEngineStateEnvName = "GAME_STATE_PATH"
// rwxr-x--- in unix permission notation.
defaultGameStateDirMode = 0o750
defaultStartJobsStream = "runtime:start_jobs"
defaultStopJobsStream = "runtime:stop_jobs"
defaultJobResultsStream = "runtime:job_results"
defaultHealthEventsStream = "runtime:health_events"
defaultNotificationIntentsKey = "notification:intents"
defaultStreamBlockTimeout = 5 * time.Second
defaultInspectInterval = 30 * time.Second
defaultProbeInterval = 15 * time.Second
defaultProbeTimeout = 2 * time.Second
defaultProbeFailuresThreshold = 3
defaultReconcileInterval = 5 * time.Minute
defaultCleanupInterval = time.Hour
defaultGameLeaseTTL = 60 * time.Second
defaultLobbyInternalTimeout = 2 * time.Second
defaultOTelServiceName = "galaxy-rtmanager"
)
// ImagePullPolicy names one of the supported image pull policies. The start
// service validates a producer-supplied `image_ref` against this policy at
// start time.
type ImagePullPolicy string

// The policies below form the complete supported set, which is frozen by
// `rtmanager/README.md` §Configuration.
const (
	ImagePullPolicyIfMissing ImagePullPolicy = "if_missing"
	ImagePullPolicyAlways    ImagePullPolicy = "always"
	ImagePullPolicyNever     ImagePullPolicy = "never"
)

// Validate returns nil when p is one of the frozen pull policies and an
// error listing the accepted values otherwise.
func (p ImagePullPolicy) Validate() error {
	supported := p == ImagePullPolicyIfMissing ||
		p == ImagePullPolicyAlways ||
		p == ImagePullPolicyNever
	if supported {
		return nil
	}
	return fmt.Errorf("image pull policy %q must be one of %q, %q, %q",
		p, ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever)
}
// Config stores the full Runtime Manager process configuration.
//
// DefaultConfig returns a value populated with the package defaults and
// LoadFromEnv overlays environment variables on top of it. Each nested
// section is a separate struct type.
type Config struct {
// ShutdownTimeout bounds graceful shutdown of every long-lived
// component.
ShutdownTimeout time.Duration
// Logging configures the process-wide structured logger.
Logging LoggingConfig
// InternalHTTP configures the trusted internal HTTP listener that
// serves probes and the GM/Admin REST surface.
InternalHTTP InternalHTTPConfig
// Docker configures the Docker SDK client RTM uses to drive the local
// Docker daemon.
Docker DockerConfig
// Postgres configures the PostgreSQL-backed durable store consumed via
// `pkg/postgres`.
Postgres PostgresConfig
// Redis configures the shared Redis connection topology consumed via
// `pkg/redisconn`.
Redis RedisConfig
// Streams stores the stable Redis Stream names RTM reads from and
// writes to.
Streams StreamsConfig
// Container stores the per-container defaults applied at start time
// when the resolved image does not declare its own labels.
Container ContainerConfig
// Health configures the periodic health-monitoring workers (events
// listener, inspect, active probe).
Health HealthConfig
// Cleanup configures the reconciler and container-cleanup workers.
Cleanup CleanupConfig
// Coordination configures the per-game Redis lease used to serialise
// operations across all entry points.
Coordination CoordinationConfig
// Lobby configures the synchronous Lobby internal REST client used by
// the start service for ancillary lookups.
Lobby LobbyConfig
// Telemetry configures the process-wide OpenTelemetry runtime.
Telemetry TelemetryConfig
}
// LoggingConfig configures the process-wide structured logger.
type LoggingConfig struct {
// Level stores the process log level accepted by log/slog. It defaults
// to "info" and is overridden by RTMANAGER_LOG_LEVEL when that variable
// is set.
Level string
}
// InternalHTTPConfig holds the settings of the trusted internal HTTP
// listener.
type InternalHTTPConfig struct {
	// Addr is the TCP listen address in host:port form.
	Addr string
	// ReadHeaderTimeout limits the time spent reading request headers.
	ReadHeaderTimeout time.Duration
	// ReadTimeout limits the time spent reading one request.
	ReadTimeout time.Duration
	// WriteTimeout limits the time spent writing one response.
	WriteTimeout time.Duration
	// IdleTimeout limits how long keep-alive connections stay open.
	IdleTimeout time.Duration
}

// Validate checks that cfg describes a usable internal HTTP listener: a
// non-blank host:port address and strictly positive timeouts. The first
// violation found is returned.
func (cfg InternalHTTPConfig) Validate() error {
	if strings.TrimSpace(cfg.Addr) == "" {
		return fmt.Errorf("internal HTTP addr must not be empty")
	}
	if !isTCPAddr(cfg.Addr) {
		return fmt.Errorf("internal HTTP addr %q must use host:port form", cfg.Addr)
	}
	if cfg.ReadHeaderTimeout <= 0 {
		return fmt.Errorf("internal HTTP read header timeout must be positive")
	}
	if cfg.ReadTimeout <= 0 {
		return fmt.Errorf("internal HTTP read timeout must be positive")
	}
	if cfg.WriteTimeout <= 0 {
		return fmt.Errorf("internal HTTP write timeout must be positive")
	}
	if cfg.IdleTimeout <= 0 {
		return fmt.Errorf("internal HTTP idle timeout must be positive")
	}
	return nil
}
// DockerConfig holds the settings of the Docker SDK client.
type DockerConfig struct {
	// Host is the Docker daemon endpoint (e.g.
	// `unix:///var/run/docker.sock`).
	Host string
	// APIVersion overrides the Docker API version. Empty lets the SDK
	// negotiate.
	APIVersion string
	// Network is the user-defined Docker bridge network containers
	// attach to. Provisioned outside RTM; missing network is a fail-fast
	// condition at startup.
	Network string
	// LogDriver is the Docker logging driver applied to engine
	// containers.
	LogDriver string
	// LogOpts holds the comma-separated `key=value` driver options.
	LogOpts string
	// PullPolicy is the configured image pull policy.
	PullPolicy ImagePullPolicy
}

// Validate checks that Host, Network and LogDriver are non-blank and then
// defers to PullPolicy.Validate. APIVersion and LogOpts are not checked.
func (cfg DockerConfig) Validate() error {
	if strings.TrimSpace(cfg.Host) == "" {
		return fmt.Errorf("docker host must not be empty")
	}
	if strings.TrimSpace(cfg.Network) == "" {
		return fmt.Errorf("docker network must not be empty")
	}
	if strings.TrimSpace(cfg.LogDriver) == "" {
		return fmt.Errorf("docker log driver must not be empty")
	}
	return cfg.PullPolicy.Validate()
}
// PostgresConfig configures the PostgreSQL-backed durable store consumed
// via `pkg/postgres`.
type PostgresConfig struct {
// Conn carries the primary plus replica DSN topology and pool tuning.
// DefaultConfig seeds it from postgres.DefaultConfig; LoadFromEnv
// replaces it with the result of postgres.LoadFromEnv.
Conn postgres.Config
}
// Validate reports whether cfg stores a usable PostgreSQL configuration.
// It delegates entirely to Conn.Validate and adds no checks of its own.
func (cfg PostgresConfig) Validate() error {
return cfg.Conn.Validate()
}
// RedisConfig configures the Runtime Manager Redis connection topology.
type RedisConfig struct {
// Conn carries the connection topology (master, replicas, password,
// db, per-call timeout). DefaultConfig seeds it from
// redisconn.DefaultConfig; LoadFromEnv replaces it with the result of
// redisconn.LoadFromEnv.
Conn redisconn.Config
}
// Validate reports whether cfg stores a usable Redis configuration.
// It delegates entirely to Conn.Validate and adds no checks of its own.
func (cfg RedisConfig) Validate() error {
return cfg.Conn.Validate()
}
// StreamsConfig holds the stable Redis Stream names used by Runtime
// Manager together with the consumer blocking window.
type StreamsConfig struct {
	// StartJobs is the Redis Streams key Lobby writes start jobs to.
	StartJobs string
	// StopJobs is the Redis Streams key Lobby writes stop jobs to.
	StopJobs string
	// JobResults is the Redis Streams key RTM writes job outcomes to.
	JobResults string
	// HealthEvents is the Redis Streams key RTM publishes technical
	// health events to.
	HealthEvents string
	// NotificationIntents is the Redis Streams key RTM publishes
	// admin-only notification intents to.
	NotificationIntents string
	// BlockTimeout caps the blocking read window of stream consumers.
	BlockTimeout time.Duration
}

// Validate checks that every stream name is non-blank and that
// BlockTimeout is strictly positive. Fields are checked in declaration
// order and the first violation is returned.
func (cfg StreamsConfig) Validate() error {
	isBlank := func(s string) bool { return strings.TrimSpace(s) == "" }

	if isBlank(cfg.StartJobs) {
		return fmt.Errorf("redis start jobs stream must not be empty")
	}
	if isBlank(cfg.StopJobs) {
		return fmt.Errorf("redis stop jobs stream must not be empty")
	}
	if isBlank(cfg.JobResults) {
		return fmt.Errorf("redis job results stream must not be empty")
	}
	if isBlank(cfg.HealthEvents) {
		return fmt.Errorf("redis health events stream must not be empty")
	}
	if isBlank(cfg.NotificationIntents) {
		return fmt.Errorf("redis notification intents stream must not be empty")
	}
	if cfg.BlockTimeout <= 0 {
		return fmt.Errorf("redis stream block timeout must be positive")
	}
	return nil
}
// ContainerConfig stores the per-container defaults applied at start
// time. Resource defaults apply when the resolved engine image does not
// expose `com.galaxy.cpu_quota` / `com.galaxy.memory` /
// `com.galaxy.pids_limit` labels.
type ContainerConfig struct {
	// DefaultCPUQuota is the fallback `--cpus` value applied when the
	// image does not declare `com.galaxy.cpu_quota`. It must be a
	// positive, finite number.
	DefaultCPUQuota float64
	// DefaultMemory is the fallback `--memory` value applied when the
	// image does not declare `com.galaxy.memory`.
	DefaultMemory string
	// DefaultPIDsLimit is the fallback `--pids-limit` value applied
	// when the image does not declare `com.galaxy.pids_limit`.
	DefaultPIDsLimit int
	// StopTimeout bounds graceful container stop before Docker fires
	// SIGKILL.
	StopTimeout time.Duration
	// Retention stores the TTL after which `status=stopped` containers
	// are removed by the cleanup worker.
	Retention time.Duration
	// EngineStateMountPath is the in-container path the per-game state
	// directory is bind-mounted to.
	EngineStateMountPath string
	// EngineStateEnvName is the env-var name forwarded to the engine
	// pointing at EngineStateMountPath.
	EngineStateEnvName string
	// GameStateDirMode stores the unix permissions applied to the
	// per-game state directory on creation.
	GameStateDirMode uint32
	// GameStateOwnerUID stores the unix uid applied to the per-game
	// state directory on creation.
	GameStateOwnerUID int
	// GameStateOwnerGID stores the unix gid applied to the per-game
	// state directory on creation.
	GameStateOwnerGID int
	// GameStateRoot is the host path under which per-game state
	// directories are created.
	GameStateRoot string
}

// Validate reports whether cfg stores usable container defaults.
//
// Fields are checked in declaration order and the first violation is
// returned. DefaultCPUQuota must be positive and finite: NaN and +Inf are
// rejected because strconv.ParseFloat, which the environment loader uses,
// accepts the literals "NaN" and "Inf", and a plain `<= 0` comparison lets
// both through. GameStateOwnerUID and GameStateOwnerGID are not checked.
func (cfg ContainerConfig) Validate() error {
	// Largest finite float64 (the value of math.MaxFloat64); only +Inf
	// compares greater than it.
	const maxFiniteFloat64 = 0x1p1023 * (1 + (1 - 0x1p-52))

	switch {
	// Written as a negated `>` so that NaN, for which every ordered
	// comparison is false, is rejected together with zero and negatives.
	case !(cfg.DefaultCPUQuota > 0):
		return fmt.Errorf("default cpu quota must be positive")
	case cfg.DefaultCPUQuota > maxFiniteFloat64:
		return fmt.Errorf("default cpu quota must be finite")
	case strings.TrimSpace(cfg.DefaultMemory) == "":
		return fmt.Errorf("default memory must not be empty")
	case cfg.DefaultPIDsLimit <= 0:
		return fmt.Errorf("default pids limit must be positive")
	case cfg.StopTimeout <= 0:
		return fmt.Errorf("container stop timeout must be positive")
	case cfg.Retention <= 0:
		return fmt.Errorf("container retention must be positive")
	case strings.TrimSpace(cfg.EngineStateMountPath) == "":
		return fmt.Errorf("engine state mount path must not be empty")
	case strings.TrimSpace(cfg.EngineStateEnvName) == "":
		return fmt.Errorf("engine state env name must not be empty")
	case cfg.GameStateDirMode == 0:
		return fmt.Errorf("game state dir mode must be non-zero")
	case strings.TrimSpace(cfg.GameStateRoot) == "":
		return fmt.Errorf("game state root must not be empty")
	case !strings.HasPrefix(strings.TrimSpace(cfg.GameStateRoot), "/"):
		return fmt.Errorf("game state root %q must be an absolute path", cfg.GameStateRoot)
	default:
		return nil
	}
}
// HealthConfig holds the settings of the periodic health-monitoring
// workers (Docker events listener, periodic inspect, active probe).
type HealthConfig struct {
	// InspectInterval is the pause between two periodic Docker inspect
	// passes.
	InspectInterval time.Duration
	// ProbeInterval is the pause between two engine `/healthz` probe
	// rounds.
	ProbeInterval time.Duration
	// ProbeTimeout caps one engine `/healthz` request.
	ProbeTimeout time.Duration
	// ProbeFailuresThreshold is the number of consecutive failures that
	// triggers a `probe_failed` event.
	ProbeFailuresThreshold int
}

// Validate checks that every interval, the probe timeout and the failure
// threshold are strictly positive, returning the first violation found.
func (cfg HealthConfig) Validate() error {
	if cfg.InspectInterval <= 0 {
		return fmt.Errorf("inspect interval must be positive")
	}
	if cfg.ProbeInterval <= 0 {
		return fmt.Errorf("probe interval must be positive")
	}
	if cfg.ProbeTimeout <= 0 {
		return fmt.Errorf("probe timeout must be positive")
	}
	if cfg.ProbeFailuresThreshold <= 0 {
		return fmt.Errorf("probe failures threshold must be positive")
	}
	return nil
}
// CleanupConfig holds the settings of the reconciler and
// container-cleanup workers.
type CleanupConfig struct {
	// ReconcileInterval is the pause between two reconciler passes.
	ReconcileInterval time.Duration
	// CleanupInterval is the pause between two container-cleanup
	// passes.
	CleanupInterval time.Duration
}

// Validate checks that both intervals are strictly positive, reporting
// the reconcile interval first.
func (cfg CleanupConfig) Validate() error {
	if cfg.ReconcileInterval <= 0 {
		return fmt.Errorf("reconcile interval must be positive")
	}
	if cfg.CleanupInterval <= 0 {
		return fmt.Errorf("cleanup interval must be positive")
	}
	return nil
}
// CoordinationConfig holds the settings of the per-game Redis lease.
type CoordinationConfig struct {
	// GameLeaseTTL is the per-game lease lifetime, renewed every
	// half-TTL while an operation runs.
	GameLeaseTTL time.Duration
}

// Validate accepts cfg only when GameLeaseTTL is strictly positive.
func (cfg CoordinationConfig) Validate() error {
	if cfg.GameLeaseTTL > 0 {
		return nil
	}
	return fmt.Errorf("game lease ttl must be positive")
}
// LobbyConfig holds the settings of the synchronous Lobby internal REST
// client.
type LobbyConfig struct {
	// BaseURL is the trusted Lobby internal listener base URL.
	BaseURL string
	// Timeout caps one Lobby internal request.
	Timeout time.Duration
}

// Validate checks that BaseURL is a non-blank absolute http(s) URL and
// that Timeout is strictly positive, returning the first violation found.
func (cfg LobbyConfig) Validate() error {
	if strings.TrimSpace(cfg.BaseURL) == "" {
		return fmt.Errorf("lobby internal base url must not be empty")
	}
	if !isHTTPURL(cfg.BaseURL) {
		return fmt.Errorf("lobby internal base url %q must be an absolute http(s) URL", cfg.BaseURL)
	}
	if cfg.Timeout <= 0 {
		return fmt.Errorf("lobby internal timeout must be positive")
	}
	return nil
}
// TelemetryConfig holds the settings of the Runtime Manager OpenTelemetry
// runtime.
type TelemetryConfig struct {
	// ServiceName overrides the default OpenTelemetry service name.
	ServiceName string
	// TracesExporter selects the external traces exporter. Supported
	// values are `none` and `otlp`.
	TracesExporter string
	// MetricsExporter selects the external metrics exporter. Supported
	// values are `none` and `otlp`.
	MetricsExporter string
	// TracesProtocol selects the OTLP traces protocol when
	// TracesExporter is `otlp`.
	TracesProtocol string
	// MetricsProtocol selects the OTLP metrics protocol when
	// MetricsExporter is `otlp`.
	MetricsProtocol string
	// StdoutTracesEnabled turns on the additional stdout trace exporter
	// used for local development and debugging.
	StdoutTracesEnabled bool
	// StdoutMetricsEnabled turns on the additional stdout metric
	// exporter used for local development and debugging.
	StdoutMetricsEnabled bool
}

// Validate copies cfg field by field into a telemetry.ProcessConfig and
// returns the result of that value's Validate method.
func (cfg TelemetryConfig) Validate() error {
	process := telemetry.ProcessConfig{
		ServiceName:          cfg.ServiceName,
		TracesExporter:       cfg.TracesExporter,
		MetricsExporter:      cfg.MetricsExporter,
		TracesProtocol:       cfg.TracesProtocol,
		MetricsProtocol:      cfg.MetricsProtocol,
		StdoutTracesEnabled:  cfg.StdoutTracesEnabled,
		StdoutMetricsEnabled: cfg.StdoutMetricsEnabled,
	}
	return process.Validate()
}
// DefaultConfig builds a Config populated with the package defaults.
//
// Container.GameStateRoot and Lobby.BaseURL have no default and are left
// empty, as are Docker.APIVersion, Docker.LogOpts, the game state owner
// uid/gid and the OTLP protocols. Both telemetry exporters start as "none".
func DefaultConfig() Config {
	var cfg Config

	cfg.ShutdownTimeout = defaultShutdownTimeout
	cfg.Logging.Level = defaultLogLevel

	cfg.InternalHTTP.Addr = defaultInternalHTTPAddr
	cfg.InternalHTTP.ReadHeaderTimeout = defaultReadHeaderTimeout
	cfg.InternalHTTP.ReadTimeout = defaultReadTimeout
	cfg.InternalHTTP.WriteTimeout = defaultWriteTimeout
	cfg.InternalHTTP.IdleTimeout = defaultIdleTimeout

	cfg.Docker.Host = defaultDockerHost
	cfg.Docker.Network = defaultDockerNetwork
	cfg.Docker.LogDriver = defaultDockerLogDriver
	cfg.Docker.PullPolicy = defaultImagePullPolicy

	cfg.Postgres.Conn = postgres.DefaultConfig()
	cfg.Redis.Conn = redisconn.DefaultConfig()

	cfg.Streams.StartJobs = defaultStartJobsStream
	cfg.Streams.StopJobs = defaultStopJobsStream
	cfg.Streams.JobResults = defaultJobResultsStream
	cfg.Streams.HealthEvents = defaultHealthEventsStream
	cfg.Streams.NotificationIntents = defaultNotificationIntentsKey
	cfg.Streams.BlockTimeout = defaultStreamBlockTimeout

	cfg.Container.DefaultCPUQuota = defaultCPUQuota
	cfg.Container.DefaultMemory = defaultMemory
	cfg.Container.DefaultPIDsLimit = defaultPIDsLimit
	cfg.Container.StopTimeout = defaultContainerStopTimeout
	cfg.Container.Retention = defaultContainerRetention
	cfg.Container.EngineStateMountPath = defaultEngineStateMountPath
	cfg.Container.EngineStateEnvName = defaultEngineStateEnvName
	cfg.Container.GameStateDirMode = defaultGameStateDirMode

	cfg.Health.InspectInterval = defaultInspectInterval
	cfg.Health.ProbeInterval = defaultProbeInterval
	cfg.Health.ProbeTimeout = defaultProbeTimeout
	cfg.Health.ProbeFailuresThreshold = defaultProbeFailuresThreshold

	cfg.Cleanup.ReconcileInterval = defaultReconcileInterval
	cfg.Cleanup.CleanupInterval = defaultCleanupInterval

	cfg.Coordination.GameLeaseTTL = defaultGameLeaseTTL

	cfg.Lobby.Timeout = defaultLobbyInternalTimeout

	cfg.Telemetry.ServiceName = defaultOTelServiceName
	cfg.Telemetry.TracesExporter = "none"
	cfg.Telemetry.MetricsExporter = "none"

	return cfg
}
+142
View File
@@ -0,0 +1,142 @@
package config
import (
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// validEnv sets the environment variables the LoadFromEnv tests build on:
// the Postgres primary DSN, the Redis master address and password, the
// game state root and the Lobby internal base URL. Values are restored by
// t.Setenv when the test finishes.
func validEnv(t *testing.T) {
	t.Helper()

	baseline := [][2]string{
		{"RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy?search_path=rtmanager&sslmode=disable"},
		{"RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379"},
		{"RTMANAGER_REDIS_PASSWORD", "secret"},
		{"RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games"},
		{"RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095"},
	}
	for _, pair := range baseline {
		t.Setenv(pair[0], pair[1])
	}
}
// TestLoadFromEnvAcceptsDefaults loads the configuration with only the
// validEnv variables set and checks that every other inspected field
// carries its documented default.
func TestLoadFromEnvAcceptsDefaults(t *testing.T) {
	validEnv(t)

	got, loadErr := LoadFromEnv()
	require.NoError(t, loadErr)

	// Listener and Docker defaults.
	require.Equal(t, ":8096", got.InternalHTTP.Addr)
	require.Equal(t, "unix:///var/run/docker.sock", got.Docker.Host)
	require.Equal(t, "galaxy-net", got.Docker.Network)
	require.Equal(t, "json-file", got.Docker.LogDriver)
	require.Equal(t, ImagePullPolicyIfMissing, got.Docker.PullPolicy)

	// Stream names.
	require.Equal(t, "runtime:start_jobs", got.Streams.StartJobs)
	require.Equal(t, "runtime:stop_jobs", got.Streams.StopJobs)
	require.Equal(t, "runtime:job_results", got.Streams.JobResults)
	require.Equal(t, "runtime:health_events", got.Streams.HealthEvents)
	require.Equal(t, "notification:intents", got.Streams.NotificationIntents)

	// Container defaults.
	require.Equal(t, 30*time.Second, got.Container.StopTimeout)
	require.Equal(t, 30*24*time.Hour, got.Container.Retention)
	require.Equal(t, "/var/lib/galaxy-game", got.Container.EngineStateMountPath)
	require.Equal(t, "GAME_STATE_PATH", got.Container.EngineStateEnvName)
	require.EqualValues(t, 0o750, got.Container.GameStateDirMode)

	// Coordination, Lobby and telemetry.
	require.Equal(t, 60*time.Second, got.Coordination.GameLeaseTTL)
	require.Equal(t, "http://lobby:8095", got.Lobby.BaseURL)
	require.Equal(t, 2*time.Second, got.Lobby.Timeout)
	require.Equal(t, "galaxy-rtmanager", got.Telemetry.ServiceName)
}
// TestLoadFromEnvHonoursOverrides sets a handful of optional variables on
// top of validEnv and checks that each one replaces its default.
func TestLoadFromEnvHonoursOverrides(t *testing.T) {
	validEnv(t)

	overrides := [][2]string{
		{"RTMANAGER_INTERNAL_HTTP_ADDR", ":9000"},
		{"RTMANAGER_DOCKER_NETWORK", "custom-net"},
		{"RTMANAGER_IMAGE_PULL_POLICY", "always"},
		{"RTMANAGER_REDIS_START_JOBS_STREAM", "custom:start_jobs"},
		{"RTMANAGER_GAME_LEASE_TTL_SECONDS", "120"},
		{"RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS", "45"},
		{"RTMANAGER_CONTAINER_RETENTION_DAYS", "7"},
		{"RTMANAGER_GAME_STATE_DIR_MODE", "0700"},
	}
	for _, pair := range overrides {
		t.Setenv(pair[0], pair[1])
	}

	got, loadErr := LoadFromEnv()
	require.NoError(t, loadErr)

	require.Equal(t, ":9000", got.InternalHTTP.Addr)
	require.Equal(t, "custom-net", got.Docker.Network)
	require.Equal(t, ImagePullPolicyAlways, got.Docker.PullPolicy)
	require.Equal(t, "custom:start_jobs", got.Streams.StartJobs)
	require.Equal(t, 120*time.Second, got.Coordination.GameLeaseTTL)
	require.Equal(t, 45*time.Second, got.Container.StopTimeout)
	require.Equal(t, 7*24*time.Hour, got.Container.Retention)
	require.EqualValues(t, 0o700, got.Container.GameStateDirMode)
}
// TestLoadFromEnvRejectsUnknownPullPolicy checks that a pull policy outside
// the supported set makes LoadFromEnv fail with a pull-policy error.
func TestLoadFromEnvRejectsUnknownPullPolicy(t *testing.T) {
	validEnv(t)
	t.Setenv("RTMANAGER_IMAGE_PULL_POLICY", "weekly")

	_, loadErr := LoadFromEnv()

	require.Error(t, loadErr)
	require.Contains(t, loadErr.Error(), "image pull policy")
}
// TestLoadFromEnvRequiresGameStateRoot sets every baseline variable except
// RTMANAGER_GAME_STATE_ROOT and expects the error to name that variable.
func TestLoadFromEnvRequiresGameStateRoot(t *testing.T) {
	withoutRoot := [][2]string{
		{"RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy"},
		{"RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379"},
		{"RTMANAGER_REDIS_PASSWORD", "secret"},
		{"RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095"},
	}
	for _, pair := range withoutRoot {
		t.Setenv(pair[0], pair[1])
	}

	_, loadErr := LoadFromEnv()

	require.Error(t, loadErr)
	require.Contains(t, loadErr.Error(), "RTMANAGER_GAME_STATE_ROOT")
}
// TestLoadFromEnvRequiresLobbyBaseURL sets every baseline variable except
// RTMANAGER_LOBBY_INTERNAL_BASE_URL and expects the error to name it.
func TestLoadFromEnvRequiresLobbyBaseURL(t *testing.T) {
	withoutLobby := [][2]string{
		{"RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy"},
		{"RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379"},
		{"RTMANAGER_REDIS_PASSWORD", "secret"},
		{"RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games"},
	}
	for _, pair := range withoutLobby {
		t.Setenv(pair[0], pair[1])
	}

	_, loadErr := LoadFromEnv()

	require.Error(t, loadErr)
	require.Contains(t, loadErr.Error(), "RTMANAGER_LOBBY_INTERNAL_BASE_URL")
}
// TestLoadFromEnvRejectsRelativeStateRoot checks that a game state root
// that does not start with "/" is rejected as a non-absolute path.
func TestLoadFromEnvRejectsRelativeStateRoot(t *testing.T) {
	validEnv(t)
	t.Setenv("RTMANAGER_GAME_STATE_ROOT", "relative/path")

	_, loadErr := LoadFromEnv()

	require.Error(t, loadErr)
	require.Contains(t, loadErr.Error(), "absolute path")
}
// TestLoadFromEnvRejectsBadLogLevel checks that an unsupported log level
// fails loading with an error that names RTMANAGER_LOG_LEVEL.
func TestLoadFromEnvRejectsBadLogLevel(t *testing.T) {
	validEnv(t)
	t.Setenv("RTMANAGER_LOG_LEVEL", "verbose")

	_, loadErr := LoadFromEnv()

	require.Error(t, loadErr)
	require.Contains(t, loadErr.Error(), "RTMANAGER_LOG_LEVEL")
}
func TestImagePullPolicyValidate(t *testing.T) {
require.NoError(t, ImagePullPolicyIfMissing.Validate())
require.NoError(t, ImagePullPolicyAlways.Validate())
require.NoError(t, ImagePullPolicyNever.Validate())
require.Error(t, ImagePullPolicy("monthly").Validate())
}
// TestInternalHTTPValidateRejectsBadAddr checks that an address without a
// port is rejected with the host:port hint.
func TestInternalHTTPValidateRejectsBadAddr(t *testing.T) {
	listener := DefaultConfig().InternalHTTP
	listener.Addr = "not-an-addr"

	validateErr := listener.Validate()

	require.Error(t, validateErr)
	require.Contains(t, validateErr.Error(), "host:port")
}
func TestStreamsValidateRequiresAllNames(t *testing.T) {
cfg := DefaultConfig().Streams
cfg.StartJobs = " "
err := cfg.Validate()
require.Error(t, err)
require.True(t, strings.Contains(err.Error(), "start jobs"))
}
+319
View File
@@ -0,0 +1,319 @@
package config
import (
"fmt"
"os"
"strconv"
"strings"
"time"
"galaxy/postgres"
"galaxy/redisconn"
)
// LoadFromEnv builds Config from environment variables and validates the
// resulting configuration.
//
// It starts from DefaultConfig and overrides a field only when the matching
// variable is present in the environment. RTMANAGER_GAME_STATE_ROOT and
// RTMANAGER_LOBBY_INTERNAL_BASE_URL have no default and must be set to a
// non-blank value. Variables are processed in a fixed order, so when
// several are invalid the error reported is the one for the first variable
// in that order. On any error the zero Config is returned.
func LoadFromEnv() (Config, error) {
cfg := DefaultConfig()
var err error
// Process lifecycle and logging.
cfg.ShutdownTimeout, err = durationEnv(shutdownTimeoutEnvVar, cfg.ShutdownTimeout)
if err != nil {
return Config{}, err
}
cfg.Logging.Level = stringEnv(logLevelEnvVar, cfg.Logging.Level)
// Internal HTTP listener.
cfg.InternalHTTP.Addr = stringEnv(internalHTTPAddrEnvVar, cfg.InternalHTTP.Addr)
cfg.InternalHTTP.ReadHeaderTimeout, err = durationEnv(internalHTTPReadHeaderTimeoutEnvVar, cfg.InternalHTTP.ReadHeaderTimeout)
if err != nil {
return Config{}, err
}
cfg.InternalHTTP.ReadTimeout, err = durationEnv(internalHTTPReadTimeoutEnvVar, cfg.InternalHTTP.ReadTimeout)
if err != nil {
return Config{}, err
}
cfg.InternalHTTP.WriteTimeout, err = durationEnv(internalHTTPWriteTimeoutEnvVar, cfg.InternalHTTP.WriteTimeout)
if err != nil {
return Config{}, err
}
cfg.InternalHTTP.IdleTimeout, err = durationEnv(internalHTTPIdleTimeoutEnvVar, cfg.InternalHTTP.IdleTimeout)
if err != nil {
return Config{}, err
}
// Docker client.
cfg.Docker.Host = stringEnv(dockerHostEnvVar, cfg.Docker.Host)
cfg.Docker.APIVersion = stringEnv(dockerAPIVersionEnvVar, cfg.Docker.APIVersion)
cfg.Docker.Network = stringEnv(dockerNetworkEnvVar, cfg.Docker.Network)
cfg.Docker.LogDriver = stringEnv(dockerLogDriverEnvVar, cfg.Docker.LogDriver)
cfg.Docker.LogOpts = stringEnv(dockerLogOptsEnvVar, cfg.Docker.LogOpts)
// The raw value is stored without checking it here; an unsupported policy
// is expected to be rejected by cfg.Validate at the end of this function.
if raw, ok := os.LookupEnv(imagePullPolicyEnvVar); ok {
cfg.Docker.PullPolicy = ImagePullPolicy(strings.TrimSpace(raw))
}
// Postgres and Redis connection settings come from their own packages and
// replace the defaults wholesale.
pgConn, err := postgres.LoadFromEnv(envPrefix)
if err != nil {
return Config{}, err
}
cfg.Postgres.Conn = pgConn
redisConn, err := redisconn.LoadFromEnv(envPrefix)
if err != nil {
return Config{}, err
}
cfg.Redis.Conn = redisConn
// Redis stream names and the consumer blocking window.
cfg.Streams.StartJobs = stringEnv(startJobsStreamEnvVar, cfg.Streams.StartJobs)
cfg.Streams.StopJobs = stringEnv(stopJobsStreamEnvVar, cfg.Streams.StopJobs)
cfg.Streams.JobResults = stringEnv(jobResultsStreamEnvVar, cfg.Streams.JobResults)
cfg.Streams.HealthEvents = stringEnv(healthEventsStreamEnvVar, cfg.Streams.HealthEvents)
cfg.Streams.NotificationIntents = stringEnv(notificationIntentsStreamEnv, cfg.Streams.NotificationIntents)
cfg.Streams.BlockTimeout, err = durationEnv(streamBlockTimeoutEnvVar, cfg.Streams.BlockTimeout)
if err != nil {
return Config{}, err
}
// Per-container defaults. The stop timeout is read as whole seconds and
// the retention as whole days.
cfg.Container.DefaultCPUQuota, err = floatEnv(defaultCPUQuotaEnvVar, cfg.Container.DefaultCPUQuota)
if err != nil {
return Config{}, err
}
cfg.Container.DefaultMemory = stringEnv(defaultMemoryEnvVar, cfg.Container.DefaultMemory)
cfg.Container.DefaultPIDsLimit, err = intEnv(defaultPIDsLimitEnvVar, cfg.Container.DefaultPIDsLimit)
if err != nil {
return Config{}, err
}
cfg.Container.StopTimeout, err = secondsEnv(containerStopTimeoutSecondsEnvVar, cfg.Container.StopTimeout)
if err != nil {
return Config{}, err
}
cfg.Container.Retention, err = daysEnv(containerRetentionDaysEnvVar, cfg.Container.Retention)
if err != nil {
return Config{}, err
}
cfg.Container.EngineStateMountPath = stringEnv(engineStateMountPathEnvVar, cfg.Container.EngineStateMountPath)
cfg.Container.EngineStateEnvName = stringEnv(engineStateEnvNameEnvVar, cfg.Container.EngineStateEnvName)
cfg.Container.GameStateDirMode, err = octalUint32Env(gameStateDirModeEnvVar, cfg.Container.GameStateDirMode)
if err != nil {
return Config{}, err
}
cfg.Container.GameStateOwnerUID, err = intEnv(gameStateOwnerUIDEnvVar, cfg.Container.GameStateOwnerUID)
if err != nil {
return Config{}, err
}
cfg.Container.GameStateOwnerGID, err = intEnv(gameStateOwnerGIDEnvVar, cfg.Container.GameStateOwnerGID)
if err != nil {
return Config{}, err
}
// The game state root is mandatory: a missing or blank value fails here,
// before validation, with an error that names the variable.
root, ok := os.LookupEnv(gameStateRootEnvVar)
if !ok || strings.TrimSpace(root) == "" {
return Config{}, fmt.Errorf("%s must be set", gameStateRootEnvVar)
}
cfg.Container.GameStateRoot = strings.TrimSpace(root)
// Health-monitoring workers.
cfg.Health.InspectInterval, err = durationEnv(inspectIntervalEnvVar, cfg.Health.InspectInterval)
if err != nil {
return Config{}, err
}
cfg.Health.ProbeInterval, err = durationEnv(probeIntervalEnvVar, cfg.Health.ProbeInterval)
if err != nil {
return Config{}, err
}
cfg.Health.ProbeTimeout, err = durationEnv(probeTimeoutEnvVar, cfg.Health.ProbeTimeout)
if err != nil {
return Config{}, err
}
cfg.Health.ProbeFailuresThreshold, err = intEnv(probeFailuresThresholdEnvVar, cfg.Health.ProbeFailuresThreshold)
if err != nil {
return Config{}, err
}
// Reconciler and cleanup workers.
cfg.Cleanup.ReconcileInterval, err = durationEnv(reconcileIntervalEnvVar, cfg.Cleanup.ReconcileInterval)
if err != nil {
return Config{}, err
}
cfg.Cleanup.CleanupInterval, err = durationEnv(cleanupIntervalEnvVar, cfg.Cleanup.CleanupInterval)
if err != nil {
return Config{}, err
}
// Per-game lease, read as whole seconds.
cfg.Coordination.GameLeaseTTL, err = secondsEnv(gameLeaseTTLSecondsEnvVar, cfg.Coordination.GameLeaseTTL)
if err != nil {
return Config{}, err
}
// The Lobby base URL is mandatory, handled the same way as the game state
// root above.
lobbyURL, ok := os.LookupEnv(lobbyInternalBaseURLEnvVar)
if !ok || strings.TrimSpace(lobbyURL) == "" {
return Config{}, fmt.Errorf("%s must be set", lobbyInternalBaseURLEnvVar)
}
cfg.Lobby.BaseURL = strings.TrimSpace(lobbyURL)
cfg.Lobby.Timeout, err = durationEnv(lobbyInternalTimeoutEnvVar, cfg.Lobby.Timeout)
if err != nil {
return Config{}, err
}
// OpenTelemetry. The protocol helpers receive the signal-specific
// variable, then the generic OTLP protocol variable, then the current
// value. NOTE(review): normalizeProtocolValue is defined elsewhere;
// presumably the first non-empty argument wins - confirm.
cfg.Telemetry.ServiceName = stringEnv(otelServiceNameEnvVar, cfg.Telemetry.ServiceName)
cfg.Telemetry.TracesExporter = normalizeExporterValue(stringEnv(otelTracesExporterEnvVar, cfg.Telemetry.TracesExporter))
cfg.Telemetry.MetricsExporter = normalizeExporterValue(stringEnv(otelMetricsExporterEnvVar, cfg.Telemetry.MetricsExporter))
cfg.Telemetry.TracesProtocol = normalizeProtocolValue(
os.Getenv(otelExporterOTLPTracesProtocolEnvVar),
os.Getenv(otelExporterOTLPProtocolEnvVar),
cfg.Telemetry.TracesProtocol,
)
cfg.Telemetry.MetricsProtocol = normalizeProtocolValue(
os.Getenv(otelExporterOTLPMetricsProtocolEnvVar),
os.Getenv(otelExporterOTLPProtocolEnvVar),
cfg.Telemetry.MetricsProtocol,
)
cfg.Telemetry.StdoutTracesEnabled, err = boolEnv(otelStdoutTracesEnabledEnvVar, cfg.Telemetry.StdoutTracesEnabled)
if err != nil {
return Config{}, err
}
cfg.Telemetry.StdoutMetricsEnabled, err = boolEnv(otelStdoutMetricsEnabledEnvVar, cfg.Telemetry.StdoutMetricsEnabled)
if err != nil {
return Config{}, err
}
// Validate the assembled configuration as a whole before handing it out.
if err := cfg.Validate(); err != nil {
return Config{}, err
}
return cfg, nil
}
// stringEnv looks up the environment variable name. When the variable is
// present (even if empty) its value is returned with surrounding whitespace
// removed; otherwise fallback is returned unchanged.
func stringEnv(name string, fallback string) string {
	if raw, present := os.LookupEnv(name); present {
		return strings.TrimSpace(raw)
	}
	return fallback
}
// durationEnv reads the environment variable name as a Go duration string
// (the format accepted by time.ParseDuration). An unset variable yields
// fallback; a value that does not parse yields a zero duration and an error
// prefixed with the variable name.
func durationEnv(name string, fallback time.Duration) (time.Duration, error) {
	raw, present := os.LookupEnv(name)
	if !present {
		return fallback, nil
	}

	d, parseErr := time.ParseDuration(strings.TrimSpace(raw))
	if parseErr == nil {
		return d, nil
	}
	return 0, fmt.Errorf("%s: parse duration: %w", name, parseErr)
}
// secondsEnv reads the environment variable name as a whole number of
// seconds and converts it to a time.Duration.
//
// An unset variable yields fallback. The value must parse as a base-10
// integer, be strictly positive, and be small enough that the resulting
// duration fits in an int64 nanosecond count; otherwise a zero duration and
// an error prefixed with the variable name are returned.
func secondsEnv(name string, fallback time.Duration) (time.Duration, error) {
	// maxSeconds is the largest second count representable as a
	// time.Duration. Multiplying anything larger by time.Second would wrap
	// silently and could produce a negative or arbitrary duration.
	const maxSeconds = int64(1<<63-1) / int64(time.Second)

	value, ok := os.LookupEnv(name)
	if !ok {
		return fallback, nil
	}
	parsed, err := strconv.Atoi(strings.TrimSpace(value))
	if err != nil {
		return 0, fmt.Errorf("%s: parse seconds: %w", name, err)
	}
	if parsed <= 0 {
		return 0, fmt.Errorf("%s: must be positive", name)
	}
	if int64(parsed) > maxSeconds {
		return 0, fmt.Errorf("%s: must not exceed %d seconds", name, maxSeconds)
	}
	return time.Duration(parsed) * time.Second, nil
}
// daysEnv reads the environment variable name as a whole number of days and
// converts it to a time.Duration using 24 hours per day.
//
// An unset variable yields fallback. The value must parse as a base-10
// integer, be strictly positive, and be small enough that the resulting
// duration fits in an int64 nanosecond count; otherwise a zero duration and
// an error prefixed with the variable name are returned.
func daysEnv(name string, fallback time.Duration) (time.Duration, error) {
	// maxDays is the largest day count representable as a time.Duration.
	// Multiplying anything larger by 24h would wrap silently and could
	// produce a negative or arbitrary duration.
	const maxDays = int64(1<<63-1) / int64(24*time.Hour)

	value, ok := os.LookupEnv(name)
	if !ok {
		return fallback, nil
	}
	parsed, err := strconv.Atoi(strings.TrimSpace(value))
	if err != nil {
		return 0, fmt.Errorf("%s: parse days: %w", name, err)
	}
	if parsed <= 0 {
		return 0, fmt.Errorf("%s: must be positive", name)
	}
	if int64(parsed) > maxDays {
		return 0, fmt.Errorf("%s: must not exceed %d days", name, maxDays)
	}
	return time.Duration(parsed) * 24 * time.Hour, nil
}
// intEnv reads the environment variable name as a base-10 integer. An unset
// variable yields fallback; a value that does not parse yields 0 and an
// error prefixed with the variable name. Surrounding whitespace is ignored.
func intEnv(name string, fallback int) (int, error) {
	raw, present := os.LookupEnv(name)
	if !present {
		return fallback, nil
	}

	n, convErr := strconv.Atoi(strings.TrimSpace(raw))
	if convErr == nil {
		return n, nil
	}
	return 0, fmt.Errorf("%s: parse int: %w", name, convErr)
}
// floatEnv reads the environment variable name as a 64-bit floating-point
// number. An unset variable yields fallback; a value that does not parse
// yields 0 and an error prefixed with the variable name. Surrounding
// whitespace is ignored.
func floatEnv(name string, fallback float64) (float64, error) {
	raw, present := os.LookupEnv(name)
	if !present {
		return fallback, nil
	}

	f, convErr := strconv.ParseFloat(strings.TrimSpace(raw), 64)
	if convErr == nil {
		return f, nil
	}
	return 0, fmt.Errorf("%s: parse float: %w", name, convErr)
}
// boolEnv reads the environment variable name as a boolean using the
// spellings accepted by strconv.ParseBool. An unset variable yields
// fallback; a value that does not parse yields false and an error prefixed
// with the variable name. Surrounding whitespace is ignored.
func boolEnv(name string, fallback bool) (bool, error) {
	raw, present := os.LookupEnv(name)
	if !present {
		return fallback, nil
	}

	flag, convErr := strconv.ParseBool(strings.TrimSpace(raw))
	if convErr == nil {
		return flag, nil
	}
	return false, fmt.Errorf("%s: parse bool: %w", name, convErr)
}
// octalUint32Env reads the environment variable name as a base-8 number that
// must fit in 32 unsigned bits. An unset variable yields fallback; a value
// that does not parse or is out of range yields 0 and an error prefixed with
// the variable name. Surrounding whitespace is ignored.
func octalUint32Env(name string, fallback uint32) (uint32, error) {
	raw, present := os.LookupEnv(name)
	if !present {
		return fallback, nil
	}

	bits, convErr := strconv.ParseUint(strings.TrimSpace(raw), 8, 32)
	if convErr == nil {
		return uint32(bits), nil
	}
	return 0, fmt.Errorf("%s: parse octal: %w", name, convErr)
}
// normalizeExporterValue trims surrounding whitespace from value and maps a
// blank result to "none". Any other value is returned trimmed, with its
// letter case untouched.
func normalizeExporterValue(value string) string {
	if exporter := strings.TrimSpace(value); exporter != "" {
		return exporter
	}
	return "none"
}
// normalizeProtocolValue picks the first of primary and fallback that is
// non-blank after trimming whitespace and returns it trimmed. When both are
// blank it returns defaultValue trimmed, which may itself be empty.
func normalizeProtocolValue(primary string, fallback string, defaultValue string) string {
	for _, candidate := range [...]string{primary, fallback} {
		if protocol := strings.TrimSpace(candidate); protocol != "" {
			return protocol
		}
	}
	return strings.TrimSpace(defaultValue)
}
+93
View File
@@ -0,0 +1,93 @@
package config
import (
"fmt"
"log/slog"
"net"
"net/url"
"strings"
)
// Validate checks that cfg describes a usable Runtime Manager process
// configuration. The shutdown timeout and log level are checked first, then
// every configuration section is validated in a fixed order. The first
// failure is returned; nil means every check passed.
func (cfg Config) Validate() error {
	if cfg.ShutdownTimeout <= 0 {
		return fmt.Errorf("%s must be positive", shutdownTimeoutEnvVar)
	}
	if levelErr := validateSlogLevel(cfg.Logging.Level); levelErr != nil {
		return fmt.Errorf("%s: %w", logLevelEnvVar, levelErr)
	}

	// Section validators run in declaration order so the reported error is
	// always the first failing section.
	sections := []func() error{
		cfg.InternalHTTP.Validate,
		cfg.Docker.Validate,
		cfg.Postgres.Validate,
		cfg.Redis.Validate,
		cfg.Streams.Validate,
		cfg.Container.Validate,
		cfg.Health.Validate,
		cfg.Cleanup.Validate,
		cfg.Coordination.Validate,
		cfg.Lobby.Validate,
		cfg.Telemetry.Validate,
	}
	for _, validate := range sections {
		if sectionErr := validate(); sectionErr != nil {
			return sectionErr
		}
	}
	return nil
}
// validateSlogLevel reports whether level, after trimming surrounding
// whitespace, is text that slog.Level can unmarshal. On failure the returned
// error quotes the original, untrimmed level and wraps the slog error.
func validateSlogLevel(level string) error {
	parsed := new(slog.Level)
	unmarshalErr := parsed.UnmarshalText([]byte(strings.TrimSpace(level)))
	if unmarshalErr == nil {
		return nil
	}
	return fmt.Errorf("invalid slog level %q: %w", level, unmarshalErr)
}
// isTCPAddr reports whether value, after trimming surrounding whitespace,
// splits into host and port via net.SplitHostPort with a non-empty port and
// a host that is either empty or free of space characters. The port is not
// checked for being numeric.
func isTCPAddr(value string) bool {
	host, port, splitErr := net.SplitHostPort(strings.TrimSpace(value))
	switch {
	case splitErr != nil, port == "":
		return false
	case host == "":
		return true
	default:
		return !strings.Contains(host, " ")
	}
}
// isHTTPURL reports whether value, after trimming surrounding whitespace,
// parses as a URL whose scheme is "http" or "https" and whose host part is
// non-empty.
func isHTTPURL(value string) bool {
	u, parseErr := url.Parse(strings.TrimSpace(value))
	if parseErr != nil || u.Host == "" {
		return false
	}
	switch u.Scheme {
	case "http", "https":
		return true
	}
	return false
}