feat: runtime manager
This commit is contained in:
@@ -0,0 +1,336 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PullPolicy names one of the supported image pull policies.
//
// Its value set mirrors `config.ImagePullPolicy`. Translation between
// the two happens in the runtime/wiring layer, which keeps the docker
// adapter from importing `internal/config` and keeps this port package
// free of configuration concerns.
type PullPolicy string
|
||||
|
||||
// Supported pull policies, frozen by `rtmanager/README.md §Configuration`.
|
||||
const (
|
||||
// PullPolicyIfMissing pulls the image only when it is absent from
|
||||
// the local Docker daemon.
|
||||
PullPolicyIfMissing PullPolicy = "if_missing"
|
||||
|
||||
// PullPolicyAlways pulls the image on every start.
|
||||
PullPolicyAlways PullPolicy = "always"
|
||||
|
||||
// PullPolicyNever skips the pull and fails the start when the image
|
||||
// is absent.
|
||||
PullPolicyNever PullPolicy = "never"
|
||||
)
|
||||
|
||||
// IsKnown reports whether policy belongs to the frozen pull-policy
|
||||
// vocabulary.
|
||||
func (policy PullPolicy) IsKnown() bool {
|
||||
switch policy {
|
||||
case PullPolicyIfMissing, PullPolicyAlways, PullPolicyNever:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/docker/mocks/mock_dockerclient.go -package=mocks galaxy/rtmanager/internal/ports DockerClient
|
||||
|
||||
// DockerClient is the narrow Docker port Runtime Manager uses. The
|
||||
// production adapter wraps `github.com/docker/docker/client`; service
|
||||
// tests use a generated mock. The surface intentionally exposes only
|
||||
// the operations RTM needs; `docker logs` and stream attach are out
|
||||
// of scope for v1.
|
||||
type DockerClient interface {
|
||||
// EnsureNetwork verifies the configured Docker network is present
|
||||
// on the daemon. It returns ErrNetworkMissing when the network does
|
||||
// not exist; RTM never creates networks itself.
|
||||
EnsureNetwork(ctx context.Context, name string) error
|
||||
|
||||
// PullImage pulls ref according to policy. It returns nil on
|
||||
// success and a wrapped Docker error otherwise. Implementations
|
||||
// honour PullPolicyNever by skipping the pull and returning nil
|
||||
// when the image is already present, or returning ErrImageNotFound
|
||||
// otherwise.
|
||||
PullImage(ctx context.Context, ref string, policy PullPolicy) error
|
||||
|
||||
// InspectImage returns image metadata for ref. It returns
|
||||
// ErrImageNotFound when no such image exists locally.
|
||||
InspectImage(ctx context.Context, ref string) (ImageInspect, error)
|
||||
|
||||
// InspectContainer returns container metadata for containerID. It
|
||||
// returns ErrContainerNotFound when no such container exists.
|
||||
InspectContainer(ctx context.Context, containerID string) (ContainerInspect, error)
|
||||
|
||||
// Run creates and starts one container according to spec. The
|
||||
// returned RunResult carries the assigned container id, the stable
|
||||
// engine endpoint, and the wall-clock observed by the daemon.
|
||||
Run(ctx context.Context, spec RunSpec) (RunResult, error)
|
||||
|
||||
// Stop sends SIGTERM to the container followed by SIGKILL after
|
||||
// timeout. It returns nil when the container exited cleanly and
|
||||
// ErrContainerNotFound when it is already gone.
|
||||
Stop(ctx context.Context, containerID string, timeout time.Duration) error
|
||||
|
||||
// Remove removes the container. It returns nil when the container
|
||||
// no longer exists (idempotent removal).
|
||||
Remove(ctx context.Context, containerID string) error
|
||||
|
||||
// List returns container summaries that match filter. Implementations
|
||||
// translate ListFilter into the appropriate Docker filters argument.
|
||||
List(ctx context.Context, filter ListFilter) ([]ContainerSummary, error)
|
||||
|
||||
// EventsListen subscribes to the Docker events stream and returns
|
||||
// the decoded event channel together with an asynchronous error
|
||||
// channel. The caller cancels ctx to terminate the subscription.
|
||||
// Implementations close events when the subscription terminates.
|
||||
EventsListen(ctx context.Context) (events <-chan DockerEvent, errs <-chan error, err error)
|
||||
}
|
||||
|
||||
// RunSpec stores the request shape used by DockerClient.Run.
|
||||
type RunSpec struct {
|
||||
// Name stores the container name (typically `galaxy-game-{game_id}`).
|
||||
Name string
|
||||
|
||||
// Image stores the image reference resolved by the producer.
|
||||
Image string
|
||||
|
||||
// Hostname stores the container hostname assigned for the embedded
|
||||
// Docker DNS to resolve from other containers on the network.
|
||||
Hostname string
|
||||
|
||||
// Network stores the user-defined Docker network the container
|
||||
// attaches to.
|
||||
Network string
|
||||
|
||||
// Env stores the environment variables forwarded to the container
|
||||
// (e.g. GAME_STATE_PATH, STORAGE_PATH).
|
||||
Env map[string]string
|
||||
|
||||
// Cmd overrides the entrypoint arguments for the container. Production
|
||||
// callers leave it nil so the engine image's own CMD runs; tests use
|
||||
// it to drive a tiny container that does not embed RTM-specific
|
||||
// behaviour. Empty Cmd means "use image default", which mirrors the
|
||||
// Docker SDK contract.
|
||||
Cmd []string
|
||||
|
||||
// Labels stores the labels applied to the container so the
|
||||
// reconciler and the events listener can identify it.
|
||||
Labels map[string]string
|
||||
|
||||
// BindMounts stores the host-to-container bind mounts. RTM uses
|
||||
// exactly one mount in v1 (the per-game state directory).
|
||||
BindMounts []BindMount
|
||||
|
||||
// LogDriver stores the Docker logging driver name.
|
||||
LogDriver string
|
||||
|
||||
// LogOpts stores the logging-driver options as key=value pairs.
|
||||
LogOpts map[string]string
|
||||
|
||||
// CPUQuota stores the `--cpus` value applied as a resource limit.
|
||||
CPUQuota float64
|
||||
|
||||
// Memory stores the `--memory` value (e.g. `512m`) applied as a
|
||||
// resource limit.
|
||||
Memory string
|
||||
|
||||
// PIDsLimit stores the `--pids-limit` value.
|
||||
PIDsLimit int
|
||||
}
|
||||
|
||||
// BindMount describes a single host-to-container bind mount.
type BindMount struct {
	// HostPath is the absolute path on the host that is bound into the
	// container.
	HostPath string

	// MountPath is the absolute path inside the container at which the
	// host directory is mounted.
	MountPath string

	// ReadOnly, when true, mounts the host path read-only.
	ReadOnly bool
}
|
||||
|
||||
// RunResult is the response returned by DockerClient.Run.
type RunResult struct {
	// ContainerID is the id of the container that was created.
	ContainerID string

	// EngineEndpoint is the stable URL through which Game Master
	// reaches the engine container.
	EngineEndpoint string

	// StartedAt is the wall-clock the daemon observed for the start
	// event.
	StartedAt time.Time
}
|
||||
|
||||
// ImageInspect holds the subset of `docker image inspect` fields read
// by RTM. At start time only Labels is required, because resource
// limits live there; the other fields may be filled in when that is
// convenient for diagnostics.
type ImageInspect struct {
	// Ref is the image reference the inspection was scoped to.
	Ref string

	// Labels holds the image-level labels (e.g.
	// `com.galaxy.cpu_quota`).
	Labels map[string]string
}
|
||||
|
||||
// ContainerInspect holds the subset of `docker inspect` fields RTM
// reads from a container that is running or has exited.
type ContainerInspect struct {
	// ID is the container id.
	ID string

	// ImageRef is the image reference the container was started from.
	ImageRef string

	// Hostname is the container hostname.
	Hostname string

	// Labels holds the container labels assigned at create time.
	Labels map[string]string

	// Status is the Docker `State.Status` value, verbatim (e.g.
	// `running`, `exited`).
	Status string

	// Health is the Docker `State.Health.Status` value, verbatim (e.g.
	// `healthy`, `unhealthy`). It is empty when the image declares no
	// HEALTHCHECK.
	Health string

	// RestartCount is the Docker `RestartCount` as observed at
	// inspection time.
	RestartCount int

	// StartedAt is the start wall-clock observed by the daemon.
	StartedAt time.Time

	// FinishedAt is the exit wall-clock observed by the daemon. It is
	// zero while the container is still running.
	FinishedAt time.Time

	// ExitCode is the exit code reported by the daemon. It is zero
	// while the container is still running.
	ExitCode int

	// OOMKilled reports whether the OOM killer terminated the
	// container.
	OOMKilled bool
}
|
||||
|
||||
// ContainerSummary holds the subset of `docker ps` fields read by RTM.
type ContainerSummary struct {
	// ID is the container id.
	ID string

	// ImageRef is the image reference.
	ImageRef string

	// Hostname is the container hostname.
	Hostname string

	// Labels holds the container labels assigned at create time.
	Labels map[string]string

	// Status is the Docker `State.Status` value, verbatim.
	Status string

	// StartedAt is the start wall-clock observed by the daemon.
	StartedAt time.Time
}
|
||||
|
||||
// ListFilter holds the selection criteria for DockerClient.List.
type ListFilter struct {
	// Labels holds label key=value pairs, all of which must be present
	// on a container for it to match. An empty map matches every
	// container.
	Labels map[string]string
}
|
||||
|
||||
// DockerEvent is one decoded entry of the Docker events stream. RTM
// consumes container-scoped events only.
type DockerEvent struct {
	// Action is the Docker event action, verbatim (e.g. `start`,
	// `die`, `oom`, `destroy`).
	Action string

	// ContainerID is the id of the container the event refers to.
	ContainerID string

	// Labels holds the container labels carried by the event
	// attributes, when present.
	Labels map[string]string

	// ExitCode is the exit code attribute where applicable (e.g. on
	// `die` events). It is zero when the action does not carry one.
	ExitCode int

	// OccurredAt is the event wall-clock observed by the daemon.
	OccurredAt time.Time
}
|
||||
|
||||
// String returns policy as its stored enum value. Convenient for use in
|
||||
// log fields and error messages.
|
||||
func (policy PullPolicy) String() string {
|
||||
return string(policy)
|
||||
}
|
||||
|
||||
// Sentinel errors returned through the DockerClient port.
var (
	// ErrNetworkMissing signals that the daemon does not have the
	// configured Docker network.
	ErrNetworkMissing = errors.New("docker network missing")

	// ErrImageNotFound signals that an image reference does not resolve
	// to a local Docker image.
	ErrImageNotFound = errors.New("docker image not found")

	// ErrContainerNotFound signals that a container id does not resolve
	// to a Docker container.
	ErrContainerNotFound = errors.New("docker container not found")
)
|
||||
|
||||
// Validate reports whether spec carries the structural invariants
|
||||
// required by DockerClient.Run. Adapters use it as the first defence
|
||||
// against malformed specs originating in service code.
|
||||
func (spec RunSpec) Validate() error {
|
||||
if spec.Name == "" {
|
||||
return fmt.Errorf("run spec: name must not be empty")
|
||||
}
|
||||
if spec.Image == "" {
|
||||
return fmt.Errorf("run spec: image must not be empty")
|
||||
}
|
||||
if spec.Hostname == "" {
|
||||
return fmt.Errorf("run spec: hostname must not be empty")
|
||||
}
|
||||
if spec.Network == "" {
|
||||
return fmt.Errorf("run spec: network must not be empty")
|
||||
}
|
||||
if spec.LogDriver == "" {
|
||||
return fmt.Errorf("run spec: log driver must not be empty")
|
||||
}
|
||||
if spec.CPUQuota <= 0 {
|
||||
return fmt.Errorf("run spec: cpu quota must be positive")
|
||||
}
|
||||
if spec.Memory == "" {
|
||||
return fmt.Errorf("run spec: memory must not be empty")
|
||||
}
|
||||
if spec.PIDsLimit <= 0 {
|
||||
return fmt.Errorf("run spec: pids limit must be positive")
|
||||
}
|
||||
for index, mount := range spec.BindMounts {
|
||||
if mount.HostPath == "" {
|
||||
return fmt.Errorf("run spec: bind mounts[%d]: host path must not be empty", index)
|
||||
}
|
||||
if mount.MountPath == "" {
|
||||
return fmt.Errorf("run spec: bind mounts[%d]: mount path must not be empty", index)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GameLeaseStore guards every lifecycle operation Runtime Manager runs
// against a single game. Starts, stops, restarts, patches, and cleanup
// operations on the same `game_id` are serialised by the lease across
// all entry points (Lobby stream consumer, GM REST handler, Admin REST
// handler, periodic workers), so concurrent operations cannot corrupt
// each other's intermediate Docker / PostgreSQL state.
//
// The lease is a per-game key holding a random token. Adapters acquire
// it with SETNX plus a PX TTL and release it with a compare-and-delete,
// so a publisher that has lost the lease (TTL expiry, replica swap)
// cannot clear another caller's claim.
//
// In v1 the lease is not renewed mid-operation: callers must keep the
// total operation duration below the configured TTL
// (`RTMANAGER_GAME_LEASE_TTL_SECONDS`, default 60s). Multi-GB image
// pulls can exceed that in production; this is a known limitation, and
// later stages may introduce a renewal helper if it bites.
type GameLeaseStore interface {
	// TryAcquire tries to take the per-game lease for gameID on behalf
	// of token, valid for ttl. The boolean result is true when the
	// lease was acquired and false when another holder still owns it.
	// A non-nil error signals a transport-level failure (Redis
	// unreachable, network timeout) and must not be mistaken for a
	// missed lease.
	TryAcquire(ctx context.Context, gameID, token string, ttl time.Duration) (acquired bool, err error)

	// Release deletes the per-game lease for gameID, but only while
	// token still matches the stored owner value. Releasing a lease the
	// caller no longer owns is a silent no-op, so a TTL-driven release
	// race never clears another caller's claim.
	Release(ctx context.Context, gameID, token string) error
}
|
||||
@@ -0,0 +1,81 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
)
|
||||
|
||||
// HealthEventPublisher emits one entry on the `runtime:health_events`
|
||||
// Redis Stream and updates `health_snapshots` with the latest observation
|
||||
// for the affected game. Adapters publish and snapshot in one call so
|
||||
// every emission durably advances both surfaces; partial publishes (event
|
||||
// without snapshot, or vice versa) are not allowed.
|
||||
//
|
||||
// The start service emits `container_started` through this port; the
|
||||
// periodic Docker inspect, the active probe, and the Docker events
|
||||
// listener publish the rest of the event types through the same port
|
||||
// without changing its surface.
|
||||
type HealthEventPublisher interface {
|
||||
// Publish records envelope on the configured `runtime:health_events`
|
||||
// stream and upserts the matching `health_snapshots` row. A non-nil
|
||||
// error reports a transport or storage failure; the caller treats it
|
||||
// as a degraded emission per `rtmanager/README.md §Notification
|
||||
// Contracts` (the underlying business state is the source of truth,
|
||||
// not the event stream).
|
||||
Publish(ctx context.Context, envelope HealthEventEnvelope) error
|
||||
}
|
||||
|
||||
// HealthEventEnvelope carries the payload published on
|
||||
// `runtime:health_events`. The fields mirror the AsyncAPI schema frozen
|
||||
// in `rtmanager/api/runtime-health-asyncapi.yaml`; adapters serialise
|
||||
// every field verbatim so consumers see the contracted shape.
|
||||
type HealthEventEnvelope struct {
|
||||
// GameID identifies the platform game the event refers to.
|
||||
GameID string
|
||||
|
||||
// ContainerID identifies the Docker container observed by the event
|
||||
// source. May differ from the record's current container id after a
|
||||
// restart race; consumers are expected to treat the value as the
|
||||
// observation's container, not the record's.
|
||||
ContainerID string
|
||||
|
||||
// EventType classifies the event per the frozen vocabulary in
|
||||
// `galaxy/rtmanager/internal/domain/health.EventType`.
|
||||
EventType health.EventType
|
||||
|
||||
// OccurredAt stores the wall-clock at which Runtime Manager observed
|
||||
// the event. Adapters convert it to UTC milliseconds for the wire
|
||||
// payload (`occurred_at_ms`).
|
||||
OccurredAt time.Time
|
||||
|
||||
// Details stores the event-type-specific JSON payload. Adapters
|
||||
// persist and stream it verbatim; nil and empty values are treated as
|
||||
// the canonical empty-object payload.
|
||||
Details json.RawMessage
|
||||
}
|
||||
|
||||
// Validate reports whether envelope satisfies the structural invariants
|
||||
// implied by the AsyncAPI schema.
|
||||
func (envelope HealthEventEnvelope) Validate() error {
|
||||
if strings.TrimSpace(envelope.GameID) == "" {
|
||||
return fmt.Errorf("health event envelope: game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(envelope.ContainerID) == "" {
|
||||
return fmt.Errorf("health event envelope: container id must not be empty")
|
||||
}
|
||||
if !envelope.EventType.IsKnown() {
|
||||
return fmt.Errorf("health event envelope: event type %q is unsupported", envelope.EventType)
|
||||
}
|
||||
if envelope.OccurredAt.IsZero() {
|
||||
return fmt.Errorf("health event envelope: occurred at must not be zero")
|
||||
}
|
||||
if len(envelope.Details) > 0 && !json.Valid(envelope.Details) {
|
||||
return fmt.Errorf("health event envelope: details must be valid JSON when non-empty")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
)
|
||||
|
||||
// HealthSnapshotStore stores the latest technical-health observation per
|
||||
// game. Adapters keep one row per game_id; later observations overwrite.
|
||||
type HealthSnapshotStore interface {
|
||||
// Upsert installs snapshot as the latest observation for
|
||||
// snapshot.GameID. Adapters validate snapshot through
|
||||
// health.HealthSnapshot.Validate before touching the store.
|
||||
Upsert(ctx context.Context, snapshot health.HealthSnapshot) error
|
||||
|
||||
// Get returns the latest snapshot for gameID. It returns
|
||||
// runtime.ErrNotFound (declared in
|
||||
// `galaxy/rtmanager/internal/domain/runtime`) when no snapshot has
|
||||
// been recorded yet.
|
||||
Get(ctx context.Context, gameID string) (health.HealthSnapshot, error)
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// JobResultPublisher emits one entry on the `runtime:job_results` Redis
|
||||
// Stream per finalised start or stop runtime job. Adapters serialise
|
||||
// every JobResult field verbatim so consumers (Game Lobby's
|
||||
// runtime-job-result worker today, future services tomorrow) see the
|
||||
// AsyncAPI shape frozen in `rtmanager/api/runtime-jobs-asyncapi.yaml`.
|
||||
//
|
||||
// The start-jobs and stop-jobs consumers publish through this port.
|
||||
// The synchronous REST handlers do not — REST callers receive the same
|
||||
// `Result` shape directly from the service layer.
|
||||
type JobResultPublisher interface {
|
||||
// Publish records result on the configured `runtime:job_results`
|
||||
// stream. A non-nil error reports a transport or serialisation
|
||||
// failure; the caller treats the failure as a degraded emission
|
||||
// (the operation_log already records the durable outcome).
|
||||
Publish(ctx context.Context, result JobResult) error
|
||||
}
|
||||
|
||||
// Outcome values of a JobResult, frozen by the
// `RuntimeJobResultPayload.outcome` enum.
const (
	// JobOutcomeSuccess denotes a successful start or stop. The
	// idempotent replay variant (`error_code=replay_no_op`) also counts
	// as success.
	JobOutcomeSuccess = "success"

	// JobOutcomeFailure denotes a stable failure, for which the payload
	// carries a non-empty `error_code`.
	JobOutcomeFailure = "failure"
)
|
||||
|
||||
// JobResult is the wire payload published on `runtime:job_results`. Its
// fields mirror the AsyncAPI schema frozen in
// `rtmanager/api/runtime-jobs-asyncapi.yaml`, and adapters serialise
// each of them verbatim so that consumers see the contracted shape.
//
// The contract requires every field of this struct, so each one is
// always present in the wire entry — even when its string value is
// empty, which is allowed for `container_id` / `engine_endpoint` /
// `error_code` / `error_message` on the appropriate variants.
type JobResult struct {
	// GameID is the platform game the job acted on. Required.
	GameID string

	// Outcome is the high-level outcome. It must be `success` or
	// `failure`; use the JobOutcome* constants.
	Outcome string

	// ContainerID is the Docker container id. It is populated on
	// `success` for fresh starts and replays. It is empty on `failure`
	// and on `success/replay_no_op` for stop jobs that observed a
	// removed record.
	ContainerID string

	// EngineEndpoint is the stable engine URL
	// `http://galaxy-game-{game_id}:8080`. It is populated together
	// with ContainerID and empty in the same cases.
	EngineEndpoint string

	// ErrorCode is the stable error code from
	// `rtmanager/README.md §Error Model`: empty for fresh successes,
	// `replay_no_op` for idempotent replays, and one of the failure
	// codes otherwise.
	ErrorCode string

	// ErrorMessage is the operator-readable detail. It is empty for
	// successes and populated together with ErrorCode on failure.
	ErrorMessage string
}
|
||||
|
||||
// Validate reports whether result satisfies the structural invariants
|
||||
// implied by the AsyncAPI schema: a non-empty game id and one of the
|
||||
// two known outcome values. The remaining fields are required to be
|
||||
// present on the wire but may be empty strings, so Validate does not
|
||||
// constrain them.
|
||||
func (result JobResult) Validate() error {
|
||||
if strings.TrimSpace(result.GameID) == "" {
|
||||
return fmt.Errorf("job result: game id must not be empty")
|
||||
}
|
||||
switch result.Outcome {
|
||||
case JobOutcomeSuccess, JobOutcomeFailure:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("job result: outcome %q is unsupported", result.Outcome)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
)
|
||||
|
||||
// LobbyInternalClient is the synchronous trusted-REST port Runtime
|
||||
// Manager uses to read ancillary game metadata from Game Lobby. Stage
|
||||
// 13 calls GetGame purely for diagnostic context; the start envelope
|
||||
// already carries the only required field (`image_ref`) so a
|
||||
// LobbyInternalClient failure must not abort the start operation.
|
||||
type LobbyInternalClient interface {
|
||||
// GetGame returns the Lobby game record for gameID. It returns
|
||||
// ErrLobbyGameNotFound when no record exists and ErrLobbyUnavailable
|
||||
// for transport / timeout / non-2xx responses.
|
||||
GetGame(ctx context.Context, gameID string) (LobbyGameRecord, error)
|
||||
}
|
||||
|
||||
// LobbyGameRecord holds the subset of the Lobby `GameRecord` schema
// that Runtime Manager uses. It is intentionally minimal: the fetch is
// ancillary diagnostics and v1 has no required field. Fields may be
// added later without breaking existing callers.
type LobbyGameRecord struct {
	// GameID is the platform game.
	GameID string

	// Status is the Lobby status string, verbatim (e.g. `starting`,
	// `running`, `paused`). Runtime Manager does not interpret it; the
	// value is exposed only for log enrichment and diagnostics.
	Status string

	// TargetEngineVersion is the semver of the engine version that
	// Lobby resolved into the start envelope's image_ref. It is empty
	// when Lobby did not return one.
	TargetEngineVersion string
}
|
||||
|
||||
// Sentinel errors returned through the LobbyInternalClient port.
var (
	// ErrLobbyGameNotFound signals that the Lobby internal API answered
	// 404 for the requested game id.
	ErrLobbyGameNotFound = errors.New("lobby game not found")

	// ErrLobbyUnavailable signals that the Lobby internal API could not
	// be reached (transport error, timeout, non-2xx response). Callers
	// must treat the failure as recoverable: when the call is purely
	// diagnostic, Runtime Manager continues the operation.
	ErrLobbyUnavailable = errors.New("lobby internal api unavailable")
)
|
||||
@@ -0,0 +1,25 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
)
|
||||
|
||||
// NotificationIntentPublisher is the producer port Runtime Manager uses
|
||||
// to publish admin-only notification intents to Notification Service.
|
||||
// The production adapter is a thin wrapper around
|
||||
// `notificationintent.Publisher`; the wrapper drops the entry id
|
||||
// returned by the underlying publisher because Runtime Manager does
|
||||
// not track per-intent ids in v1.
|
||||
//
|
||||
// A failed Publish call is a notification degradation per
|
||||
// `galaxy/rtmanager/README.md §Notification Contracts` and must not roll
|
||||
// back already committed business state. Callers log the error and
|
||||
// proceed.
|
||||
type NotificationIntentPublisher interface {
|
||||
// Publish normalises intent and appends it to the configured Redis
|
||||
// Stream. Validation failures and transport errors are returned
|
||||
// verbatim.
|
||||
Publish(ctx context.Context, intent notificationintent.Intent) error
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
)
|
||||
|
||||
// OperationLogStore stores append-only audit entries for every
|
||||
// lifecycle operation Runtime Manager performed against a game's
|
||||
// runtime. Adapters must persist entry verbatim and return the
|
||||
// generated bigserial id from Append.
|
||||
type OperationLogStore interface {
|
||||
// Append inserts entry into the operation log and returns the
|
||||
// generated bigserial id. Adapters validate entry through
|
||||
// operation.OperationEntry.Validate before touching the store.
|
||||
Append(ctx context.Context, entry operation.OperationEntry) (id int64, err error)
|
||||
|
||||
// ListByGame returns the most recent entries for gameID, ordered by
|
||||
// started_at descending and capped by limit. A non-positive limit
|
||||
// is rejected as invalid input by adapters.
|
||||
ListByGame(ctx context.Context, gameID string, limit int) ([]operation.OperationEntry, error)
|
||||
}
|
||||
@@ -0,0 +1,112 @@
|
||||
// Package ports defines the stable interfaces that connect Runtime
|
||||
// Manager use cases to external state and external services.
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
)
|
||||
|
||||
// RuntimeRecordStore stores runtime records and exposes the operations
|
||||
// used by the service layer (Stages 13+) and the workers (Stages 15-18).
|
||||
// Adapters must preserve domain semantics:
|
||||
//
|
||||
// - Get returns runtime.ErrNotFound when no record exists for gameID.
|
||||
// - Upsert installs a record verbatim; the caller is responsible for
|
||||
// domain validation through runtime.RuntimeRecord.Validate.
|
||||
// - UpdateStatus applies one transition through a compare-and-swap
|
||||
// guard on (status, current_container_id) and returns
|
||||
// runtime.ErrConflict on a stale CAS.
|
||||
// - List returns every record currently stored, regardless of status.
|
||||
// - ListByStatus returns every record currently indexed under status.
|
||||
type RuntimeRecordStore interface {
|
||||
// Get returns the record identified by gameID. It returns
|
||||
// runtime.ErrNotFound when no record exists.
|
||||
Get(ctx context.Context, gameID string) (runtime.RuntimeRecord, error)
|
||||
|
||||
// Upsert inserts record when no row exists for record.GameID and
|
||||
// otherwise overwrites every column verbatim. The start service uses
|
||||
// Upsert to install fresh records on start, the inner start of
|
||||
// restart and patch, and the reconcile_adopt path.
|
||||
Upsert(ctx context.Context, record runtime.RuntimeRecord) error
|
||||
|
||||
// UpdateStatus applies one status transition in a compare-and-swap
|
||||
// fashion. The adapter must first call runtime.Transition to reject
|
||||
// invalid pairs without touching the store, then verify that the
|
||||
// stored status equals input.ExpectedFrom, and (when
|
||||
// input.ExpectedContainerID is non-empty) that the stored
|
||||
// current_container_id equals it. The adapter derives stopped_at /
|
||||
// removed_at and updates last_op_at from input.Now per the
|
||||
// destination status.
|
||||
UpdateStatus(ctx context.Context, input UpdateStatusInput) error
|
||||
|
||||
// List returns every runtime record currently stored. Used by the
|
||||
// internal REST list endpoint; the v1 working set is bounded by the
|
||||
// games tracked by Lobby and is small enough to return in one
|
||||
// response (pagination is not supported). The order is
|
||||
// adapter-defined; callers may reorder as needed.
|
||||
List(ctx context.Context) ([]runtime.RuntimeRecord, error)
|
||||
|
||||
// ListByStatus returns every record currently indexed under status.
|
||||
// The order is adapter-defined; callers may reorder as needed.
|
||||
ListByStatus(ctx context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error)
|
||||
}
|
||||
|
||||
// UpdateStatusInput stores the arguments required to apply one status
|
||||
// transition through a RuntimeRecordStore. The adapter is responsible
|
||||
// for translating the destination status into the matching column
|
||||
// updates (stopped_at / removed_at / current_container_id NULLing) and
|
||||
// for the CAS guard.
|
||||
type UpdateStatusInput struct {
|
||||
// GameID identifies the record to mutate.
|
||||
GameID string
|
||||
|
||||
// ExpectedFrom stores the status the caller believes the record
|
||||
// currently has. A mismatch results in runtime.ErrConflict.
|
||||
ExpectedFrom runtime.Status
|
||||
|
||||
// ExpectedContainerID is an optional CAS guard. When non-empty, the
|
||||
// adapter rejects the update with runtime.ErrConflict if the stored
|
||||
// current_container_id does not equal it. Used by stop / cleanup /
|
||||
// reconcile to protect against concurrent restart races. Empty
|
||||
// disables the container-id CAS while keeping the status CAS.
|
||||
ExpectedContainerID string
|
||||
|
||||
// To stores the destination status.
|
||||
To runtime.Status
|
||||
|
||||
// Now stores the wall-clock used to derive stopped_at / removed_at
|
||||
// and last_op_at depending on To.
|
||||
Now time.Time
|
||||
}
|
||||
|
||||
// Validate reports whether input contains a structurally valid status
|
||||
// transition request. Adapters call Validate before touching the store.
|
||||
func (input UpdateStatusInput) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("update runtime status: game id must not be empty")
|
||||
}
|
||||
if !input.ExpectedFrom.IsKnown() {
|
||||
return fmt.Errorf(
|
||||
"update runtime status: expected from status %q is unsupported",
|
||||
input.ExpectedFrom,
|
||||
)
|
||||
}
|
||||
if !input.To.IsKnown() {
|
||||
return fmt.Errorf(
|
||||
"update runtime status: to status %q is unsupported",
|
||||
input.To,
|
||||
)
|
||||
}
|
||||
if err := runtime.Transition(input.ExpectedFrom, input.To); err != nil {
|
||||
return fmt.Errorf("update runtime status: %w", err)
|
||||
}
|
||||
if input.Now.IsZero() {
|
||||
return fmt.Errorf("update runtime status: now must not be zero")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func validUpdateStatusInput() UpdateStatusInput {
|
||||
return UpdateStatusInput{
|
||||
GameID: "game-test",
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
ExpectedContainerID: "container-1",
|
||||
To: runtime.StatusStopped,
|
||||
Now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateHappy(t *testing.T) {
|
||||
require.NoError(t, validUpdateStatusInput().Validate())
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateAcceptsEmptyContainerCAS(t *testing.T) {
|
||||
input := validUpdateStatusInput()
|
||||
input.ExpectedContainerID = ""
|
||||
|
||||
assert.NoError(t, input.Validate())
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateRejects(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*UpdateStatusInput)
|
||||
}{
|
||||
{"empty game id", func(i *UpdateStatusInput) { i.GameID = "" }},
|
||||
{"unknown expected from", func(i *UpdateStatusInput) {
|
||||
i.ExpectedFrom = "exotic"
|
||||
}},
|
||||
{"unknown to", func(i *UpdateStatusInput) {
|
||||
i.To = "exotic"
|
||||
}},
|
||||
{"zero now", func(i *UpdateStatusInput) {
|
||||
i.Now = time.Time{}
|
||||
}},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
input := validUpdateStatusInput()
|
||||
tt.mutate(&input)
|
||||
assert.Error(t, input.Validate())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateRejectsForbiddenTransition(t *testing.T) {
|
||||
input := validUpdateStatusInput()
|
||||
input.ExpectedFrom = runtime.StatusRemoved
|
||||
input.To = runtime.StatusRunning
|
||||
|
||||
err := input.Validate()
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, runtime.ErrInvalidTransition),
|
||||
"want runtime.ErrInvalidTransition, got %v", err)
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package ports
|
||||
|
||||
import "context"
|
||||
|
||||
// StreamOffsetStore keeps, per consumer label, the id of the last Redis
// Stream entry that was processed successfully. A worker calls Load
// once on startup to resume from the persisted offset and calls Save
// after each successfully handled message, so the next iteration
// starts past the entry it just processed.
//
// The `stream` argument of both methods carries the consumer's short
// logical label (e.g. `start_jobs`, `stop_jobs`), not the full stream
// name, so it stays stable when the underlying stream key is renamed.
type StreamOffsetStore interface {
	// Load returns the last processed entry id stored for the consumer
	// label passed in stream. found reports whether a value was
	// present; a missing key must be reported through found == false
	// and never through err.
	Load(ctx context.Context, stream string) (entryID string, found bool, err error)

	// Save records entryID as the new last processed offset for the
	// consumer label passed in stream. Any previously stored value is
	// overwritten unconditionally.
	Save(ctx context.Context, stream string, entryID string) error
}
|
||||
Reference in New Issue
Block a user