feat: runtime manager
@@ -0,0 +1,493 @@
// Package docker provides the production Docker SDK adapter that
// implements `galaxy/rtmanager/internal/ports.DockerClient`. The
// adapter is the single component allowed to talk to the local Docker
// daemon; every Runtime Manager service path that needs container
// lifecycle operations goes through this surface.
//
// The adapter is intentionally narrow — it does not orchestrate, log,
// or retry. Cross-cutting concerns (lease coordination, durable state,
// notification side-effects) live in the service layer.
package docker

import (
	"context"
	"errors"
	"fmt"
	"io"
	"maps"
	"strings"
	"sync"
	"time"

	cerrdefs "github.com/containerd/errdefs"
	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/api/types/image"
	"github.com/docker/docker/api/types/network"
	dockerclient "github.com/docker/docker/client"
	"github.com/docker/go-units"

	"galaxy/rtmanager/internal/ports"
)

// EnginePort is the in-container HTTP port the engine listens on. The
// value is fixed by `rtmanager/README.md §Container Model` and by the
// engine's Dockerfile (`game/Dockerfile`); RTM never publishes the port
// to the host. Keeping the constant here lets the adapter own the URL
// shape so the start service does not have to know it.
const EnginePort = 8080

// Config groups the dependencies and per-process defaults required to
// construct a Client. The struct is value-typed so wiring code can
// build it inline without intermediate variables.
type Config struct {
	// Docker stores the SDK client this adapter wraps. It must be
	// non-nil; callers typically construct it via `client.NewClientWithOpts`.
	Docker *dockerclient.Client

	// LogDriver stores the Docker logging driver applied to every
	// container the adapter creates (e.g. `json-file`).
	LogDriver string

	// LogOpts stores the comma-separated `key=value` driver options
	// forwarded to Docker. Empty disables driver-specific options.
	LogOpts string

	// Clock supplies the wall-clock used for `RunResult.StartedAt`.
	// Defaults to `time.Now` when nil.
	Clock func() time.Time
}

// Client is the production adapter implementing `ports.DockerClient`.
// Construct it via NewClient; do not zero-initialise.
type Client struct {
	docker    *dockerclient.Client
	logDriver string
	logOpts   string
	clock     func() time.Time
}

// NewClient constructs a Client from cfg. It returns an error if cfg
// does not carry the minimum collaborator set the adapter needs to
// function.
func NewClient(cfg Config) (*Client, error) {
	if cfg.Docker == nil {
		return nil, errors.New("new docker adapter: nil docker client")
	}
	if strings.TrimSpace(cfg.LogDriver) == "" {
		return nil, errors.New("new docker adapter: log driver must not be empty")
	}
	clock := cfg.Clock
	if clock == nil {
		clock = time.Now
	}
	return &Client{
		docker:    cfg.Docker,
		logDriver: cfg.LogDriver,
		logOpts:   cfg.LogOpts,
		clock:     clock,
	}, nil
}
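
// A minimal wiring sketch. The SDK options shown mirror the smoke tests
// and are illustrative, not requirements; any correctly configured
// *dockerclient.Client works:
//
//	sdk, err := dockerclient.NewClientWithOpts(dockerclient.FromEnv, dockerclient.WithAPIVersionNegotiation())
//	if err != nil { /* handle */ }
//	adapter, err := NewClient(Config{
//		Docker:    sdk,
//		LogDriver: "json-file",
//		LogOpts:   "max-size=1m,max-file=3",
//	})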

// EnsureNetwork verifies the user-defined Docker network is present.
// The adapter never creates networks; provisioning is the operator's
// job per `rtmanager/README.md §Container Model`.
func (client *Client) EnsureNetwork(ctx context.Context, name string) error {
	if _, err := client.docker.NetworkInspect(ctx, name, network.InspectOptions{}); err != nil {
		if cerrdefs.IsNotFound(err) {
			return ports.ErrNetworkMissing
		}
		return fmt.Errorf("ensure network %q: %w", name, err)
	}
	return nil
}

// PullImage pulls ref according to policy. The pull stream is drained
// to completion because the Docker SDK only finishes the underlying
// pull when the body is consumed.
func (client *Client) PullImage(ctx context.Context, ref string, policy ports.PullPolicy) error {
	if !policy.IsKnown() {
		return fmt.Errorf("pull image %q: unknown pull policy %q", ref, policy)
	}
	switch policy {
	case ports.PullPolicyAlways:
		return client.runPull(ctx, ref)
	case ports.PullPolicyIfMissing:
		if present, err := client.imagePresent(ctx, ref); err != nil {
			return err
		} else if present {
			return nil
		}
		return client.runPull(ctx, ref)
	case ports.PullPolicyNever:
		present, err := client.imagePresent(ctx, ref)
		if err != nil {
			return err
		}
		if !present {
			return ports.ErrImageNotFound
		}
		return nil
	default:
		return fmt.Errorf("pull image %q: unsupported pull policy %q", ref, policy)
	}
}

// InspectImage returns image metadata for ref. RTM only reads labels
// at start time; the broader inspect struct stays accessible for
// diagnostics.
func (client *Client) InspectImage(ctx context.Context, ref string) (ports.ImageInspect, error) {
	inspect, err := client.docker.ImageInspect(ctx, ref)
	if err != nil {
		if cerrdefs.IsNotFound(err) {
			return ports.ImageInspect{}, ports.ErrImageNotFound
		}
		return ports.ImageInspect{}, fmt.Errorf("inspect image %q: %w", ref, err)
	}
	var labels map[string]string
	if inspect.Config != nil {
		labels = copyStringMap(inspect.Config.Labels)
	}
	return ports.ImageInspect{Ref: ref, Labels: labels}, nil
}

// InspectContainer returns container metadata for containerID. The
// adapter best-effort decodes Docker timestamps; malformed values map
// to the zero time so callers do not have to defend against nil
// pointers in the SDK response.
func (client *Client) InspectContainer(ctx context.Context, containerID string) (ports.ContainerInspect, error) {
	inspect, err := client.docker.ContainerInspect(ctx, containerID)
	if err != nil {
		if cerrdefs.IsNotFound(err) {
			return ports.ContainerInspect{}, ports.ErrContainerNotFound
		}
		return ports.ContainerInspect{}, fmt.Errorf("inspect container %q: %w", containerID, err)
	}

	result := ports.ContainerInspect{ID: inspect.ID}
	if inspect.ContainerJSONBase != nil {
		result.RestartCount = inspect.RestartCount
		if inspect.State != nil {
			result.Status = string(inspect.State.Status)
			result.OOMKilled = inspect.State.OOMKilled
			result.ExitCode = inspect.State.ExitCode
			result.StartedAt = parseDockerTime(inspect.State.StartedAt)
			result.FinishedAt = parseDockerTime(inspect.State.FinishedAt)
			if inspect.State.Health != nil {
				result.Health = string(inspect.State.Health.Status)
			}
		}
	}
	if inspect.Config != nil {
		result.ImageRef = inspect.Config.Image
		result.Hostname = inspect.Config.Hostname
		result.Labels = copyStringMap(inspect.Config.Labels)
	}
	return result, nil
}

// Run creates and starts one container according to spec. On
// `ContainerStart` failure the adapter best-effort removes the partial
// container so the start service never has to clean up after a failed
// start path.
func (client *Client) Run(ctx context.Context, spec ports.RunSpec) (ports.RunResult, error) {
	if err := spec.Validate(); err != nil {
		return ports.RunResult{}, fmt.Errorf("run container: %w", err)
	}
	memoryBytes, err := units.RAMInBytes(spec.Memory)
	if err != nil {
		return ports.RunResult{}, fmt.Errorf("run container %q: parse memory %q: %w", spec.Name, spec.Memory, err)
	}
	pidsLimit := int64(spec.PIDsLimit)

	containerCfg := &container.Config{
		Image:    spec.Image,
		Hostname: spec.Hostname,
		Env:      envMapToSlice(spec.Env),
		Labels:   copyStringMap(spec.Labels),
		Cmd:      append([]string(nil), spec.Cmd...),
	}
	hostCfg := &container.HostConfig{
		Binds: bindMountsToBinds(spec.BindMounts),
		LogConfig: container.LogConfig{
			Type:   client.logDriver,
			Config: parseLogOpts(client.logOpts),
		},
		Resources: container.Resources{
			NanoCPUs:  int64(spec.CPUQuota * 1e9),
			Memory:    memoryBytes,
			PidsLimit: &pidsLimit,
		},
	}
	netCfg := &network.NetworkingConfig{
		EndpointsConfig: map[string]*network.EndpointSettings{
			spec.Network: {
				Aliases: []string{spec.Hostname},
			},
		},
	}

	created, err := client.docker.ContainerCreate(ctx, containerCfg, hostCfg, netCfg, nil, spec.Name)
	if err != nil {
		return ports.RunResult{}, fmt.Errorf("create container %q: %w", spec.Name, err)
	}

	if err := client.docker.ContainerStart(ctx, created.ID, container.StartOptions{}); err != nil {
		client.cleanupAfterFailedStart(created.ID)
		return ports.RunResult{}, fmt.Errorf("start container %q: %w", spec.Name, err)
	}

	return ports.RunResult{
		ContainerID:    created.ID,
		EngineEndpoint: fmt.Sprintf("http://%s:%d", spec.Hostname, EnginePort),
		StartedAt:      client.clock(),
	}, nil
}
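
// A sketch of a typical call; field values mirror the adapter tests and
// are illustrative, not prescriptive:
//
//	result, err := adapter.Run(ctx, ports.RunSpec{
//		Name:      "galaxy-game-game-1",
//		Image:     "galaxy/game:test",
//		Hostname:  "galaxy-game-game-1",
//		Network:   "galaxy-net",
//		LogDriver: "json-file",
//		CPUQuota:  1.0,
//		Memory:    "512m",
//		PIDsLimit: 512,
//	})
//	// On success result.EngineEndpoint is "http://galaxy-game-game-1:8080".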

// Stop bounds graceful shutdown by timeout. A missing container is
// surfaced as ErrContainerNotFound so the service layer can treat it
// as already-stopped per `rtmanager/README.md §Lifecycles → Stop`.
func (client *Client) Stop(ctx context.Context, containerID string, timeout time.Duration) error {
	seconds := max(int(timeout.Round(time.Second).Seconds()), 0)
	if err := client.docker.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &seconds}); err != nil {
		if cerrdefs.IsNotFound(err) {
			return ports.ErrContainerNotFound
		}
		return fmt.Errorf("stop container %q: %w", containerID, err)
	}
	return nil
}

// Remove removes the container without forcing kill. A missing
// container is reported as success so callers can treat the operation
// as idempotent.
func (client *Client) Remove(ctx context.Context, containerID string) error {
	if err := client.docker.ContainerRemove(ctx, containerID, container.RemoveOptions{}); err != nil {
		if cerrdefs.IsNotFound(err) {
			return nil
		}
		return fmt.Errorf("remove container %q: %w", containerID, err)
	}
	return nil
}

// List returns container summaries that match filter. Empty Labels
// match every container; the reconciler always passes
// `com.galaxy.owner=rtmanager`.
func (client *Client) List(ctx context.Context, filter ports.ListFilter) ([]ports.ContainerSummary, error) {
	args := filters.NewArgs()
	for key, value := range filter.Labels {
		args.Add("label", key+"="+value)
	}
	summaries, err := client.docker.ContainerList(ctx, container.ListOptions{All: true, Filters: args})
	if err != nil {
		return nil, fmt.Errorf("list containers: %w", err)
	}
	out := make([]ports.ContainerSummary, 0, len(summaries))
	for _, summary := range summaries {
		hostname := ""
		if len(summary.Names) > 0 {
			hostname = strings.TrimPrefix(summary.Names[0], "/")
		}
		out = append(out, ports.ContainerSummary{
			ID:        summary.ID,
			ImageRef:  summary.Image,
			Hostname:  hostname,
			Labels:    copyStringMap(summary.Labels),
			Status:    string(summary.State),
			StartedAt: time.Unix(summary.Created, 0).UTC(),
		})
	}
	return out, nil
}

// EventsListen subscribes to the Docker events stream and returns a
// typed channel of decoded container events plus an asynchronous
// error channel. The caller cancels ctx to terminate the subscription;
// the goroutine closes both channels on termination.
func (client *Client) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
	msgs, sdkErrs := client.docker.Events(ctx, events.ListOptions{})
	out := make(chan ports.DockerEvent)
	outErrs := make(chan error, 1)

	var closeOnce sync.Once
	closeAll := func() {
		closeOnce.Do(func() {
			close(out)
			close(outErrs)
		})
	}

	go func() {
		defer closeAll()
		for {
			select {
			case <-ctx.Done():
				return
			case msg, ok := <-msgs:
				if !ok {
					return
				}
				if msg.Type != events.ContainerEventType {
					continue
				}
				select {
				case <-ctx.Done():
					return
				case out <- decodeEvent(msg):
				}
			case err, ok := <-sdkErrs:
				if !ok {
					return
				}
				if err == nil {
					continue
				}
				select {
				case <-ctx.Done():
				case outErrs <- err:
				}
				return
			}
		}
	}()

	return out, outErrs, nil
}
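
// A minimal consumption sketch (error handling elided; the goroutine
// above closes both channels after ctx cancellation or the first
// stream error):
//
//	events, errs, err := adapter.EventsListen(ctx)
//	if err != nil { /* handle */ }
//	for {
//		select {
//		case ev, ok := <-events:
//			if !ok { return }
//			// react to ev.Action / ev.ContainerID
//		case streamErr, ok := <-errs:
//			if !ok { return }
//			_ = streamErr // the subscription terminates after the first error
//		}
//	}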

func (client *Client) cleanupAfterFailedStart(containerID string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	_ = client.docker.ContainerRemove(cleanupCtx, containerID, container.RemoveOptions{Force: true})
}

func (client *Client) imagePresent(ctx context.Context, ref string) (bool, error) {
	if _, err := client.docker.ImageInspect(ctx, ref); err != nil {
		if cerrdefs.IsNotFound(err) {
			return false, nil
		}
		return false, fmt.Errorf("inspect image %q: %w", ref, err)
	}
	return true, nil
}

func (client *Client) runPull(ctx context.Context, ref string) error {
	body, err := client.docker.ImagePull(ctx, ref, image.PullOptions{})
	if err != nil {
		if cerrdefs.IsNotFound(err) {
			return ports.ErrImageNotFound
		}
		return fmt.Errorf("pull image %q: %w", ref, err)
	}
	defer body.Close()
	if _, err := io.Copy(io.Discard, body); err != nil {
		return fmt.Errorf("drain pull stream for %q: %w", ref, err)
	}
	return nil
}

func envMapToSlice(envMap map[string]string) []string {
	if len(envMap) == 0 {
		return nil
	}
	out := make([]string, 0, len(envMap))
	for key, value := range envMap {
		out = append(out, key+"="+value)
	}
	return out
}

func bindMountsToBinds(mounts []ports.BindMount) []string {
	if len(mounts) == 0 {
		return nil
	}
	binds := make([]string, 0, len(mounts))
	for _, mount := range mounts {
		bind := mount.HostPath + ":" + mount.MountPath
		if mount.ReadOnly {
			bind += ":ro"
		}
		binds = append(binds, bind)
	}
	return binds
}

func parseLogOpts(raw string) map[string]string {
	if strings.TrimSpace(raw) == "" {
		return nil
	}
	out := make(map[string]string)
	for part := range strings.SplitSeq(raw, ",") {
		entry := strings.TrimSpace(part)
		if entry == "" {
			continue
		}
		index := strings.IndexByte(entry, '=')
		if index <= 0 {
			continue
		}
		out[entry[:index]] = entry[index+1:]
	}
	if len(out) == 0 {
		return nil
	}
	return out
}
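
// For example, per the adapter tests, "max-size=1m,max-file=3" parses to
// map[string]string{"max-size": "1m", "max-file": "3"}; blank entries and
// entries without a key (e.g. "=novalue") are dropped.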

func parseDockerTime(raw string) time.Time {
	if raw == "" {
		return time.Time{}
	}
	parsed, err := time.Parse(time.RFC3339Nano, raw)
	if err != nil {
		return time.Time{}
	}
	return parsed.UTC()
}

func copyStringMap(in map[string]string) map[string]string {
	if in == nil {
		return nil
	}
	out := make(map[string]string, len(in))
	maps.Copy(out, in)
	return out
}

func decodeEvent(msg events.Message) ports.DockerEvent {
	occurredAt := time.Time{}
	switch {
	case msg.TimeNano != 0:
		occurredAt = time.Unix(0, msg.TimeNano).UTC()
	case msg.Time != 0:
		occurredAt = time.Unix(msg.Time, 0).UTC()
	}
	exitCode := 0
	if raw, ok := msg.Actor.Attributes["exitCode"]; ok {
		if value, err := parseExitCode(raw); err == nil {
			exitCode = value
		}
	}
	return ports.DockerEvent{
		Action:      string(msg.Action),
		ContainerID: msg.Actor.ID,
		Labels:      copyStringMap(msg.Actor.Attributes),
		ExitCode:    exitCode,
		OccurredAt:  occurredAt,
	}
}
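
// For example, as exercised in the events-listener test, a `die` message
// whose actor carries the attribute `exitCode: "137"` decodes to a
// DockerEvent with Action "die" and ExitCode 137.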

func parseExitCode(raw string) (int, error) {
	value := 0
	for _, r := range raw {
		if r < '0' || r > '9' {
			return 0, fmt.Errorf("non-numeric exit code %q", raw)
		}
		value = value*10 + int(r-'0')
	}
	return value, nil
}

// Compile-time assertion: Client implements ports.DockerClient.
var _ ports.DockerClient = (*Client)(nil)
@@ -0,0 +1,561 @@

package docker

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"net/url"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	dockerclient "github.com/docker/docker/client"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"galaxy/rtmanager/internal/ports"
)

// newTestClient wires an httptest.Server backed Docker SDK client to our
// adapter. The handler is invoked for every Docker API request issued
// during the test; tests assert on path and method to route the
// response.
func newTestClient(t *testing.T, handler http.HandlerFunc) *Client {
	t.Helper()
	server := httptest.NewServer(handler)
	t.Cleanup(server.Close)

	docker, err := dockerclient.NewClientWithOpts(
		dockerclient.WithHost(server.URL),
		dockerclient.WithHTTPClient(server.Client()),
		dockerclient.WithVersion("1.45"),
	)
	require.NoError(t, err)
	t.Cleanup(func() { _ = docker.Close() })

	client, err := NewClient(Config{
		Docker:    docker,
		LogDriver: "json-file",
		LogOpts:   "max-size=1m,max-file=3",
		Clock:     func() time.Time { return time.Date(2026, time.April, 27, 12, 0, 0, 0, time.UTC) },
	})
	require.NoError(t, err)
	return client
}

func writeJSON(t *testing.T, w http.ResponseWriter, status int, body any) {
	t.Helper()
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	require.NoError(t, json.NewEncoder(w).Encode(body))
}

func writeNotFound(t *testing.T, w http.ResponseWriter, msg string) {
	t.Helper()
	writeJSON(t, w, http.StatusNotFound, map[string]string{"message": msg})
}

// Docker SDK uses the /v1.45 prefix when the client is pinned to API 1.45.
func dockerPath(suffix string) string {
	return "/v1.45" + suffix
}

func TestNewClientValidatesConfig(t *testing.T) {
	t.Run("nil docker client", func(t *testing.T) {
		_, err := NewClient(Config{LogDriver: "json-file"})
		require.Error(t, err)
		assert.Contains(t, err.Error(), "nil docker client")
	})
	t.Run("empty log driver", func(t *testing.T) {
		docker, err := dockerclient.NewClientWithOpts(dockerclient.WithHost("tcp://127.0.0.1:65535"))
		require.NoError(t, err)
		t.Cleanup(func() { _ = docker.Close() })
		_, err = NewClient(Config{Docker: docker, LogDriver: " "})
		require.Error(t, err)
		assert.Contains(t, err.Error(), "log driver")
	})
}

func TestEnsureNetwork(t *testing.T) {
	t.Run("present", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, http.MethodGet, r.Method)
			require.Equal(t, dockerPath("/networks/galaxy-net"), r.URL.Path)
			writeJSON(t, w, http.StatusOK, map[string]any{"Id": "net-1", "Name": "galaxy-net"})
		})
		require.NoError(t, client.EnsureNetwork(context.Background(), "galaxy-net"))
	})
	t.Run("missing", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			writeNotFound(t, w, "no such network")
		})
		err := client.EnsureNetwork(context.Background(), "missing")
		require.Error(t, err)
		assert.ErrorIs(t, err, ports.ErrNetworkMissing)
	})
	t.Run("transport error", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			http.Error(w, "boom", http.StatusInternalServerError)
		})
		err := client.EnsureNetwork(context.Background(), "x")
		require.Error(t, err)
		assert.NotErrorIs(t, err, ports.ErrNetworkMissing)
	})
}

func TestInspectImage(t *testing.T) {
	t.Run("present", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, http.MethodGet, r.Method)
			require.Equal(t, dockerPath("/images/galaxy/game:test/json"), r.URL.Path)
			writeJSON(t, w, http.StatusOK, map[string]any{
				"Id": "sha256:abc",
				"Config": map[string]any{
					"Labels": map[string]string{
						"com.galaxy.cpu_quota":  "1.0",
						"com.galaxy.memory":     "512m",
						"com.galaxy.pids_limit": "512",
					},
				},
			})
		})
		got, err := client.InspectImage(context.Background(), "galaxy/game:test")
		require.NoError(t, err)
		assert.Equal(t, "galaxy/game:test", got.Ref)
		assert.Equal(t, "1.0", got.Labels["com.galaxy.cpu_quota"])
		assert.Equal(t, "512m", got.Labels["com.galaxy.memory"])
	})
	t.Run("not found", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			writeNotFound(t, w, "no such image")
		})
		_, err := client.InspectImage(context.Background(), "galaxy/missing:tag")
		require.Error(t, err)
		assert.ErrorIs(t, err, ports.ErrImageNotFound)
	})
}

func TestInspectContainer(t *testing.T) {
	t.Run("present", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, http.MethodGet, r.Method)
			require.Equal(t, dockerPath("/containers/cont-1/json"), r.URL.Path)
			writeJSON(t, w, http.StatusOK, map[string]any{
				"Id":           "cont-1",
				"RestartCount": 2,
				"State": map[string]any{
					"Status":     "running",
					"OOMKilled":  false,
					"ExitCode":   0,
					"StartedAt":  "2026-04-27T11:00:00.5Z",
					"FinishedAt": "0001-01-01T00:00:00Z",
					"Health":     map[string]any{"Status": "healthy"},
				},
				"Config": map[string]any{
					"Image":    "galaxy/game:test",
					"Hostname": "galaxy-game-game-1",
					"Labels": map[string]string{
						"com.galaxy.owner":   "rtmanager",
						"com.galaxy.game_id": "game-1",
					},
				},
			})
		})
		got, err := client.InspectContainer(context.Background(), "cont-1")
		require.NoError(t, err)
		assert.Equal(t, "cont-1", got.ID)
		assert.Equal(t, 2, got.RestartCount)
		assert.Equal(t, "running", got.Status)
		assert.Equal(t, "healthy", got.Health)
		assert.Equal(t, "galaxy/game:test", got.ImageRef)
		assert.Equal(t, "galaxy-game-game-1", got.Hostname)
		assert.Equal(t, "rtmanager", got.Labels["com.galaxy.owner"])
		assert.False(t, got.StartedAt.IsZero())
	})
	t.Run("not found", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			writeNotFound(t, w, "no such container")
		})
		_, err := client.InspectContainer(context.Background(), "missing")
		require.Error(t, err)
		assert.ErrorIs(t, err, ports.ErrContainerNotFound)
	})
}

func TestPullImagePolicies(t *testing.T) {
	t.Run("if_missing/found skips pull", func(t *testing.T) {
		hits := struct {
			inspect atomic.Int32
			pull    atomic.Int32
		}{}
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.HasSuffix(r.URL.Path, "/json") && r.Method == http.MethodGet:
				hits.inspect.Add(1)
				writeJSON(t, w, http.StatusOK, map[string]any{"Id": "sha256:x"})
			case strings.Contains(r.URL.Path, "/images/create"):
				hits.pull.Add(1)
				w.WriteHeader(http.StatusOK)
			default:
				t.Fatalf("unexpected request %s %s", r.Method, r.URL.Path)
			}
		})
		require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyIfMissing))
		assert.Equal(t, int32(1), hits.inspect.Load())
		assert.Equal(t, int32(0), hits.pull.Load())
	})
	t.Run("if_missing/absent triggers pull", func(t *testing.T) {
		hits := struct {
			inspect atomic.Int32
			pull    atomic.Int32
		}{}
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			switch {
			case strings.HasSuffix(r.URL.Path, "/json") && r.Method == http.MethodGet:
				hits.inspect.Add(1)
				writeNotFound(t, w, "no such image")
			case strings.Contains(r.URL.Path, "/images/create"):
				hits.pull.Add(1)
				w.WriteHeader(http.StatusOK)
				_, _ = io.WriteString(w, `{"status":"Pulling..."}`+"\n"+`{"status":"Done"}`+"\n")
			default:
				t.Fatalf("unexpected request %s %s", r.Method, r.URL.Path)
			}
		})
		require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyIfMissing))
		assert.Equal(t, int32(1), hits.inspect.Load())
		assert.Equal(t, int32(1), hits.pull.Load())
	})
	t.Run("always pulls regardless of cache", func(t *testing.T) {
		var pullCount atomic.Int32
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Contains(t, r.URL.Path, "/images/create")
			pullCount.Add(1)
			w.WriteHeader(http.StatusOK)
		})
		require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyAlways))
		assert.Equal(t, int32(1), pullCount.Load())
	})
	t.Run("never with absent image", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, http.MethodGet, r.Method)
			writeNotFound(t, w, "no such image")
		})
		err := client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyNever)
		require.Error(t, err)
		assert.ErrorIs(t, err, ports.ErrImageNotFound)
	})
	t.Run("never with present image", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, http.MethodGet, r.Method)
			writeJSON(t, w, http.StatusOK, map[string]any{"Id": "x"})
		})
		require.NoError(t, client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicyNever))
	})
	t.Run("unknown policy", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			t.Fatal("must not call docker on unknown policy")
		})
		err := client.PullImage(context.Background(), "alpine:3.21", ports.PullPolicy("invalid"))
		require.Error(t, err)
	})
}

func TestRunHappyPath(t *testing.T) {
	calls := struct {
		create atomic.Int32
		start  atomic.Int32
		remove atomic.Int32
	}{}
	client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/containers/create"):
			calls.create.Add(1)
			require.Equal(t, "galaxy-game-game-1", r.URL.Query().Get("name"))
			writeJSON(t, w, http.StatusCreated, map[string]any{"Id": "cont-new", "Warnings": []string{}})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/start"):
			calls.start.Add(1)
			require.Equal(t, dockerPath("/containers/cont-new/start"), r.URL.Path)
			w.WriteHeader(http.StatusNoContent)
		case r.Method == http.MethodDelete && strings.HasPrefix(r.URL.Path, dockerPath("/containers/")):
			calls.remove.Add(1)
			w.WriteHeader(http.StatusNoContent)
		default:
			t.Fatalf("unexpected %s %s", r.Method, r.URL.Path)
		}
	})

	result, err := client.Run(context.Background(), ports.RunSpec{
		Name:     "galaxy-game-game-1",
		Image:    "galaxy/game:test",
		Hostname: "galaxy-game-game-1",
		Network:  "galaxy-net",
		Env: map[string]string{
			"GAME_STATE_PATH": "/var/lib/galaxy-game",
			"STORAGE_PATH":    "/var/lib/galaxy-game",
		},
		Labels:    map[string]string{"com.galaxy.owner": "rtmanager"},
		LogDriver: "json-file",
		BindMounts: []ports.BindMount{
			{HostPath: "/var/lib/galaxy/games/game-1", MountPath: "/var/lib/galaxy-game"},
		},
		CPUQuota:  1.0,
		Memory:    "512m",
		PIDsLimit: 512,
	})
	require.NoError(t, err)
	assert.Equal(t, "cont-new", result.ContainerID)
	assert.Equal(t, "http://galaxy-game-game-1:8080", result.EngineEndpoint)
	assert.False(t, result.StartedAt.IsZero())
	assert.Equal(t, int32(1), calls.create.Load())
	assert.Equal(t, int32(1), calls.start.Load())
	assert.Equal(t, int32(0), calls.remove.Load())
}

func TestRunStartFailureRemovesContainer(t *testing.T) {
	calls := struct {
		create atomic.Int32
		start  atomic.Int32
		remove atomic.Int32
	}{}
	client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/containers/create"):
			calls.create.Add(1)
			writeJSON(t, w, http.StatusCreated, map[string]any{"Id": "cont-x"})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/start"):
			calls.start.Add(1)
			http.Error(w, `{"message":"insufficient host resources"}`, http.StatusInternalServerError)
		case r.Method == http.MethodDelete && strings.HasPrefix(r.URL.Path, dockerPath("/containers/cont-x")):
			calls.remove.Add(1)
			require.Equal(t, "1", r.URL.Query().Get("force"))
			w.WriteHeader(http.StatusNoContent)
		default:
			t.Fatalf("unexpected %s %s", r.Method, r.URL.Path)
		}
	})

	_, err := client.Run(context.Background(), ports.RunSpec{
		Name:      "x",
		Image:     "img",
		Hostname:  "x",
		Network:   "n",
		LogDriver: "json-file",
		CPUQuota:  1.0,
		Memory:    "64m",
		PIDsLimit: 64,
	})
	require.Error(t, err)
	assert.Equal(t, int32(1), calls.create.Load())
	assert.Equal(t, int32(1), calls.start.Load())
	assert.Equal(t, int32(1), calls.remove.Load(), "adapter must roll back the partial container")
}

func TestRunRejectsInvalidSpec(t *testing.T) {
	client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
		t.Fatal("must not contact docker on invalid spec")
	})
	_, err := client.Run(context.Background(), ports.RunSpec{Name: "x"})
	require.Error(t, err)
	assert.Contains(t, err.Error(), "image must not be empty")
}

func TestStop(t *testing.T) {
	t.Run("graceful stop", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, http.MethodPost, r.Method)
			require.Equal(t, dockerPath("/containers/cont-1/stop"), r.URL.Path)
			require.Equal(t, "30", r.URL.Query().Get("t"))
			w.WriteHeader(http.StatusNoContent)
		})
		require.NoError(t, client.Stop(context.Background(), "cont-1", 30*time.Second))
	})
	t.Run("missing container", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			writeNotFound(t, w, "no such container")
		})
		err := client.Stop(context.Background(), "missing", 30*time.Second)
		assert.ErrorIs(t, err, ports.ErrContainerNotFound)
	})
	t.Run("negative timeout normalised to zero", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, "0", r.URL.Query().Get("t"))
			w.WriteHeader(http.StatusNoContent)
		})
		require.NoError(t, client.Stop(context.Background(), "x", -5*time.Second))
	})
}

func TestRemoveIsIdempotent(t *testing.T) {
	t.Run("present", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			require.Equal(t, http.MethodDelete, r.Method)
			w.WriteHeader(http.StatusNoContent)
		})
		require.NoError(t, client.Remove(context.Background(), "cont-1"))
	})
	t.Run("missing", func(t *testing.T) {
		client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
			writeNotFound(t, w, "no such container")
		})
		require.NoError(t, client.Remove(context.Background(), "missing"))
	})
}

func TestListAppliesLabelFilter(t *testing.T) {
	client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
		require.Equal(t, http.MethodGet, r.Method)
		require.Equal(t, dockerPath("/containers/json"), r.URL.Path)
		require.Equal(t, "1", r.URL.Query().Get("all"))

		filtersRaw := r.URL.Query().Get("filters")
		require.NotEmpty(t, filtersRaw)
		var args map[string]map[string]bool
		require.NoError(t, json.Unmarshal([]byte(filtersRaw), &args))
		require.True(t, args["label"]["com.galaxy.owner=rtmanager"])

		writeJSON(t, w, http.StatusOK, []map[string]any{
			{
				"Id":      "cont-a",
				"Image":   "galaxy/game:1.2.3",
				"Names":   []string{"/galaxy-game-game-1"},
				"Labels":  map[string]string{"com.galaxy.owner": "rtmanager"},
				"State":   "running",
				"Created": int64(1700000000),
			},
		})
	})

	got, err := client.List(context.Background(), ports.ListFilter{
		Labels: map[string]string{"com.galaxy.owner": "rtmanager"},
	})
	require.NoError(t, err)
	require.Len(t, got, 1)
	assert.Equal(t, "cont-a", got[0].ID)
	assert.Equal(t, "galaxy/game:1.2.3", got[0].ImageRef)
	assert.Equal(t, "galaxy-game-game-1", got[0].Hostname)
	assert.Equal(t, "running", got[0].Status)
	assert.False(t, got[0].StartedAt.IsZero())
	assert.Equal(t, "rtmanager", got[0].Labels["com.galaxy.owner"])
}

func TestEventsListenDecodesContainerEvents(t *testing.T) {
	mu := make(chan struct{})
	client := newTestClient(t, func(w http.ResponseWriter, r *http.Request) {
		require.Equal(t, http.MethodGet, r.Method)
		require.Equal(t, dockerPath("/events"), r.URL.Path)

		flusher, ok := w.(http.Flusher)
		require.True(t, ok)
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusOK)
		flusher.Flush()

		// Container start event
		writeEvent(t, w, "container", "start", "cont-1", map[string]string{
			"image":              "galaxy/game:1.2.3",
			"name":               "galaxy-game-game-1",
			"com.galaxy.game_id": "game-1",
		}, time.Now())
		flusher.Flush()

		// Container die event with exit code 137
		writeEvent(t, w, "container", "die", "cont-1", map[string]string{
			"exitCode": "137",
		}, time.Now())
		flusher.Flush()

		// Image event must be filtered out by adapter
		writeEvent(t, w, "image", "pull", "img", nil, time.Now())
		flusher.Flush()

		<-mu
	})
	defer close(mu)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	events, _, err := client.EventsListen(ctx)
	require.NoError(t, err)

	got := []ports.DockerEvent{}
	deadline := time.After(2 * time.Second)
	for len(got) < 2 {
		select {
		case ev, ok := <-events:
			if !ok {
				t.Fatalf("events channel closed; got %d events", len(got))
			}
			got = append(got, ev)
		case <-deadline:
			t.Fatalf("did not receive expected events; have %d", len(got))
		}
	}
	require.Len(t, got, 2)
	assert.Equal(t, "start", got[0].Action)
	assert.Equal(t, "cont-1", got[0].ContainerID)
	assert.Equal(t, "game-1", got[0].Labels["com.galaxy.game_id"])
	assert.Equal(t, "die", got[1].Action)
	assert.Equal(t, 137, got[1].ExitCode)
}

func writeEvent(t *testing.T, w io.Writer, eventType, action, id string, attributes map[string]string, when time.Time) {
	t.Helper()
	payload := map[string]any{
		"Type":     eventType,
		"Action":   action,
		"Actor":    map[string]any{"ID": id, "Attributes": attributes},
		"time":     when.Unix(),
		"timeNano": when.UnixNano(),
	}
	data, err := json.Marshal(payload)
	require.NoError(t, err)
	_, err = fmt.Fprintln(w, string(data))
	require.NoError(t, err)
}

// Sanity: parsing helpers.
func TestParseLogOpts(t *testing.T) {
	got := parseLogOpts("max-size=1m,max-file=3, ,empty=,=novalue")
	assert.Equal(t, "1m", got["max-size"])
	assert.Equal(t, "3", got["max-file"])
	assert.Equal(t, "", got["empty"])
	_, hasNovalue := got["=novalue"]
	assert.False(t, hasNovalue)
}

func TestParseDockerTime(t *testing.T) {
	assert.True(t, parseDockerTime("").IsZero())
	assert.True(t, parseDockerTime("not-a-date").IsZero())
	parsed := parseDockerTime("2026-04-27T11:00:00.5Z")
	assert.False(t, parsed.IsZero())
	assert.Equal(t, time.UTC, parsed.Location())
}

func TestEnvMapToSliceDeterministicLength(t *testing.T) {
	got := envMapToSlice(map[string]string{"A": "1", "B": "2"})
	assert.Len(t, got, 2)
	for _, kv := range got {
		assert.Contains(t, []string{"A=1", "B=2"}, kv)
	}
	assert.Nil(t, envMapToSlice(nil))
}

// Runtime sanity: make sure the errors.Is wiring for the sentinel errors stays intact.
func TestSentinelErrorsAreDistinct(t *testing.T) {
	require.True(t, errors.Is(ports.ErrNetworkMissing, ports.ErrNetworkMissing))
	require.False(t, errors.Is(ports.ErrNetworkMissing, ports.ErrImageNotFound))
}

func TestURLPathEscapingForCharacters(t *testing.T) {
	// Ensure the SDK URL path encodes special characters; the adapter
	// passes raw inputs through and lets the SDK escape.
	encoded := url.PathEscape("game-1")
	assert.Equal(t, "game-1", encoded)
}
@@ -0,0 +1,175 @@

// Code generated by MockGen. DO NOT EDIT.
// Source: galaxy/rtmanager/internal/ports (interfaces: DockerClient)
//
// Generated by this command:
//
//	mockgen -destination=../adapters/docker/mocks/mock_dockerclient.go -package=mocks galaxy/rtmanager/internal/ports DockerClient
//

// Package mocks is a generated GoMock package.
package mocks

import (
	context "context"
	ports "galaxy/rtmanager/internal/ports"
	reflect "reflect"
	time "time"

	gomock "go.uber.org/mock/gomock"
)

// MockDockerClient is a mock of DockerClient interface.
type MockDockerClient struct {
	ctrl     *gomock.Controller
	recorder *MockDockerClientMockRecorder
	isgomock struct{}
}

// MockDockerClientMockRecorder is the mock recorder for MockDockerClient.
type MockDockerClientMockRecorder struct {
	mock *MockDockerClient
}

// NewMockDockerClient creates a new mock instance.
func NewMockDockerClient(ctrl *gomock.Controller) *MockDockerClient {
	mock := &MockDockerClient{ctrl: ctrl}
	mock.recorder = &MockDockerClientMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockDockerClient) EXPECT() *MockDockerClientMockRecorder {
	return m.recorder
}

// EnsureNetwork mocks base method.
func (m *MockDockerClient) EnsureNetwork(ctx context.Context, name string) error {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "EnsureNetwork", ctx, name)
	ret0, _ := ret[0].(error)
	return ret0
}

// EnsureNetwork indicates an expected call of EnsureNetwork.
func (mr *MockDockerClientMockRecorder) EnsureNetwork(ctx, name any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EnsureNetwork", reflect.TypeOf((*MockDockerClient)(nil).EnsureNetwork), ctx, name)
}

// EventsListen mocks base method.
func (m *MockDockerClient) EventsListen(ctx context.Context) (<-chan ports.DockerEvent, <-chan error, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "EventsListen", ctx)
	ret0, _ := ret[0].(<-chan ports.DockerEvent)
	ret1, _ := ret[1].(<-chan error)
	ret2, _ := ret[2].(error)
	return ret0, ret1, ret2
}

// EventsListen indicates an expected call of EventsListen.
func (mr *MockDockerClientMockRecorder) EventsListen(ctx any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EventsListen", reflect.TypeOf((*MockDockerClient)(nil).EventsListen), ctx)
}

// InspectContainer mocks base method.
func (m *MockDockerClient) InspectContainer(ctx context.Context, containerID string) (ports.ContainerInspect, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "InspectContainer", ctx, containerID)
	ret0, _ := ret[0].(ports.ContainerInspect)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// InspectContainer indicates an expected call of InspectContainer.
func (mr *MockDockerClientMockRecorder) InspectContainer(ctx, containerID any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InspectContainer", reflect.TypeOf((*MockDockerClient)(nil).InspectContainer), ctx, containerID)
}

// InspectImage mocks base method.
func (m *MockDockerClient) InspectImage(ctx context.Context, ref string) (ports.ImageInspect, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "InspectImage", ctx, ref)
	ret0, _ := ret[0].(ports.ImageInspect)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// InspectImage indicates an expected call of InspectImage.
func (mr *MockDockerClientMockRecorder) InspectImage(ctx, ref any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InspectImage", reflect.TypeOf((*MockDockerClient)(nil).InspectImage), ctx, ref)
}

// List mocks base method.
func (m *MockDockerClient) List(ctx context.Context, filter ports.ListFilter) ([]ports.ContainerSummary, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "List", ctx, filter)
	ret0, _ := ret[0].([]ports.ContainerSummary)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// List indicates an expected call of List.
func (mr *MockDockerClientMockRecorder) List(ctx, filter any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "List", reflect.TypeOf((*MockDockerClient)(nil).List), ctx, filter)
}

// PullImage mocks base method.
func (m *MockDockerClient) PullImage(ctx context.Context, ref string, policy ports.PullPolicy) error {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "PullImage", ctx, ref, policy)
	ret0, _ := ret[0].(error)
	return ret0
}

// PullImage indicates an expected call of PullImage.
func (mr *MockDockerClientMockRecorder) PullImage(ctx, ref, policy any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PullImage", reflect.TypeOf((*MockDockerClient)(nil).PullImage), ctx, ref, policy)
}

// Remove mocks base method.
func (m *MockDockerClient) Remove(ctx context.Context, containerID string) error {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Remove", ctx, containerID)
	ret0, _ := ret[0].(error)
	return ret0
}

// Remove indicates an expected call of Remove.
func (mr *MockDockerClientMockRecorder) Remove(ctx, containerID any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Remove", reflect.TypeOf((*MockDockerClient)(nil).Remove), ctx, containerID)
}

// Run mocks base method.
func (m *MockDockerClient) Run(ctx context.Context, spec ports.RunSpec) (ports.RunResult, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Run", ctx, spec)
	ret0, _ := ret[0].(ports.RunResult)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// Run indicates an expected call of Run.
func (mr *MockDockerClientMockRecorder) Run(ctx, spec any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Run", reflect.TypeOf((*MockDockerClient)(nil).Run), ctx, spec)
}

// Stop mocks base method.
func (m *MockDockerClient) Stop(ctx context.Context, containerID string, timeout time.Duration) error {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Stop", ctx, containerID, timeout)
	ret0, _ := ret[0].(error)
	return ret0
}

// Stop indicates an expected call of Stop.
func (mr *MockDockerClientMockRecorder) Stop(ctx, containerID, timeout any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockDockerClient)(nil).Stop), ctx, containerID, timeout)
}
@@ -0,0 +1,11 @@

package mocks

import (
	"galaxy/rtmanager/internal/ports"
)

// Compile-time assertion that the generated mock satisfies the port
// interface. Future signature drift between the port and the generated
// file fails the build at this line, which is more actionable than a
// runtime check from a service test.
var _ ports.DockerClient = (*MockDockerClient)(nil)
@@ -0,0 +1,202 @@

// Package docker smoke tests exercise the production adapter against a
// real Docker daemon. The tests skip when no Docker socket is reachable
// (`skipUnlessDockerAvailable`), so they run in the default
// `go test ./...` pass without a build tag.
package docker

import (
	"context"
	"crypto/rand"
	"encoding/hex"
	"errors"
	"os"
	"testing"
	"time"

	"github.com/docker/docker/api/types/network"
	dockerclient "github.com/docker/docker/client"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"galaxy/rtmanager/internal/ports"
)

const (
	smokeImage     = "alpine:3.21"
	smokeNetPrefix = "rtmanager-smoke-"
)

func skipUnlessDockerAvailable(t *testing.T) {
	t.Helper()
	if os.Getenv("DOCKER_HOST") == "" {
		if _, err := os.Stat("/var/run/docker.sock"); err != nil {
			t.Skip("docker daemon not available; set DOCKER_HOST or expose /var/run/docker.sock")
		}
	}
}

func newSmokeAdapter(t *testing.T) (*Client, *dockerclient.Client) {
	t.Helper()

	docker, err := dockerclient.NewClientWithOpts(dockerclient.FromEnv, dockerclient.WithAPIVersionNegotiation())
	require.NoError(t, err)
	t.Cleanup(func() { _ = docker.Close() })

	pingCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if _, err := docker.Ping(pingCtx); err != nil {
		// A reachable socket path may still be unusable in sandboxed
		// environments (e.g., macOS sandbox blocking the colima socket).
		// The smoke test can only run when the daemon answers ping, so a
		// permission-denied / connection-refused error is a runtime
		// "Docker unavailable" signal and skips the test.
		t.Skipf("docker daemon unavailable: %v", err)
	}

	adapter, err := NewClient(Config{
		Docker:    docker,
		LogDriver: "json-file",
	})
	require.NoError(t, err)
	return adapter, docker
}

func uniqueSuffix(t *testing.T) string {
	t.Helper()
	buf := make([]byte, 4)
	_, err := rand.Read(buf)
	require.NoError(t, err)
	return hex.EncodeToString(buf)
}

// TestSmokeFullLifecycle runs the adapter through every method against
// the real Docker daemon: ensure-network → pull → run → events →
// stop → remove.
func TestSmokeFullLifecycle(t *testing.T) {
	skipUnlessDockerAvailable(t)

	adapter, docker := newSmokeAdapter(t)

	suffix := uniqueSuffix(t)
	netName := smokeNetPrefix + suffix
	containerName := "rtmanager-smoke-cont-" + suffix

	// Step 1 — provision a temporary user-defined bridge network.
	createCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	_, err := docker.NetworkCreate(createCtx, netName, network.CreateOptions{Driver: "bridge"})
	require.NoError(t, err)
	t.Cleanup(func() {
		removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer removeCancel()
		_ = docker.NetworkRemove(removeCtx, netName)
	})

	// Step 2 — EnsureNetwork present and missing paths.
	require.NoError(t, adapter.EnsureNetwork(createCtx, netName))
	missingErr := adapter.EnsureNetwork(createCtx, "rtmanager-smoke-missing-"+suffix)
	require.Error(t, missingErr)
	assert.ErrorIs(t, missingErr, ports.ErrNetworkMissing)

	// Step 3 — pull alpine via the configured policy.
	pullCtx, pullCancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer pullCancel()
	require.NoError(t, adapter.PullImage(pullCtx, smokeImage, ports.PullPolicyIfMissing))

	// Step 4 — subscribe to events before running the container so we
	// observe the start event.
	listenCtx, listenCancel := context.WithCancel(context.Background())
	defer listenCancel()
	events, listenErrs, err := adapter.EventsListen(listenCtx)
	require.NoError(t, err)

	// Step 5 — run a tiny container that sleeps so we can observe it.
	stateDir := t.TempDir()
	runCtx, runCancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer runCancel()
	result, err := adapter.Run(runCtx, ports.RunSpec{
		Name:     containerName,
		Image:    smokeImage,
		Hostname: "smoke-" + suffix,
		Network:  netName,
		Env: map[string]string{
			"GAME_STATE_PATH": "/tmp/state",
			"STORAGE_PATH":    "/tmp/state",
		},
		Labels: map[string]string{
			"com.galaxy.owner": "rtmanager",
			"com.galaxy.kind":  "smoke",
		},
		BindMounts: []ports.BindMount{
			{HostPath: stateDir, MountPath: "/tmp/state"},
		},
		LogDriver: "json-file",
		CPUQuota:  0.5,
		Memory:    "64m",
		PIDsLimit: 32,
		Cmd:       []string{"/bin/sh", "-c", "sleep 60"},
	})
	require.NoError(t, err)
	t.Cleanup(func() {
		removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer removeCancel()
		_ = adapter.Remove(removeCtx, result.ContainerID)
	})

	require.NotEmpty(t, result.ContainerID)
	require.Equal(t, "http://smoke-"+suffix+":8080", result.EngineEndpoint)

	// Step 6 — wait for a `start` event for the new container id.
	startObserved := waitForEvent(t, events, listenErrs, "start", result.ContainerID, 15*time.Second)
	require.True(t, startObserved, "did not observe start event for container %s", result.ContainerID)

	// Step 7 — InspectContainer returns running state.
	inspectCtx, inspectCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer inspectCancel()
	inspect, err := adapter.InspectContainer(inspectCtx, result.ContainerID)
	require.NoError(t, err)
	assert.Equal(t, "running", inspect.Status)

	// Step 8 — Stop, then Remove, then InspectContainer must report
	// not found.
	stopCtx, stopCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer stopCancel()
	require.NoError(t, adapter.Stop(stopCtx, result.ContainerID, 5*time.Second))

	require.NoError(t, adapter.Remove(stopCtx, result.ContainerID))

	if _, err := adapter.InspectContainer(stopCtx, result.ContainerID); !errors.Is(err, ports.ErrContainerNotFound) {
		t.Fatalf("expected ErrContainerNotFound, got %v", err)
	}

	// Step 9 — terminate the events subscription cleanly.
	listenCancel()
	select {
	case _, ok := <-events:
		_ = ok
	case <-time.After(5 * time.Second):
		t.Log("events channel did not close within timeout (best-effort)")
	}
}

func waitForEvent(t *testing.T, events <-chan ports.DockerEvent, errs <-chan error, action, containerID string, timeout time.Duration) bool {
	t.Helper()
	deadline := time.After(timeout)
	for {
		select {
		case ev, ok := <-events:
			if !ok {
				return false
			}
			if ev.Action == action && ev.ContainerID == containerID {
				return true
			}
		case err := <-errs:
			if err != nil {
				t.Fatalf("events stream error: %v", err)
			}
		case <-deadline:
			return false
		}
	}
}
@@ -0,0 +1,165 @@
|
||||
// Package healtheventspublisher provides the Redis-Streams-backed
|
||||
// publisher for `runtime:health_events`. Every Publish call upserts the
|
||||
// latest `health_snapshots` row before XADDing the event so consumers
|
||||
// observing the snapshot store can never lag the event stream by more
|
||||
// than the duration of one network call.
|
||||
//
|
||||
// The publisher is shared across `ports.HealthEventPublisher` callers:
|
||||
// the start service emits `container_started`; the probe, inspect, and
|
||||
// events-listener workers emit the rest. The publisher's surface is
|
||||
// stable across all of them.
|
||||
package healtheventspublisher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// emptyDetails is the canonical JSON payload installed when the caller
|
||||
// supplies an empty Details slice. Matches the SQL DEFAULT for
|
||||
// `health_snapshots.details`.
|
||||
const emptyDetails = "{}"
|
||||
|
||||
// Wire field names used by the Redis Streams payload. Frozen by
|
||||
// `rtmanager/api/runtime-health-asyncapi.yaml`; renaming any of them
|
||||
// breaks consumers.
|
||||
const (
|
||||
fieldGameID = "game_id"
|
||||
fieldContainerID = "container_id"
|
||||
fieldEventType = "event_type"
|
||||
fieldOccurredAtMS = "occurred_at_ms"
|
||||
fieldDetails = "details"
|
||||
)
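// For illustration only (example values, not authoritative), a published
// probe_failed event lands on the stream as a single entry whose fields are
// the constants above, flattened to strings:
//
//	game_id        "game-1"
//	container_id   "c-1"
//	event_type     "probe_failed"
//	occurred_at_ms "1714200000000"
//	details        "{\"consecutive_failures\":3}"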
|
||||
|
||||
// Config groups the dependencies and stream name required to construct
|
||||
// a Publisher.
|
||||
type Config struct {
|
||||
// Client appends entries to the Redis Stream. Must be non-nil.
|
||||
Client *redis.Client
|
||||
|
||||
// Snapshots upserts the latest health snapshot. Must be non-nil.
|
||||
Snapshots ports.HealthSnapshotStore
|
||||
|
||||
// Stream stores the Redis Stream key events are published to (e.g.
|
||||
// `runtime:health_events`). Must not be empty.
|
||||
Stream string
|
||||
}
|
||||
|
||||
// Publisher implements `ports.HealthEventPublisher` on top of a shared
|
||||
// Redis client and the production `health_snapshots` store.
|
||||
type Publisher struct {
|
||||
client *redis.Client
|
||||
snapshots ports.HealthSnapshotStore
|
||||
stream string
|
||||
}
|
||||
|
||||
// NewPublisher constructs one Publisher from cfg. Validation errors
|
||||
// surface the missing collaborator verbatim.
|
||||
func NewPublisher(cfg Config) (*Publisher, error) {
|
||||
if cfg.Client == nil {
|
||||
return nil, errors.New("new rtmanager health events publisher: nil redis client")
|
||||
}
|
||||
if cfg.Snapshots == nil {
|
||||
return nil, errors.New("new rtmanager health events publisher: nil snapshot store")
|
||||
}
|
||||
if cfg.Stream == "" {
|
||||
return nil, errors.New("new rtmanager health events publisher: stream must not be empty")
|
||||
}
|
||||
return &Publisher{
|
||||
client: cfg.Client,
|
||||
snapshots: cfg.Snapshots,
|
||||
stream: cfg.Stream,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Publish upserts the matching health_snapshots row and then XADDs the
|
||||
// envelope to the configured Redis Stream. Both side effects are
|
||||
// required; the snapshot upsert runs first so a successful Publish
|
||||
// always leaves the snapshot store at least as fresh as the stream.
|
||||
func (publisher *Publisher) Publish(ctx context.Context, envelope ports.HealthEventEnvelope) error {
|
||||
if publisher == nil || publisher.client == nil || publisher.snapshots == nil {
|
||||
return errors.New("publish health event: nil publisher")
|
||||
}
|
||||
if ctx == nil {
|
||||
return errors.New("publish health event: nil context")
|
||||
}
|
||||
if err := envelope.Validate(); err != nil {
|
||||
return fmt.Errorf("publish health event: %w", err)
|
||||
}
|
||||
|
||||
details := envelope.Details
|
||||
if len(details) == 0 {
|
||||
details = json.RawMessage(emptyDetails)
|
||||
}
|
||||
|
||||
status, source := snapshotMappingFor(envelope.EventType)
|
||||
snapshot := health.HealthSnapshot{
|
||||
GameID: envelope.GameID,
|
||||
ContainerID: envelope.ContainerID,
|
||||
Status: status,
|
||||
Source: source,
|
||||
Details: details,
|
||||
ObservedAt: envelope.OccurredAt.UTC(),
|
||||
}
|
||||
if err := publisher.snapshots.Upsert(ctx, snapshot); err != nil {
|
||||
return fmt.Errorf("publish health event: upsert snapshot: %w", err)
|
||||
}
|
||||
|
||||
occurredAtMS := envelope.OccurredAt.UTC().UnixMilli()
|
||||
values := map[string]any{
|
||||
fieldGameID: envelope.GameID,
|
||||
fieldContainerID: envelope.ContainerID,
|
||||
fieldEventType: string(envelope.EventType),
|
||||
fieldOccurredAtMS: strconv.FormatInt(occurredAtMS, 10),
|
||||
fieldDetails: string(details),
|
||||
}
|
||||
if err := publisher.client.XAdd(ctx, &redis.XAddArgs{
|
||||
Stream: publisher.stream,
|
||||
Values: values,
|
||||
}).Err(); err != nil {
|
||||
return fmt.Errorf("publish health event: xadd: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
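// A minimal caller sketch (hypothetical wiring; the real call sites live in
// the start service and the health workers):
//
//	publisher, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
//		Client:    redisClient,
//		Snapshots: snapshotStore,
//		Stream:    "runtime:health_events",
//	})
//	if err != nil { /* handle */ }
//	err = publisher.Publish(ctx, ports.HealthEventEnvelope{
//		GameID:      "game-1",
//		ContainerID: "c-1",
//		EventType:   health.EventTypeProbeFailed,
//		OccurredAt:  time.Now().UTC(),
//		Details:     json.RawMessage(`{"consecutive_failures":3}`),
//	})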
|
||||
|
||||
// snapshotMappingFor returns the SnapshotStatus and SnapshotSource that
|
||||
// match eventType per `rtmanager/README.md §Health Monitoring`.
|
||||
//
|
||||
// `container_started` is observed when the start service successfully
|
||||
// runs the container; the snapshot collapses it to `healthy`.
|
||||
// `probe_recovered` collapses to `healthy` per
|
||||
// `rtmanager/docs/domain-and-ports.md` §4: it does not have its own
|
||||
// snapshot status; the next observation overwrites the prior
|
||||
// `probe_failed` with `healthy`.
|
||||
func snapshotMappingFor(eventType health.EventType) (health.SnapshotStatus, health.SnapshotSource) {
|
||||
switch eventType {
|
||||
case health.EventTypeContainerStarted:
|
||||
return health.SnapshotStatusHealthy, health.SnapshotSourceDockerEvent
|
||||
case health.EventTypeContainerExited:
|
||||
return health.SnapshotStatusExited, health.SnapshotSourceDockerEvent
|
||||
case health.EventTypeContainerOOM:
|
||||
return health.SnapshotStatusOOM, health.SnapshotSourceDockerEvent
|
||||
case health.EventTypeContainerDisappeared:
|
||||
return health.SnapshotStatusContainerDisappeared, health.SnapshotSourceDockerEvent
|
||||
case health.EventTypeInspectUnhealthy:
|
||||
return health.SnapshotStatusInspectUnhealthy, health.SnapshotSourceInspect
|
||||
case health.EventTypeProbeFailed:
|
||||
return health.SnapshotStatusProbeFailed, health.SnapshotSourceProbe
|
||||
case health.EventTypeProbeRecovered:
|
||||
return health.SnapshotStatusHealthy, health.SnapshotSourceProbe
|
||||
default:
|
||||
return "", ""
|
||||
}
|
||||
}
|
||||
|
||||
// Compile-time assertion: Publisher implements
|
||||
// ports.HealthEventPublisher.
|
||||
var _ ports.HealthEventPublisher = (*Publisher)(nil)
|
||||
@@ -0,0 +1,197 @@
|
||||
package healtheventspublisher_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/healtheventspublisher"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// fakeSnapshots captures Upsert invocations for assertions.
|
||||
type fakeSnapshots struct {
|
||||
mu sync.Mutex
|
||||
upserts []health.HealthSnapshot
|
||||
upsertErr error
|
||||
}
|
||||
|
||||
func (s *fakeSnapshots) Upsert(_ context.Context, snapshot health.HealthSnapshot) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.upsertErr != nil {
|
||||
return s.upsertErr
|
||||
}
|
||||
s.upserts = append(s.upserts, snapshot)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeSnapshots) Get(_ context.Context, _ string) (health.HealthSnapshot, error) {
|
||||
return health.HealthSnapshot{}, nil
|
||||
}
|
||||
|
||||
func newPublisher(t *testing.T, snapshots ports.HealthSnapshotStore) (*healtheventspublisher.Publisher, *miniredis.Miniredis, *redis.Client) {
|
||||
t.Helper()
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
publisher, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
|
||||
Client: client,
|
||||
Snapshots: snapshots,
|
||||
Stream: "runtime:health_events",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return publisher, server, client
|
||||
}
|
||||
|
||||
func TestNewPublisherRejectsMissingCollaborators(t *testing.T) {
|
||||
_, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = healtheventspublisher.NewPublisher(healtheventspublisher.Config{
|
||||
Client: redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}),
|
||||
})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = healtheventspublisher.NewPublisher(healtheventspublisher.Config{
|
||||
Client: redis.NewClient(&redis.Options{Addr: "127.0.0.1:0"}),
|
||||
Snapshots: &fakeSnapshots{},
|
||||
})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestPublishContainerStartedUpsertsHealthyAndXAdds(t *testing.T) {
|
||||
snapshots := &fakeSnapshots{}
|
||||
publisher, _, client := newPublisher(t, snapshots)
|
||||
|
||||
occurredAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
envelope := ports.HealthEventEnvelope{
|
||||
GameID: "game-1",
|
||||
ContainerID: "c-1",
|
||||
EventType: health.EventTypeContainerStarted,
|
||||
OccurredAt: occurredAt,
|
||||
Details: json.RawMessage(`{"image_ref":"galaxy/game:1.2.3"}`),
|
||||
}
|
||||
require.NoError(t, publisher.Publish(context.Background(), envelope))
|
||||
|
||||
require.Len(t, snapshots.upserts, 1)
|
||||
snapshot := snapshots.upserts[0]
|
||||
assert.Equal(t, "game-1", snapshot.GameID)
|
||||
assert.Equal(t, "c-1", snapshot.ContainerID)
|
||||
assert.Equal(t, health.SnapshotStatusHealthy, snapshot.Status)
|
||||
assert.Equal(t, health.SnapshotSourceDockerEvent, snapshot.Source)
|
||||
assert.JSONEq(t, `{"image_ref":"galaxy/game:1.2.3"}`, string(snapshot.Details))
|
||||
assert.Equal(t, occurredAt, snapshot.ObservedAt)
|
||||
|
||||
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 1)
|
||||
values := entries[0].Values
|
||||
assert.Equal(t, "game-1", values["game_id"])
|
||||
assert.Equal(t, "c-1", values["container_id"])
|
||||
assert.Equal(t, "container_started", values["event_type"])
|
||||
assert.Equal(t, strconv.FormatInt(occurredAt.UnixMilli(), 10), values["occurred_at_ms"])
|
||||
assert.JSONEq(t, `{"image_ref":"galaxy/game:1.2.3"}`, values["details"].(string))
|
||||
}
|
||||
|
||||
func TestPublishMapsEveryEventTypeToASnapshot(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
eventType health.EventType
|
||||
expectStatus health.SnapshotStatus
|
||||
expectSource health.SnapshotSource
|
||||
}{
|
||||
{health.EventTypeContainerStarted, health.SnapshotStatusHealthy, health.SnapshotSourceDockerEvent},
|
||||
{health.EventTypeContainerExited, health.SnapshotStatusExited, health.SnapshotSourceDockerEvent},
|
||||
{health.EventTypeContainerOOM, health.SnapshotStatusOOM, health.SnapshotSourceDockerEvent},
|
||||
{health.EventTypeContainerDisappeared, health.SnapshotStatusContainerDisappeared, health.SnapshotSourceDockerEvent},
|
||||
{health.EventTypeInspectUnhealthy, health.SnapshotStatusInspectUnhealthy, health.SnapshotSourceInspect},
|
||||
{health.EventTypeProbeFailed, health.SnapshotStatusProbeFailed, health.SnapshotSourceProbe},
|
||||
{health.EventTypeProbeRecovered, health.SnapshotStatusHealthy, health.SnapshotSourceProbe},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(string(tc.eventType), func(t *testing.T) {
|
||||
t.Parallel()
|
||||
snapshots := &fakeSnapshots{}
|
||||
publisher, _, _ := newPublisher(t, snapshots)
|
||||
require.NoError(t, publisher.Publish(context.Background(), ports.HealthEventEnvelope{
|
||||
GameID: "g",
|
||||
ContainerID: "c",
|
||||
EventType: tc.eventType,
|
||||
OccurredAt: time.Now().UTC(),
|
||||
Details: json.RawMessage(`{}`),
|
||||
}))
|
||||
require.Len(t, snapshots.upserts, 1)
|
||||
assert.Equal(t, tc.expectStatus, snapshots.upserts[0].Status)
|
||||
assert.Equal(t, tc.expectSource, snapshots.upserts[0].Source)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestPublishEmptyDetailsBecomesEmptyObject(t *testing.T) {
|
||||
snapshots := &fakeSnapshots{}
|
||||
publisher, _, client := newPublisher(t, snapshots)
|
||||
|
||||
envelope := ports.HealthEventEnvelope{
|
||||
GameID: "g",
|
||||
ContainerID: "c",
|
||||
EventType: health.EventTypeContainerDisappeared,
|
||||
OccurredAt: time.Now().UTC(),
|
||||
}
|
||||
require.NoError(t, publisher.Publish(context.Background(), envelope))
|
||||
|
||||
require.Len(t, snapshots.upserts, 1)
|
||||
assert.JSONEq(t, "{}", string(snapshots.upserts[0].Details))
|
||||
|
||||
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 1)
|
||||
assert.JSONEq(t, "{}", entries[0].Values["details"].(string))
|
||||
}
|
||||
|
||||
func TestPublishRejectsInvalidEnvelope(t *testing.T) {
|
||||
snapshots := &fakeSnapshots{}
|
||||
publisher, _, client := newPublisher(t, snapshots)
|
||||
|
||||
require.Error(t, publisher.Publish(context.Background(), ports.HealthEventEnvelope{}))
|
||||
|
||||
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, entries)
|
||||
assert.Empty(t, snapshots.upserts)
|
||||
}
|
||||
|
||||
func TestPublishSurfacesSnapshotErrorWithoutXAdd(t *testing.T) {
|
||||
snapshots := &fakeSnapshots{upsertErr: assertSentinelErr}
|
||||
publisher, _, client := newPublisher(t, snapshots)
|
||||
|
||||
err := publisher.Publish(context.Background(), ports.HealthEventEnvelope{
|
||||
GameID: "g",
|
||||
ContainerID: "c",
|
||||
EventType: health.EventTypeContainerStarted,
|
||||
OccurredAt: time.Now().UTC(),
|
||||
Details: json.RawMessage(`{"image_ref":"x"}`),
|
||||
})
|
||||
require.Error(t, err)
|
||||
|
||||
entries, err := client.XRange(context.Background(), "runtime:health_events", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, entries, "xadd must not run when snapshot upsert fails")
|
||||
}
|
||||
|
||||
// assertSentinelErr is a sentinel for snapshot-failure assertions.
|
||||
var assertSentinelErr = sentinelError("snapshot upsert failure")
|
||||
|
||||
type sentinelError string
|
||||
|
||||
func (s sentinelError) Error() string { return string(s) }
|
||||
@@ -0,0 +1,100 @@
|
||||
// Package jobresultspublisher provides the Redis-Streams-backed
|
||||
// publisher for `runtime:job_results`. The start-jobs and stop-jobs
|
||||
// consumers call this adapter so every consumed envelope produces
|
||||
// exactly one outcome entry on the result stream.
|
||||
//
|
||||
// The wire fields mirror the AsyncAPI schema frozen in
|
||||
// `rtmanager/api/runtime-jobs-asyncapi.yaml`. Every field is XADDed
|
||||
// even when empty so consumers can rely on the schema's required-field
|
||||
// set.
|
||||
package jobresultspublisher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// Wire field names used by the Redis Streams payload. Frozen by
|
||||
// `rtmanager/api/runtime-jobs-asyncapi.yaml`; renaming any of them
|
||||
// breaks consumers.
|
||||
const (
|
||||
fieldGameID = "game_id"
|
||||
fieldOutcome = "outcome"
|
||||
fieldContainerID = "container_id"
|
||||
fieldEngineEndpoint = "engine_endpoint"
|
||||
fieldErrorCode = "error_code"
|
||||
fieldErrorMessage = "error_message"
|
||||
)
|
||||
|
||||
// Config groups the dependencies and stream name required to construct
|
||||
// a Publisher.
|
||||
type Config struct {
|
||||
// Client appends entries to the Redis Stream. Must be non-nil.
|
||||
Client *redis.Client
|
||||
|
||||
// Stream stores the Redis Stream key job results are published to
|
||||
// (e.g. `runtime:job_results`). Must not be empty.
|
||||
Stream string
|
||||
}
|
||||
|
||||
// Publisher implements `ports.JobResultPublisher` on top of a shared
|
||||
// Redis client.
|
||||
type Publisher struct {
|
||||
client *redis.Client
|
||||
stream string
|
||||
}
|
||||
|
||||
// NewPublisher constructs one Publisher from cfg. Validation errors
|
||||
// surface the missing collaborator verbatim.
|
||||
func NewPublisher(cfg Config) (*Publisher, error) {
|
||||
if cfg.Client == nil {
|
||||
return nil, errors.New("new rtmanager job results publisher: nil redis client")
|
||||
}
|
||||
if strings.TrimSpace(cfg.Stream) == "" {
|
||||
return nil, errors.New("new rtmanager job results publisher: stream must not be empty")
|
||||
}
|
||||
return &Publisher{
|
||||
client: cfg.Client,
|
||||
stream: cfg.Stream,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Publish XADDs result to the configured Redis Stream. The wire payload
|
||||
// includes every field declared as required by the AsyncAPI schema —
|
||||
// empty strings are kept so consumers always see the documented keys.
|
||||
func (publisher *Publisher) Publish(ctx context.Context, result ports.JobResult) error {
|
||||
if publisher == nil || publisher.client == nil {
|
||||
return errors.New("publish job result: nil publisher")
|
||||
}
|
||||
if ctx == nil {
|
||||
return errors.New("publish job result: nil context")
|
||||
}
|
||||
if err := result.Validate(); err != nil {
|
||||
return fmt.Errorf("publish job result: %w", err)
|
||||
}
|
||||
|
||||
values := map[string]any{
|
||||
fieldGameID: result.GameID,
|
||||
fieldOutcome: result.Outcome,
|
||||
fieldContainerID: result.ContainerID,
|
||||
fieldEngineEndpoint: result.EngineEndpoint,
|
||||
fieldErrorCode: result.ErrorCode,
|
||||
fieldErrorMessage: result.ErrorMessage,
|
||||
}
|
||||
if err := publisher.client.XAdd(ctx, &redis.XAddArgs{
|
||||
Stream: publisher.stream,
|
||||
Values: values,
|
||||
}).Err(); err != nil {
|
||||
return fmt.Errorf("publish job result: xadd: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
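// A minimal caller sketch (hypothetical wiring; the start-jobs and stop-jobs
// consumers own the real call sites):
//
//	publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
//		Client: redisClient,
//		Stream: "runtime:job_results",
//	})
//	if err != nil { /* handle */ }
//	err = publisher.Publish(ctx, ports.JobResult{
//		GameID:         "game-1",
//		Outcome:        ports.JobOutcomeSuccess,
//		ContainerID:    "c-1",
//		EngineEndpoint: "http://galaxy-game-game-1:8080",
//	})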
|
||||
|
||||
// Compile-time assertion: Publisher implements ports.JobResultPublisher.
|
||||
var _ ports.JobResultPublisher = (*Publisher)(nil)
|
||||
@@ -0,0 +1,142 @@
|
||||
package jobresultspublisher_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/jobresultspublisher"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newPublisher(t *testing.T) (*jobresultspublisher.Publisher, *redis.Client) {
|
||||
t.Helper()
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
|
||||
Client: client,
|
||||
Stream: "runtime:job_results",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return publisher, client
|
||||
}
|
||||
|
||||
func TestNewPublisherRejectsMissingCollaborators(t *testing.T) {
|
||||
_, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{})
|
||||
require.Error(t, err)
|
||||
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
_, err = jobresultspublisher.NewPublisher(jobresultspublisher.Config{Client: client})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = jobresultspublisher.NewPublisher(jobresultspublisher.Config{Client: client, Stream: " "})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestPublishRejectsInvalidResult(t *testing.T) {
|
||||
publisher, _ := newPublisher(t)
|
||||
|
||||
require.Error(t, publisher.Publish(context.Background(), ports.JobResult{}))
|
||||
require.Error(t, publisher.Publish(context.Background(), ports.JobResult{
|
||||
GameID: "game-1",
|
||||
Outcome: "weird",
|
||||
}))
|
||||
}
|
||||
|
||||
func TestPublishStartSuccessXAddsAllRequiredFields(t *testing.T) {
|
||||
publisher, client := newPublisher(t)
|
||||
|
||||
result := ports.JobResult{
|
||||
GameID: "game-1",
|
||||
Outcome: ports.JobOutcomeSuccess,
|
||||
ContainerID: "c-1",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
ErrorCode: "",
|
||||
ErrorMessage: "",
|
||||
}
|
||||
require.NoError(t, publisher.Publish(context.Background(), result))
|
||||
|
||||
entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 1)
|
||||
values := entries[0].Values
|
||||
assert.Equal(t, "game-1", values["game_id"])
|
||||
assert.Equal(t, "success", values["outcome"])
|
||||
assert.Equal(t, "c-1", values["container_id"])
|
||||
assert.Equal(t, "http://galaxy-game-game-1:8080", values["engine_endpoint"])
|
||||
assert.Equal(t, "", values["error_code"])
|
||||
assert.Equal(t, "", values["error_message"])
|
||||
}
|
||||
|
||||
func TestPublishFailureXAddsEmptyContainerAndEndpoint(t *testing.T) {
|
||||
publisher, client := newPublisher(t)
|
||||
|
||||
result := ports.JobResult{
|
||||
GameID: "game-2",
|
||||
Outcome: ports.JobOutcomeFailure,
|
||||
ErrorCode: "image_pull_failed",
|
||||
ErrorMessage: "manifest unknown",
|
||||
}
|
||||
require.NoError(t, publisher.Publish(context.Background(), result))
|
||||
|
||||
entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 1)
|
||||
values := entries[0].Values
|
||||
assert.Equal(t, "game-2", values["game_id"])
|
||||
assert.Equal(t, "failure", values["outcome"])
|
||||
assert.Equal(t, "", values["container_id"], "failure must publish empty container id")
|
||||
assert.Equal(t, "", values["engine_endpoint"], "failure must publish empty engine endpoint")
|
||||
assert.Equal(t, "image_pull_failed", values["error_code"])
|
||||
assert.Equal(t, "manifest unknown", values["error_message"])
|
||||
}
|
||||
|
||||
func TestPublishReplayNoOpKeepsContainerAndEndpoint(t *testing.T) {
|
||||
publisher, client := newPublisher(t)
|
||||
|
||||
result := ports.JobResult{
|
||||
GameID: "game-3",
|
||||
Outcome: ports.JobOutcomeSuccess,
|
||||
ContainerID: "c-3",
|
||||
EngineEndpoint: "http://galaxy-game-game-3:8080",
|
||||
ErrorCode: "replay_no_op",
|
||||
}
|
||||
require.NoError(t, publisher.Publish(context.Background(), result))
|
||||
|
||||
entries, err := client.XRange(context.Background(), "runtime:job_results", "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 1)
|
||||
values := entries[0].Values
|
||||
assert.Equal(t, "game-3", values["game_id"])
|
||||
assert.Equal(t, "success", values["outcome"])
|
||||
assert.Equal(t, "c-3", values["container_id"])
|
||||
assert.Equal(t, "http://galaxy-game-game-3:8080", values["engine_endpoint"])
|
||||
assert.Equal(t, "replay_no_op", values["error_code"])
|
||||
assert.Equal(t, "", values["error_message"])
|
||||
}
|
||||
|
||||
func TestPublishFailsOnClosedClient(t *testing.T) {
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
publisher, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
|
||||
Client: client,
|
||||
Stream: "runtime:job_results",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, client.Close())
|
||||
|
||||
err = publisher.Publish(context.Background(), ports.JobResult{
|
||||
GameID: "game-4",
|
||||
Outcome: ports.JobOutcomeSuccess,
|
||||
})
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,219 @@
|
||||
// Package lobbyclient provides the trusted-internal Lobby REST client
|
||||
// Runtime Manager uses to fetch ancillary game metadata for diagnostics.
|
||||
//
|
||||
// The client is intentionally minimal: the GetGame fetch is ancillary,
// diagnostics-only data because the start envelope already carries the only
// required field (`image_ref`). A transport failure surfaces as
// `ports.ErrLobbyUnavailable` so callers can distinguish it from "not found"
// and continue without aborting the start operation.
|
||||
package lobbyclient
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
)
|
||||
|
||||
const (
|
||||
getGamePathSuffix = "/api/v1/internal/games/%s"
|
||||
)
|
||||
|
||||
// Config configures one HTTP-backed Lobby internal client.
|
||||
type Config struct {
|
||||
// BaseURL stores the absolute base URL of the Lobby internal HTTP
|
||||
// listener (e.g. `http://lobby:8095`).
|
||||
BaseURL string
|
||||
|
||||
// RequestTimeout bounds one outbound lookup request.
|
||||
RequestTimeout time.Duration
|
||||
}
|
||||
|
||||
// Client resolves Lobby game records through the trusted internal HTTP
|
||||
// API.
|
||||
type Client struct {
|
||||
baseURL string
|
||||
requestTimeout time.Duration
|
||||
httpClient *http.Client
|
||||
closeIdleConnections func()
|
||||
}
|
||||
|
||||
type gameRecordEnvelope struct {
|
||||
GameID string `json:"game_id"`
|
||||
Status string `json:"status"`
|
||||
TargetEngineVersion string `json:"target_engine_version"`
|
||||
}
|
||||
|
||||
type errorEnvelope struct {
|
||||
Error *errorBody `json:"error"`
|
||||
}
|
||||
|
||||
type errorBody struct {
|
||||
Code string `json:"code"`
|
||||
Message string `json:"message"`
|
||||
}
|
||||
|
||||
// NewClient constructs a Lobby internal client that uses
|
||||
// repository-standard HTTP transport instrumentation through otelhttp.
|
||||
// The cloned default transport keeps the production wiring isolated
|
||||
// from caller-provided transports.
|
||||
func NewClient(cfg Config) (*Client, error) {
|
||||
transport, ok := http.DefaultTransport.(*http.Transport)
|
||||
if !ok {
|
||||
return nil, errors.New("new lobby internal client: default transport is not *http.Transport")
|
||||
}
|
||||
cloned := transport.Clone()
|
||||
return newClient(cfg, &http.Client{Transport: otelhttp.NewTransport(cloned)}, cloned.CloseIdleConnections)
|
||||
}
|
||||
|
||||
func newClient(cfg Config, httpClient *http.Client, closeIdleConnections func()) (*Client, error) {
|
||||
switch {
|
||||
case strings.TrimSpace(cfg.BaseURL) == "":
|
||||
return nil, errors.New("new lobby internal client: base URL must not be empty")
|
||||
case cfg.RequestTimeout <= 0:
|
||||
return nil, errors.New("new lobby internal client: request timeout must be positive")
|
||||
case httpClient == nil:
|
||||
return nil, errors.New("new lobby internal client: http client must not be nil")
|
||||
}
|
||||
|
||||
parsed, err := url.Parse(strings.TrimRight(strings.TrimSpace(cfg.BaseURL), "/"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new lobby internal client: parse base URL: %w", err)
|
||||
}
|
||||
if parsed.Scheme == "" || parsed.Host == "" {
|
||||
return nil, errors.New("new lobby internal client: base URL must be absolute")
|
||||
}
|
||||
|
||||
return &Client{
|
||||
baseURL: parsed.String(),
|
||||
requestTimeout: cfg.RequestTimeout,
|
||||
httpClient: httpClient,
|
||||
closeIdleConnections: closeIdleConnections,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Close releases idle HTTP connections owned by the client transport.
|
||||
// Call once on shutdown.
|
||||
func (client *Client) Close() error {
|
||||
if client == nil || client.closeIdleConnections == nil {
|
||||
return nil
|
||||
}
|
||||
client.closeIdleConnections()
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetGame returns the Lobby game record for gameID. It maps Lobby's
|
||||
// `404 not_found` to `ports.ErrLobbyGameNotFound`; every other failure
|
||||
// (transport, timeout, non-2xx response) maps to
|
||||
// `ports.ErrLobbyUnavailable` wrapped with the original error so callers
|
||||
// keep the diagnostic detail.
|
||||
func (client *Client) GetGame(ctx context.Context, gameID string) (ports.LobbyGameRecord, error) {
|
||||
if client == nil || client.httpClient == nil {
|
||||
return ports.LobbyGameRecord{}, errors.New("lobby get game: nil client")
|
||||
}
|
||||
if ctx == nil {
|
||||
return ports.LobbyGameRecord{}, errors.New("lobby get game: nil context")
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return ports.LobbyGameRecord{}, err
|
||||
}
|
||||
if strings.TrimSpace(gameID) == "" {
|
||||
return ports.LobbyGameRecord{}, errors.New("lobby get game: game id must not be empty")
|
||||
}
|
||||
|
||||
payload, statusCode, err := client.doRequest(ctx, http.MethodGet, fmt.Sprintf(getGamePathSuffix, url.PathEscape(gameID)))
|
||||
if err != nil {
|
||||
return ports.LobbyGameRecord{}, fmt.Errorf("%w: %w", ports.ErrLobbyUnavailable, err)
|
||||
}
|
||||
|
||||
switch statusCode {
|
||||
case http.StatusOK:
|
||||
var envelope gameRecordEnvelope
|
||||
if err := decodeJSONPayload(payload, &envelope); err != nil {
|
||||
return ports.LobbyGameRecord{}, fmt.Errorf("%w: decode success response: %w", ports.ErrLobbyUnavailable, err)
|
||||
}
|
||||
if strings.TrimSpace(envelope.GameID) == "" {
|
||||
return ports.LobbyGameRecord{}, fmt.Errorf("%w: success response missing game_id", ports.ErrLobbyUnavailable)
|
||||
}
|
||||
return ports.LobbyGameRecord{
|
||||
GameID: envelope.GameID,
|
||||
Status: envelope.Status,
|
||||
TargetEngineVersion: envelope.TargetEngineVersion,
|
||||
}, nil
|
||||
case http.StatusNotFound:
|
||||
return ports.LobbyGameRecord{}, ports.ErrLobbyGameNotFound
|
||||
default:
|
||||
errorCode := decodeErrorCode(payload)
|
||||
if errorCode != "" {
|
||||
return ports.LobbyGameRecord{}, fmt.Errorf("%w: unexpected status %d (error_code=%s)", ports.ErrLobbyUnavailable, statusCode, errorCode)
|
||||
}
|
||||
return ports.LobbyGameRecord{}, fmt.Errorf("%w: unexpected status %d", ports.ErrLobbyUnavailable, statusCode)
|
||||
}
|
||||
}
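// A minimal caller sketch (hypothetical; the timeout value is illustrative).
// The point is the error split: "not found" is a definitive answer, while
// "unavailable" is a transport-class fault callers may tolerate:
//
//	client, err := lobbyclient.NewClient(lobbyclient.Config{
//		BaseURL:        "http://lobby:8095",
//		RequestTimeout: 2 * time.Second,
//	})
//	if err != nil { /* handle */ }
//	record, err := client.GetGame(ctx, gameID)
//	switch {
//	case err == nil:
//		// use record.Status / record.TargetEngineVersion for diagnostics
//	case errors.Is(err, ports.ErrLobbyGameNotFound):
//		// definitive: Lobby has no such game
//	case errors.Is(err, ports.ErrLobbyUnavailable):
//		// transport fault: continue without the ancillary metadata
//	}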
|
||||
|
||||
func (client *Client) doRequest(ctx context.Context, method, requestPath string) ([]byte, int, error) {
|
||||
attemptCtx, cancel := context.WithTimeout(ctx, client.requestTimeout)
|
||||
defer cancel()
|
||||
|
||||
req, err := http.NewRequestWithContext(attemptCtx, method, client.baseURL+requestPath, nil)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := client.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("read response body: %w", err)
|
||||
}
|
||||
return body, resp.StatusCode, nil
|
||||
}
|
||||
|
||||
// decodeJSONPayload tolerantly decodes a JSON object; unknown fields
|
||||
// are ignored so additive Lobby schema changes do not break us.
|
||||
func decodeJSONPayload(payload []byte, target any) error {
|
||||
decoder := json.NewDecoder(bytes.NewReader(payload))
|
||||
if err := decoder.Decode(target); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := decoder.Decode(&struct{}{}); err != io.EOF {
|
||||
if err == nil {
|
||||
return errors.New("unexpected trailing JSON input")
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
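// For example, `{"game_id":"g","brand_new_field":1}` decodes cleanly (the
// unknown field is dropped), while trailing input such as `{} {}` is rejected
// as unexpected.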
|
||||
|
||||
func decodeErrorCode(payload []byte) string {
|
||||
if len(payload) == 0 {
|
||||
return ""
|
||||
}
|
||||
var envelope errorEnvelope
|
||||
if err := json.Unmarshal(payload, &envelope); err != nil {
|
||||
return ""
|
||||
}
|
||||
if envelope.Error == nil {
|
||||
return ""
|
||||
}
|
||||
return envelope.Error.Code
|
||||
}
|
||||
|
||||
// Compile-time assertion: Client implements ports.LobbyInternalClient.
|
||||
var _ ports.LobbyInternalClient = (*Client)(nil)
|
||||
@@ -0,0 +1,153 @@
|
||||
package lobbyclient
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
)
|
||||
|
||||
func newTestClient(t *testing.T, baseURL string, timeout time.Duration) *Client {
|
||||
t.Helper()
|
||||
client, err := NewClient(Config{BaseURL: baseURL, RequestTimeout: timeout})
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
return client
|
||||
}
|
||||
|
||||
func TestNewClientValidatesConfig(t *testing.T) {
|
||||
cases := map[string]Config{
|
||||
"empty base url": {BaseURL: "", RequestTimeout: time.Second},
|
||||
"non-absolute base url": {BaseURL: "lobby:8095", RequestTimeout: time.Second},
|
||||
"non-positive timeout": {BaseURL: "http://lobby:8095", RequestTimeout: 0},
|
||||
}
|
||||
for name, cfg := range cases {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
_, err := NewClient(cfg)
|
||||
require.Error(t, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetGameSuccess(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
require.Equal(t, http.MethodGet, r.Method)
|
||||
require.Equal(t, "/api/v1/internal/games/game-1", r.URL.Path)
|
||||
require.Equal(t, "application/json", r.Header.Get("Accept"))
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(`{
|
||||
"game_id": "game-1",
|
||||
"game_name": "Sample",
|
||||
"status": "running",
|
||||
"target_engine_version": "1.4.2",
|
||||
"current_turn": 0,
|
||||
"runtime_status": "running"
|
||||
}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := newTestClient(t, server.URL, time.Second)
|
||||
got, err := client.GetGame(context.Background(), "game-1")
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "game-1", got.GameID)
|
||||
assert.Equal(t, "running", got.Status)
|
||||
assert.Equal(t, "1.4.2", got.TargetEngineVersion)
|
||||
}
|
||||
|
||||
func TestGetGameNotFound(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
_, _ = w.Write([]byte(`{"error":{"code":"not_found","message":"no such game"}}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := newTestClient(t, server.URL, time.Second)
|
||||
_, err := client.GetGame(context.Background(), "missing")
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, ports.ErrLobbyGameNotFound))
|
||||
assert.False(t, errors.Is(err, ports.ErrLobbyUnavailable))
|
||||
}
|
||||
|
||||
func TestGetGameInternalErrorMapsToUnavailable(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
_, _ = w.Write([]byte(`{"error":{"code":"internal_error","message":"boom"}}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := newTestClient(t, server.URL, time.Second)
|
||||
_, err := client.GetGame(context.Background(), "x")
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
|
||||
assert.Contains(t, err.Error(), "500")
|
||||
assert.Contains(t, err.Error(), "internal_error")
|
||||
}
|
||||
|
||||
func TestGetGameTimeoutMapsToUnavailable(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
_, _ = w.Write([]byte(`{}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := newTestClient(t, server.URL, 50*time.Millisecond)
|
||||
_, err := client.GetGame(context.Background(), "x")
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
|
||||
}
|
||||
|
||||
func TestGetGameSuccessMissingGameIDIsUnavailable(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(`{"status":"running"}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := newTestClient(t, server.URL, time.Second)
|
||||
_, err := client.GetGame(context.Background(), "x")
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, ports.ErrLobbyUnavailable))
|
||||
assert.Contains(t, err.Error(), "missing game_id")
|
||||
}
|
||||
|
||||
func TestGetGameRejectsBadInput(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// The handler runs on a separate goroutine, so t.Fatal is not safe here;
// t.Error still fails the test if the Lobby is contacted.
t.Error("must not contact lobby on bad input")
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := newTestClient(t, server.URL, time.Second)
|
||||
t.Run("empty game id", func(t *testing.T) {
|
||||
_, err := client.GetGame(context.Background(), " ")
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "game id")
|
||||
})
|
||||
t.Run("canceled context", func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
_, err := client.GetGame(ctx, "x")
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, context.Canceled))
|
||||
})
|
||||
}
|
||||
|
||||
func TestCloseReleasesConnections(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(`{"game_id":"x","status":"running","target_engine_version":"1.0.0"}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
client := newTestClient(t, server.URL, time.Second)
|
||||
_, err := client.GetGame(context.Background(), "x")
|
||||
require.NoError(t, err)
|
||||
assert.NoError(t, client.Close())
|
||||
assert.NoError(t, client.Close()) // idempotent
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
// Package notificationpublisher provides the Redis-Streams-backed
|
||||
// notification-intent publisher Runtime Manager uses to emit admin-only
|
||||
// failure notifications. The adapter is a thin shim over
|
||||
// `galaxy/notificationintent.Publisher` that drops the entry id at the
|
||||
// wrapper boundary; rationale lives in
|
||||
// `rtmanager/docs/domain-and-ports.md §7`.
|
||||
package notificationpublisher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
)
|
||||
|
||||
// Config groups the dependencies and stream name required to
|
||||
// construct a Publisher.
|
||||
type Config struct {
|
||||
// Client appends entries to Redis Streams. Must be non-nil.
|
||||
Client *redis.Client
|
||||
|
||||
// Stream stores the Redis Stream key intents are published to.
|
||||
// When empty, `notificationintent.DefaultIntentsStream` is used.
|
||||
Stream string
|
||||
}
|
||||
|
||||
// Publisher implements `ports.NotificationIntentPublisher` on top of
|
||||
// the shared `notificationintent.Publisher`. The wrapper is the single
|
||||
// point that drops the entry id returned by the underlying publisher.
|
||||
type Publisher struct {
|
||||
inner *notificationintent.Publisher
|
||||
}
|
||||
|
||||
// NewPublisher constructs a Publisher from cfg. It wraps the shared
|
||||
// publisher and delegates validation; transport errors and validation
|
||||
// errors propagate verbatim.
|
||||
func NewPublisher(cfg Config) (*Publisher, error) {
|
||||
if cfg.Client == nil {
|
||||
return nil, errors.New("new rtmanager notification publisher: nil redis client")
|
||||
}
|
||||
inner, err := notificationintent.NewPublisher(notificationintent.PublisherConfig{
|
||||
Client: cfg.Client,
|
||||
Stream: cfg.Stream,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager notification publisher: %w", err)
|
||||
}
|
||||
return &Publisher{inner: inner}, nil
|
||||
}
|
||||
|
||||
// Publish forwards intent to the underlying notificationintent
|
||||
// publisher and discards the resulting Redis Stream entry id. A failed
|
||||
// publish surfaces as the underlying error.
|
||||
func (publisher *Publisher) Publish(ctx context.Context, intent notificationintent.Intent) error {
|
||||
if publisher == nil || publisher.inner == nil {
|
||||
return errors.New("publish notification intent: nil publisher")
|
||||
}
|
||||
if _, err := publisher.inner.Publish(ctx, intent); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
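// A minimal caller sketch (hypothetical; mirrors the adapter tests). The
// intent is built through the shared constructors so the payload shape stays
// validated in one place:
//
//	intent, err := notificationintent.NewRuntimeImagePullFailedIntent(
//		notificationintent.Metadata{
//			IdempotencyKey: "rtmanager:start:game-1:abc",
//			OccurredAt:     occurredAt,
//		},
//		notificationintent.RuntimeImagePullFailedPayload{
//			GameID:        "game-1",
//			ImageRef:      "galaxy/game:1.4.2",
//			ErrorCode:     "image_pull_failed",
//			ErrorMessage:  "registry timeout",
//			AttemptedAtMs: occurredAt.UnixMilli(),
//		},
//	)
//	if err != nil { /* handle */ }
//	err = publisher.Publish(ctx, intent)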
|
||||
|
||||
// Compile-time assertion: Publisher implements
|
||||
// ports.NotificationIntentPublisher.
|
||||
var _ ports.NotificationIntentPublisher = (*Publisher)(nil)
|
||||
@@ -0,0 +1,123 @@
|
||||
package notificationpublisher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
)
|
||||
|
||||
func newRedis(t *testing.T) (*redis.Client, *miniredis.Miniredis) {
|
||||
t.Helper()
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
return client, server
|
||||
}
|
||||
|
||||
func readStream(t *testing.T, client *redis.Client, stream string) []redis.XMessage {
|
||||
t.Helper()
|
||||
messages, err := client.XRange(context.Background(), stream, "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
return messages
|
||||
}
|
||||
|
||||
func TestNewPublisherValidation(t *testing.T) {
|
||||
t.Run("nil client", func(t *testing.T) {
|
||||
_, err := NewPublisher(Config{})
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "nil redis client")
|
||||
})
|
||||
}
|
||||
|
||||
func TestPublisherWritesIntent(t *testing.T) {
|
||||
client, _ := newRedis(t)
|
||||
|
||||
publisher, err := NewPublisher(Config{Client: client, Stream: "notification:intents"})
|
||||
require.NoError(t, err)
|
||||
|
||||
intent, err := notificationintent.NewRuntimeImagePullFailedIntent(
|
||||
notificationintent.Metadata{
|
||||
IdempotencyKey: "rtmanager:start:game-1:abc",
|
||||
OccurredAt: time.UnixMilli(1714200000000).UTC(),
|
||||
},
|
||||
notificationintent.RuntimeImagePullFailedPayload{
|
||||
GameID: "game-1",
|
||||
ImageRef: "galaxy/game:1.4.2",
|
||||
ErrorCode: "image_pull_failed",
|
||||
ErrorMessage: "registry timeout",
|
||||
AttemptedAtMs: 1714200000000,
|
||||
},
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, publisher.Publish(context.Background(), intent))
|
||||
|
||||
messages := readStream(t, client, "notification:intents")
|
||||
require.Len(t, messages, 1)
|
||||
|
||||
values := messages[0].Values
|
||||
assert.Equal(t, "runtime.image_pull_failed", values["notification_type"])
|
||||
assert.Equal(t, "runtime_manager", values["producer"])
|
||||
assert.Equal(t, "admin_email", values["audience_kind"])
|
||||
assert.Equal(t, "rtmanager:start:game-1:abc", values["idempotency_key"])
|
||||
|
||||
// recipient_user_ids_json must be absent for admin_email audience.
|
||||
_, hasRecipients := values["recipient_user_ids_json"]
|
||||
assert.False(t, hasRecipients)
|
||||
|
||||
payloadRaw, ok := values["payload_json"].(string)
|
||||
require.True(t, ok)
|
||||
var payload map[string]any
|
||||
require.NoError(t, json.Unmarshal([]byte(payloadRaw), &payload))
|
||||
assert.Equal(t, "game-1", payload["game_id"])
|
||||
assert.Equal(t, "galaxy/game:1.4.2", payload["image_ref"])
|
||||
}
|
||||
|
||||
func TestPublisherForwardsValidationError(t *testing.T) {
|
||||
client, _ := newRedis(t)
|
||||
publisher, err := NewPublisher(Config{Client: client})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Intent with a zero OccurredAt fails the shared validator.
|
||||
bad := notificationintent.Intent{
|
||||
NotificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
|
||||
Producer: notificationintent.ProducerRuntimeManager,
|
||||
AudienceKind: notificationintent.AudienceKindAdminEmail,
|
||||
IdempotencyKey: "k",
|
||||
PayloadJSON: `{"game_id":"g","image_ref":"r","error_code":"c","error_message":"m","attempted_at_ms":1}`,
|
||||
}
|
||||
require.Error(t, publisher.Publish(context.Background(), bad))
|
||||
}
|
||||
|
||||
func TestPublisherDefaultsStreamName(t *testing.T) {
|
||||
client, _ := newRedis(t)
|
||||
publisher, err := NewPublisher(Config{Client: client, Stream: ""})
|
||||
require.NoError(t, err)
|
||||
|
||||
intent, err := notificationintent.NewRuntimeContainerStartFailedIntent(
|
||||
notificationintent.Metadata{
|
||||
IdempotencyKey: "k",
|
||||
OccurredAt: time.UnixMilli(1714200000000).UTC(),
|
||||
},
|
||||
notificationintent.RuntimeContainerStartFailedPayload{
|
||||
GameID: "g",
|
||||
ImageRef: "r",
|
||||
ErrorCode: "container_start_failed",
|
||||
ErrorMessage: "boom",
|
||||
AttemptedAtMs: 1714200000000,
|
||||
},
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, publisher.Publish(context.Background(), intent))
|
||||
|
||||
messages := readStream(t, client, notificationintent.DefaultIntentsStream)
|
||||
require.Len(t, messages, 1)
|
||||
}
|
||||
@@ -0,0 +1,203 @@
|
||||
// Package healthsnapshotstore implements the PostgreSQL-backed adapter
|
||||
// for `ports.HealthSnapshotStore`.
|
||||
//
|
||||
// The package owns the on-disk shape of the `health_snapshots` table
|
||||
// defined in
|
||||
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`
|
||||
// and translates the schema-agnostic `ports.HealthSnapshotStore` interface
|
||||
// declared in `internal/ports/healthsnapshotstore.go` into concrete
|
||||
// go-jet/v2 statements driven by the pgx driver.
|
||||
//
|
||||
// The `details` jsonb column round-trips as a `json.RawMessage`. Empty
|
||||
// payloads are substituted with the SQL default `{}` on Upsert so the
|
||||
// CHECK constraints and downstream readers never observe a non-JSON
|
||||
// empty string.
|
||||
package healthsnapshotstore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/postgres/internal/sqlx"
|
||||
pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
pg "github.com/go-jet/jet/v2/postgres"
|
||||
)
|
||||
|
||||
// emptyDetails is the canonical jsonb payload installed when the caller
|
||||
// supplies an empty Details slice. It matches the SQL DEFAULT for the
|
||||
// column.
|
||||
const emptyDetails = "{}"
|
||||
|
||||
// Config configures one PostgreSQL-backed health-snapshot store instance.
|
||||
type Config struct {
|
||||
// DB stores the connection pool the store uses for every query.
|
||||
DB *sql.DB
|
||||
|
||||
// OperationTimeout bounds one round trip.
|
||||
OperationTimeout time.Duration
|
||||
}
|
||||
|
||||
// Store persists Runtime Manager health snapshots in PostgreSQL.
|
||||
type Store struct {
|
||||
db *sql.DB
|
||||
operationTimeout time.Duration
|
||||
}
|
||||
|
||||
// New constructs one PostgreSQL-backed health-snapshot store from cfg.
|
||||
func New(cfg Config) (*Store, error) {
|
||||
if cfg.DB == nil {
|
||||
return nil, errors.New("new postgres health snapshot store: db must not be nil")
|
||||
}
|
||||
if cfg.OperationTimeout <= 0 {
|
||||
return nil, errors.New("new postgres health snapshot store: operation timeout must be positive")
|
||||
}
|
||||
return &Store{
|
||||
db: cfg.DB,
|
||||
operationTimeout: cfg.OperationTimeout,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// healthSnapshotSelectColumns is the canonical SELECT list for the
|
||||
// health_snapshots table, matching scanSnapshot's column order.
|
||||
var healthSnapshotSelectColumns = pg.ColumnList{
|
||||
pgtable.HealthSnapshots.GameID,
|
||||
pgtable.HealthSnapshots.ContainerID,
|
||||
pgtable.HealthSnapshots.Status,
|
||||
pgtable.HealthSnapshots.Source,
|
||||
pgtable.HealthSnapshots.Details,
|
||||
pgtable.HealthSnapshots.ObservedAt,
|
||||
}
|
||||
|
||||
// Upsert installs snapshot as the latest observation for snapshot.GameID.
|
||||
// snapshot is validated through health.HealthSnapshot.Validate before the
|
||||
// SQL is issued.
|
||||
func (store *Store) Upsert(ctx context.Context, snapshot health.HealthSnapshot) error {
|
||||
if store == nil || store.db == nil {
|
||||
return errors.New("upsert health snapshot: nil store")
|
||||
}
|
||||
if err := snapshot.Validate(); err != nil {
|
||||
return fmt.Errorf("upsert health snapshot: %w", err)
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "upsert health snapshot", store.operationTimeout)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
details := emptyDetails
|
||||
if len(snapshot.Details) > 0 {
|
||||
details = string(snapshot.Details)
|
||||
}
|
||||
|
||||
stmt := pgtable.HealthSnapshots.INSERT(
|
||||
pgtable.HealthSnapshots.GameID,
|
||||
pgtable.HealthSnapshots.ContainerID,
|
||||
pgtable.HealthSnapshots.Status,
|
||||
pgtable.HealthSnapshots.Source,
|
||||
pgtable.HealthSnapshots.Details,
|
||||
pgtable.HealthSnapshots.ObservedAt,
|
||||
).VALUES(
|
||||
snapshot.GameID,
|
||||
snapshot.ContainerID,
|
||||
string(snapshot.Status),
|
||||
string(snapshot.Source),
|
||||
details,
|
||||
snapshot.ObservedAt.UTC(),
|
||||
).ON_CONFLICT(pgtable.HealthSnapshots.GameID).DO_UPDATE(
|
||||
pg.SET(
|
||||
pgtable.HealthSnapshots.ContainerID.SET(pgtable.HealthSnapshots.EXCLUDED.ContainerID),
|
||||
pgtable.HealthSnapshots.Status.SET(pgtable.HealthSnapshots.EXCLUDED.Status),
|
||||
pgtable.HealthSnapshots.Source.SET(pgtable.HealthSnapshots.EXCLUDED.Source),
|
||||
pgtable.HealthSnapshots.Details.SET(pgtable.HealthSnapshots.EXCLUDED.Details),
|
||||
pgtable.HealthSnapshots.ObservedAt.SET(pgtable.HealthSnapshots.EXCLUDED.ObservedAt),
|
||||
),
|
||||
)
|
||||
|
||||
query, args := stmt.Sql()
|
||||
if _, err := store.db.ExecContext(operationCtx, query, args...); err != nil {
|
||||
return fmt.Errorf("upsert health snapshot: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
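// For reference, the statement above renders to approximately the following
// SQL (exact text and schema qualification come from go-jet; placeholders
// elided):
//
//	INSERT INTO health_snapshots
//	    (game_id, container_id, status, source, details, observed_at)
//	VALUES (...)
//	ON CONFLICT (game_id) DO UPDATE SET
//	    container_id = EXCLUDED.container_id,
//	    status       = EXCLUDED.status,
//	    source       = EXCLUDED.source,
//	    details      = EXCLUDED.details,
//	    observed_at  = EXCLUDED.observed_at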
|
||||
|
||||
// Get returns the latest snapshot for gameID. It returns
|
||||
// runtime.ErrNotFound when no snapshot has been recorded yet.
|
||||
func (store *Store) Get(ctx context.Context, gameID string) (health.HealthSnapshot, error) {
|
||||
if store == nil || store.db == nil {
|
||||
return health.HealthSnapshot{}, errors.New("get health snapshot: nil store")
|
||||
}
|
||||
if strings.TrimSpace(gameID) == "" {
|
||||
return health.HealthSnapshot{}, errors.New("get health snapshot: game id must not be empty")
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "get health snapshot", store.operationTimeout)
|
||||
if err != nil {
|
||||
return health.HealthSnapshot{}, err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
stmt := pg.SELECT(healthSnapshotSelectColumns).
|
||||
FROM(pgtable.HealthSnapshots).
|
||||
WHERE(pgtable.HealthSnapshots.GameID.EQ(pg.String(gameID)))
|
||||
|
||||
query, args := stmt.Sql()
|
||||
row := store.db.QueryRowContext(operationCtx, query, args...)
|
||||
snapshot, err := scanSnapshot(row)
|
||||
if sqlx.IsNoRows(err) {
|
||||
return health.HealthSnapshot{}, runtime.ErrNotFound
|
||||
}
|
||||
if err != nil {
|
||||
return health.HealthSnapshot{}, fmt.Errorf("get health snapshot: %w", err)
|
||||
}
|
||||
return snapshot, nil
|
||||
}
|
||||
|
||||
// rowScanner abstracts *sql.Row and *sql.Rows so scanSnapshot can be
|
||||
// shared across both single-row reads and iterated reads.
|
||||
type rowScanner interface {
|
||||
Scan(dest ...any) error
|
||||
}
|
||||
|
||||
// scanSnapshot scans one health_snapshots row from rs.
|
||||
func scanSnapshot(rs rowScanner) (health.HealthSnapshot, error) {
|
||||
var (
|
||||
gameID string
|
||||
containerID string
|
||||
status string
|
||||
source string
|
||||
details []byte
|
||||
observedAt time.Time
|
||||
)
|
||||
if err := rs.Scan(
|
||||
&gameID,
|
||||
&containerID,
|
||||
&status,
|
||||
&source,
|
||||
&details,
|
||||
&observedAt,
|
||||
); err != nil {
|
||||
return health.HealthSnapshot{}, err
|
||||
}
|
||||
return health.HealthSnapshot{
|
||||
GameID: gameID,
|
||||
ContainerID: containerID,
|
||||
Status: health.SnapshotStatus(status),
|
||||
Source: health.SnapshotSource(source),
|
||||
Details: json.RawMessage(details),
|
||||
ObservedAt: observedAt.UTC(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Ensure Store satisfies the ports.HealthSnapshotStore interface at
|
||||
// compile time.
|
||||
var _ ports.HealthSnapshotStore = (*Store)(nil)
|
||||
@@ -0,0 +1,157 @@
|
||||
package healthsnapshotstore_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/internal/pgtest"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) { pgtest.RunMain(m) }
|
||||
|
||||
func newStore(t *testing.T) *healthsnapshotstore.Store {
|
||||
t.Helper()
|
||||
pgtest.TruncateAll(t)
|
||||
store, err := healthsnapshotstore.New(healthsnapshotstore.Config{
|
||||
DB: pgtest.Ensure(t).Pool(),
|
||||
OperationTimeout: pgtest.OperationTimeout,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return store
|
||||
}
|
||||
|
||||
func probeFailedSnapshot(gameID string, observedAt time.Time) health.HealthSnapshot {
|
||||
return health.HealthSnapshot{
|
||||
GameID: gameID,
|
||||
ContainerID: "container-1",
|
||||
Status: health.SnapshotStatusProbeFailed,
|
||||
Source: health.SnapshotSourceProbe,
|
||||
Details: json.RawMessage(`{"consecutive_failures":3,"last_status":503,"last_error":"timeout"}`),
|
||||
ObservedAt: observedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpsertAndGetRoundTrip(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
snapshot := probeFailedSnapshot("game-001",
|
||||
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
|
||||
require.NoError(t, store.Upsert(ctx, snapshot))
|
||||
|
||||
got, err := store.Get(ctx, "game-001")
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, snapshot.GameID, got.GameID)
|
||||
assert.Equal(t, snapshot.ContainerID, got.ContainerID)
|
||||
assert.Equal(t, snapshot.Status, got.Status)
|
||||
assert.Equal(t, snapshot.Source, got.Source)
|
||||
assert.JSONEq(t, string(snapshot.Details), string(got.Details))
|
||||
assert.True(t, snapshot.ObservedAt.Equal(got.ObservedAt))
|
||||
assert.Equal(t, time.UTC, got.ObservedAt.Location())
|
||||
}
|
||||
|
||||
func TestUpsertOverwritesPriorSnapshot(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
first := probeFailedSnapshot("game-001",
|
||||
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
|
||||
require.NoError(t, store.Upsert(ctx, first))
|
||||
|
||||
second := health.HealthSnapshot{
|
||||
GameID: "game-001",
|
||||
ContainerID: "container-2",
|
||||
Status: health.SnapshotStatusHealthy,
|
||||
Source: health.SnapshotSourceInspect,
|
||||
Details: json.RawMessage(`{"restart_count":0,"state":"running"}`),
|
||||
ObservedAt: first.ObservedAt.Add(time.Minute),
|
||||
}
|
||||
require.NoError(t, store.Upsert(ctx, second))
|
||||
|
||||
got, err := store.Get(ctx, "game-001")
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "container-2", got.ContainerID)
|
||||
assert.Equal(t, health.SnapshotStatusHealthy, got.Status)
|
||||
assert.Equal(t, health.SnapshotSourceInspect, got.Source)
|
||||
assert.JSONEq(t, string(second.Details), string(got.Details))
|
||||
assert.True(t, second.ObservedAt.Equal(got.ObservedAt))
|
||||
}
|
||||
|
||||
func TestGetReturnsNotFound(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
_, err := store.Get(ctx, "game-missing")
|
||||
require.ErrorIs(t, err, runtime.ErrNotFound)
|
||||
}
|
||||
|
||||
func TestUpsertEmptyDetailsRoundTripsAsEmptyObject(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
snapshot := probeFailedSnapshot("game-001",
|
||||
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
|
||||
snapshot.Details = nil
|
||||
require.NoError(t, store.Upsert(ctx, snapshot))
|
||||
|
||||
got, err := store.Get(ctx, "game-001")
|
||||
require.NoError(t, err)
|
||||
assert.JSONEq(t, "{}", string(got.Details),
|
||||
"empty json.RawMessage must round-trip as the SQL default {}, got %q",
|
||||
string(got.Details))
|
||||
}
|
||||
|
||||
func TestUpsertValidatesSnapshot(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*health.HealthSnapshot)
|
||||
}{
|
||||
{"empty game id", func(s *health.HealthSnapshot) { s.GameID = "" }},
|
||||
{"unknown status", func(s *health.HealthSnapshot) { s.Status = "exotic" }},
|
||||
{"unknown source", func(s *health.HealthSnapshot) { s.Source = "exotic" }},
|
||||
{"zero observed at", func(s *health.HealthSnapshot) { s.ObservedAt = time.Time{} }},
|
||||
{"invalid json details", func(s *health.HealthSnapshot) {
|
||||
s.Details = json.RawMessage("not json")
|
||||
}},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
snapshot := probeFailedSnapshot("game-001",
|
||||
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC))
|
||||
tt.mutate(&snapshot)
|
||||
err := store.Upsert(ctx, snapshot)
|
||||
require.Error(t, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetRejectsEmptyGameID(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
_, err := store.Get(ctx, "")
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestNewRejectsNilDB(t *testing.T) {
|
||||
_, err := healthsnapshotstore.New(healthsnapshotstore.Config{OperationTimeout: time.Second})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestNewRejectsNonPositiveTimeout(t *testing.T) {
|
||||
_, err := healthsnapshotstore.New(healthsnapshotstore.Config{
|
||||
DB: pgtest.Ensure(t).Pool(),
|
||||
})
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,209 @@
// Package pgtest exposes the testcontainers-backed PostgreSQL bootstrap
// shared by every Runtime Manager PG adapter test. The package is regular
// Go code — not a `_test.go` file — so it can be imported by the
// `_test.go` files in the three sibling store packages
// (`runtimerecordstore`, `operationlogstore`, `healthsnapshotstore`).
//
// No production code in `cmd/rtmanager` or in the runtime imports this
// package. The testcontainers-go dependency therefore stays out of the
// production binary's import graph.
package pgtest

import (
	"context"
	"database/sql"
	"net/url"
	"os"
	"sync"
	"testing"
	"time"

	"galaxy/postgres"
	"galaxy/rtmanager/internal/adapters/postgres/migrations"

	testcontainers "github.com/testcontainers/testcontainers-go"
	tcpostgres "github.com/testcontainers/testcontainers-go/modules/postgres"
	"github.com/testcontainers/testcontainers-go/wait"
)

const (
	postgresImage    = "postgres:16-alpine"
	superUser        = "galaxy"
	superPassword    = "galaxy"
	superDatabase    = "galaxy_rtmanager"
	serviceRole      = "rtmanagerservice"
	servicePassword  = "rtmanagerservice"
	serviceSchema    = "rtmanager"
	containerStartup = 90 * time.Second

	// OperationTimeout is the per-statement timeout used by every store
	// constructed via the per-package newStore helpers. Tests may pass a
	// smaller value if they need to assert deadline behaviour explicitly.
	OperationTimeout = 10 * time.Second
)

// Env holds the per-process container plus the *sql.DB pool already
// provisioned with the rtmanager schema, role, and migrations applied.
type Env struct {
	container *tcpostgres.PostgresContainer
	pool      *sql.DB
}

// Pool returns the shared pool. Tests truncate per-table state before
// each run via TruncateAll.
func (env *Env) Pool() *sql.DB { return env.pool }

var (
	once  sync.Once
	cur   *Env
	curEr error
)

// Ensure starts the PostgreSQL container on first invocation and applies
// the embedded goose migrations. Subsequent invocations reuse the same
// container/pool. When Docker is unavailable Ensure calls t.Skip with the
// underlying error so the test suite still passes on machines without
// Docker.
func Ensure(t testing.TB) *Env {
	t.Helper()
	once.Do(func() {
		cur, curEr = start()
	})
	if curEr != nil {
		t.Skipf("postgres container start failed (Docker unavailable?): %v", curEr)
	}
	return cur
}

// TruncateAll wipes every Runtime Manager table inside the shared pool,
// leaving the schema and indexes intact. Use it from each test that needs
// a clean slate.
func TruncateAll(t testing.TB) {
	t.Helper()
	env := Ensure(t)
	const stmt = `TRUNCATE TABLE runtime_records, operation_log, health_snapshots RESTART IDENTITY CASCADE`
	if _, err := env.pool.ExecContext(context.Background(), stmt); err != nil {
		t.Fatalf("truncate rtmanager tables: %v", err)
	}
}

// Shutdown terminates the shared container and closes the pool. It is
// invoked from each test package's TestMain after `m.Run` returns so the
// container is released even if individual tests panic.
func Shutdown() {
	if cur == nil {
		return
	}
	if cur.pool != nil {
		_ = cur.pool.Close()
	}
	if cur.container != nil {
		_ = testcontainers.TerminateContainer(cur.container)
	}
	cur = nil
}

// RunMain is a convenience helper for each store package's TestMain: it
// runs the test main, captures the exit code, shuts the container down,
// and exits. Wiring it through one helper keeps every TestMain to two
// lines.
func RunMain(m *testing.M) {
	code := m.Run()
	Shutdown()
	os.Exit(code)
}
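
// A minimal sketch of the per-package wiring the comment above describes,
// matching the TestMain and newStore helpers used by the sibling store test
// packages in this commit (the store package name is illustrative):
//
//	func TestMain(m *testing.M) { pgtest.RunMain(m) }
//
//	func newStore(t *testing.T) *somestore.Store {
//		t.Helper()
//		pgtest.TruncateAll(t)
//		store, err := somestore.New(somestore.Config{
//			DB:               pgtest.Ensure(t).Pool(),
//			OperationTimeout: pgtest.OperationTimeout,
//		})
//		require.NoError(t, err)
//		return store
//	}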

func start() (*Env, error) {
	ctx := context.Background()
	container, err := tcpostgres.Run(ctx, postgresImage,
		tcpostgres.WithDatabase(superDatabase),
		tcpostgres.WithUsername(superUser),
		tcpostgres.WithPassword(superPassword),
		testcontainers.WithWaitStrategy(
			wait.ForLog("database system is ready to accept connections").
				WithOccurrence(2).
				WithStartupTimeout(containerStartup),
		),
	)
	if err != nil {
		return nil, err
	}
	baseDSN, err := container.ConnectionString(ctx, "sslmode=disable")
	if err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	if err := provisionRoleAndSchema(ctx, baseDSN); err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	scopedDSN, err := dsnForServiceRole(baseDSN)
	if err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	cfg := postgres.DefaultConfig()
	cfg.PrimaryDSN = scopedDSN
	cfg.OperationTimeout = OperationTimeout
	pool, err := postgres.OpenPrimary(ctx, cfg)
	if err != nil {
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	if err := postgres.Ping(ctx, pool, OperationTimeout); err != nil {
		_ = pool.Close()
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	if err := postgres.RunMigrations(ctx, pool, migrations.FS(), "."); err != nil {
		_ = pool.Close()
		_ = testcontainers.TerminateContainer(container)
		return nil, err
	}
	return &Env{container: container, pool: pool}, nil
}

func provisionRoleAndSchema(ctx context.Context, baseDSN string) error {
	cfg := postgres.DefaultConfig()
	cfg.PrimaryDSN = baseDSN
	cfg.OperationTimeout = OperationTimeout
	db, err := postgres.OpenPrimary(ctx, cfg)
	if err != nil {
		return err
	}
	defer func() { _ = db.Close() }()

	statements := []string{
		`DO $$ BEGIN
			IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'rtmanagerservice') THEN
				CREATE ROLE rtmanagerservice LOGIN PASSWORD 'rtmanagerservice';
			END IF;
		END $$;`,
		`CREATE SCHEMA IF NOT EXISTS rtmanager AUTHORIZATION rtmanagerservice;`,
		`GRANT USAGE ON SCHEMA rtmanager TO rtmanagerservice;`,
	}
	for _, statement := range statements {
		if _, err := db.ExecContext(ctx, statement); err != nil {
			return err
		}
	}
	return nil
}

func dsnForServiceRole(baseDSN string) (string, error) {
	parsed, err := url.Parse(baseDSN)
	if err != nil {
		return "", err
	}
	values := url.Values{}
	values.Set("search_path", serviceSchema)
	values.Set("sslmode", "disable")
	scoped := url.URL{
		Scheme:   parsed.Scheme,
		User:     url.UserPassword(serviceRole, servicePassword),
		Host:     parsed.Host,
		Path:     parsed.Path,
		RawQuery: values.Encode(),
	}
	return scoped.String(), nil
}
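
// For illustration, the scoped DSN produced above takes roughly this shape;
// the host and port come from the transient container and differ per run:
//
//	postgres://rtmanagerservice:rtmanagerservice@127.0.0.1:55432/galaxy_rtmanager?search_path=rtmanager&sslmode=disable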
@@ -0,0 +1,112 @@
// Package sqlx contains the small set of helpers shared by every Runtime
// Manager PostgreSQL adapter (runtimerecordstore, operationlogstore,
// healthsnapshotstore). The helpers centralise the boundary translations
// for nullable timestamps and the pgx SQLSTATE codes the adapters
// interpret as domain conflicts.
package sqlx

import (
	"context"
	"database/sql"
	"errors"
	"fmt"
	"time"

	"github.com/jackc/pgx/v5/pgconn"
)

// PgUniqueViolationCode identifies the SQLSTATE returned by PostgreSQL
// when a UNIQUE constraint is violated by INSERT or UPDATE.
const PgUniqueViolationCode = "23505"

// IsUniqueViolation reports whether err is a PostgreSQL unique-violation,
// regardless of constraint name.
func IsUniqueViolation(err error) bool {
	var pgErr *pgconn.PgError
	if !errors.As(err, &pgErr) {
		return false
	}
	return pgErr.Code == PgUniqueViolationCode
}
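
// A hedged sketch of the "domain conflict" translation the package comment
// refers to, as a caller would write it; the sentinel error name is
// illustrative, not something this package defines:
//
//	if _, err := db.ExecContext(ctx, query, args...); sqlx.IsUniqueViolation(err) {
//		return domain.ErrConflict
//	}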

// IsNoRows reports whether err is sql.ErrNoRows.
func IsNoRows(err error) bool {
	return errors.Is(err, sql.ErrNoRows)
}

// NullableTime returns t.UTC() when non-zero, otherwise nil so the column
// is bound as SQL NULL.
func NullableTime(t time.Time) any {
	if t.IsZero() {
		return nil
	}
	return t.UTC()
}

// NullableTimePtr returns t.UTC() when t is non-nil and non-zero, otherwise
// nil. Companion of NullableTime for domain types that use *time.Time to
// express absent timestamps.
func NullableTimePtr(t *time.Time) any {
	if t == nil {
		return nil
	}
	return NullableTime(*t)
}

// NullableString returns value when non-empty, otherwise nil so the column
// is bound as SQL NULL. Used for Runtime Manager columns that map empty
// domain strings to NULL (current_container_id, current_image_ref).
func NullableString(value string) any {
	if value == "" {
		return nil
	}
	return value
}

// StringFromNullable copies an optional sql.NullString into a domain
// string. NULL becomes the empty string, matching the Runtime Manager
// domain convention that empty == NULL for nullable text columns.
func StringFromNullable(value sql.NullString) string {
	if !value.Valid {
		return ""
	}
	return value.String
}

// TimeFromNullable copies an optional sql.NullTime into a domain
// time.Time, applying the global UTC normalisation rule. NULL values
// become the zero time.Time.
func TimeFromNullable(value sql.NullTime) time.Time {
	if !value.Valid {
		return time.Time{}
	}
	return value.Time.UTC()
}

// TimePtrFromNullable copies an optional sql.NullTime into a domain
// *time.Time. NULL becomes nil; non-NULL values are wrapped after UTC
// normalisation.
func TimePtrFromNullable(value sql.NullTime) *time.Time {
	if !value.Valid {
		return nil
	}
	t := value.Time.UTC()
	return &t
}

// WithTimeout derives a child context bounded by timeout and prefixes
// context errors with operation. Callers must always invoke the returned
// cancel.
func WithTimeout(ctx context.Context, operation string, timeout time.Duration) (context.Context, context.CancelFunc, error) {
	if ctx == nil {
		return nil, nil, fmt.Errorf("%s: nil context", operation)
	}
	if err := ctx.Err(); err != nil {
		return nil, nil, fmt.Errorf("%s: %w", operation, err)
	}
	if timeout <= 0 {
		return nil, nil, fmt.Errorf("%s: operation timeout must be positive", operation)
	}
	bounded, cancel := context.WithTimeout(ctx, timeout)
	return bounded, cancel, nil
}
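
// For reference, a usage sketch mirroring how the store adapters in this
// commit call WithTimeout (identifiers shortened for illustration):
//
//	opCtx, cancel, err := sqlx.WithTimeout(ctx, "append operation log entry", store.operationTimeout)
//	if err != nil {
//		return 0, err
//	}
//	defer cancel()
//	row := store.db.QueryRowContext(opCtx, query, args...)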
@@ -0,0 +1,19 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package model
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
type GooseDbVersion struct {
|
||||
ID int32 `sql:"primary_key"`
|
||||
VersionID int64
|
||||
IsApplied bool
|
||||
Tstamp time.Time
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package model
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
type HealthSnapshots struct {
|
||||
GameID string `sql:"primary_key"`
|
||||
ContainerID string
|
||||
Status string
|
||||
Source string
|
||||
Details string
|
||||
ObservedAt time.Time
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package model
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
type OperationLog struct {
|
||||
ID int64 `sql:"primary_key"`
|
||||
GameID string
|
||||
OpKind string
|
||||
OpSource string
|
||||
SourceRef string
|
||||
ImageRef string
|
||||
ContainerID string
|
||||
Outcome string
|
||||
ErrorCode string
|
||||
ErrorMessage string
|
||||
StartedAt time.Time
|
||||
FinishedAt *time.Time
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package model
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
type RuntimeRecords struct {
|
||||
GameID string `sql:"primary_key"`
|
||||
Status string
|
||||
CurrentContainerID *string
|
||||
CurrentImageRef *string
|
||||
EngineEndpoint string
|
||||
StatePath string
|
||||
DockerNetwork string
|
||||
StartedAt *time.Time
|
||||
StoppedAt *time.Time
|
||||
RemovedAt *time.Time
|
||||
LastOpAt time.Time
|
||||
CreatedAt time.Time
|
||||
}
|
||||
@@ -0,0 +1,87 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package table
|
||||
|
||||
import (
|
||||
"github.com/go-jet/jet/v2/postgres"
|
||||
)
|
||||
|
||||
var GooseDbVersion = newGooseDbVersionTable("rtmanager", "goose_db_version", "")
|
||||
|
||||
type gooseDbVersionTable struct {
|
||||
postgres.Table
|
||||
|
||||
// Columns
|
||||
ID postgres.ColumnInteger
|
||||
VersionID postgres.ColumnInteger
|
||||
IsApplied postgres.ColumnBool
|
||||
Tstamp postgres.ColumnTimestamp
|
||||
|
||||
AllColumns postgres.ColumnList
|
||||
MutableColumns postgres.ColumnList
|
||||
DefaultColumns postgres.ColumnList
|
||||
}
|
||||
|
||||
type GooseDbVersionTable struct {
|
||||
gooseDbVersionTable
|
||||
|
||||
EXCLUDED gooseDbVersionTable
|
||||
}
|
||||
|
||||
// AS creates new GooseDbVersionTable with assigned alias
|
||||
func (a GooseDbVersionTable) AS(alias string) *GooseDbVersionTable {
|
||||
return newGooseDbVersionTable(a.SchemaName(), a.TableName(), alias)
|
||||
}
|
||||
|
||||
// Schema creates new GooseDbVersionTable with assigned schema name
|
||||
func (a GooseDbVersionTable) FromSchema(schemaName string) *GooseDbVersionTable {
|
||||
return newGooseDbVersionTable(schemaName, a.TableName(), a.Alias())
|
||||
}
|
||||
|
||||
// WithPrefix creates new GooseDbVersionTable with assigned table prefix
|
||||
func (a GooseDbVersionTable) WithPrefix(prefix string) *GooseDbVersionTable {
|
||||
return newGooseDbVersionTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
|
||||
}
|
||||
|
||||
// WithSuffix creates new GooseDbVersionTable with assigned table suffix
|
||||
func (a GooseDbVersionTable) WithSuffix(suffix string) *GooseDbVersionTable {
|
||||
return newGooseDbVersionTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
|
||||
}
|
||||
|
||||
func newGooseDbVersionTable(schemaName, tableName, alias string) *GooseDbVersionTable {
|
||||
return &GooseDbVersionTable{
|
||||
gooseDbVersionTable: newGooseDbVersionTableImpl(schemaName, tableName, alias),
|
||||
EXCLUDED: newGooseDbVersionTableImpl("", "excluded", ""),
|
||||
}
|
||||
}
|
||||
|
||||
func newGooseDbVersionTableImpl(schemaName, tableName, alias string) gooseDbVersionTable {
|
||||
var (
|
||||
IDColumn = postgres.IntegerColumn("id")
|
||||
VersionIDColumn = postgres.IntegerColumn("version_id")
|
||||
IsAppliedColumn = postgres.BoolColumn("is_applied")
|
||||
TstampColumn = postgres.TimestampColumn("tstamp")
|
||||
allColumns = postgres.ColumnList{IDColumn, VersionIDColumn, IsAppliedColumn, TstampColumn}
|
||||
mutableColumns = postgres.ColumnList{VersionIDColumn, IsAppliedColumn, TstampColumn}
|
||||
defaultColumns = postgres.ColumnList{TstampColumn}
|
||||
)
|
||||
|
||||
return gooseDbVersionTable{
|
||||
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
|
||||
|
||||
//Columns
|
||||
ID: IDColumn,
|
||||
VersionID: VersionIDColumn,
|
||||
IsApplied: IsAppliedColumn,
|
||||
Tstamp: TstampColumn,
|
||||
|
||||
AllColumns: allColumns,
|
||||
MutableColumns: mutableColumns,
|
||||
DefaultColumns: defaultColumns,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package table
|
||||
|
||||
import (
|
||||
"github.com/go-jet/jet/v2/postgres"
|
||||
)
|
||||
|
||||
var HealthSnapshots = newHealthSnapshotsTable("rtmanager", "health_snapshots", "")
|
||||
|
||||
type healthSnapshotsTable struct {
|
||||
postgres.Table
|
||||
|
||||
// Columns
|
||||
GameID postgres.ColumnString
|
||||
ContainerID postgres.ColumnString
|
||||
Status postgres.ColumnString
|
||||
Source postgres.ColumnString
|
||||
Details postgres.ColumnString
|
||||
ObservedAt postgres.ColumnTimestampz
|
||||
|
||||
AllColumns postgres.ColumnList
|
||||
MutableColumns postgres.ColumnList
|
||||
DefaultColumns postgres.ColumnList
|
||||
}
|
||||
|
||||
type HealthSnapshotsTable struct {
|
||||
healthSnapshotsTable
|
||||
|
||||
EXCLUDED healthSnapshotsTable
|
||||
}
|
||||
|
||||
// AS creates new HealthSnapshotsTable with assigned alias
|
||||
func (a HealthSnapshotsTable) AS(alias string) *HealthSnapshotsTable {
|
||||
return newHealthSnapshotsTable(a.SchemaName(), a.TableName(), alias)
|
||||
}
|
||||
|
||||
// Schema creates new HealthSnapshotsTable with assigned schema name
|
||||
func (a HealthSnapshotsTable) FromSchema(schemaName string) *HealthSnapshotsTable {
|
||||
return newHealthSnapshotsTable(schemaName, a.TableName(), a.Alias())
|
||||
}
|
||||
|
||||
// WithPrefix creates new HealthSnapshotsTable with assigned table prefix
|
||||
func (a HealthSnapshotsTable) WithPrefix(prefix string) *HealthSnapshotsTable {
|
||||
return newHealthSnapshotsTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
|
||||
}
|
||||
|
||||
// WithSuffix creates new HealthSnapshotsTable with assigned table suffix
|
||||
func (a HealthSnapshotsTable) WithSuffix(suffix string) *HealthSnapshotsTable {
|
||||
return newHealthSnapshotsTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
|
||||
}
|
||||
|
||||
func newHealthSnapshotsTable(schemaName, tableName, alias string) *HealthSnapshotsTable {
|
||||
return &HealthSnapshotsTable{
|
||||
healthSnapshotsTable: newHealthSnapshotsTableImpl(schemaName, tableName, alias),
|
||||
EXCLUDED: newHealthSnapshotsTableImpl("", "excluded", ""),
|
||||
}
|
||||
}
|
||||
|
||||
func newHealthSnapshotsTableImpl(schemaName, tableName, alias string) healthSnapshotsTable {
|
||||
var (
|
||||
GameIDColumn = postgres.StringColumn("game_id")
|
||||
ContainerIDColumn = postgres.StringColumn("container_id")
|
||||
StatusColumn = postgres.StringColumn("status")
|
||||
SourceColumn = postgres.StringColumn("source")
|
||||
DetailsColumn = postgres.StringColumn("details")
|
||||
ObservedAtColumn = postgres.TimestampzColumn("observed_at")
|
||||
allColumns = postgres.ColumnList{GameIDColumn, ContainerIDColumn, StatusColumn, SourceColumn, DetailsColumn, ObservedAtColumn}
|
||||
mutableColumns = postgres.ColumnList{ContainerIDColumn, StatusColumn, SourceColumn, DetailsColumn, ObservedAtColumn}
|
||||
defaultColumns = postgres.ColumnList{ContainerIDColumn, DetailsColumn}
|
||||
)
|
||||
|
||||
return healthSnapshotsTable{
|
||||
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
|
||||
|
||||
//Columns
|
||||
GameID: GameIDColumn,
|
||||
ContainerID: ContainerIDColumn,
|
||||
Status: StatusColumn,
|
||||
Source: SourceColumn,
|
||||
Details: DetailsColumn,
|
||||
ObservedAt: ObservedAtColumn,
|
||||
|
||||
AllColumns: allColumns,
|
||||
MutableColumns: mutableColumns,
|
||||
DefaultColumns: defaultColumns,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package table
|
||||
|
||||
import (
|
||||
"github.com/go-jet/jet/v2/postgres"
|
||||
)
|
||||
|
||||
var OperationLog = newOperationLogTable("rtmanager", "operation_log", "")
|
||||
|
||||
type operationLogTable struct {
|
||||
postgres.Table
|
||||
|
||||
// Columns
|
||||
ID postgres.ColumnInteger
|
||||
GameID postgres.ColumnString
|
||||
OpKind postgres.ColumnString
|
||||
OpSource postgres.ColumnString
|
||||
SourceRef postgres.ColumnString
|
||||
ImageRef postgres.ColumnString
|
||||
ContainerID postgres.ColumnString
|
||||
Outcome postgres.ColumnString
|
||||
ErrorCode postgres.ColumnString
|
||||
ErrorMessage postgres.ColumnString
|
||||
StartedAt postgres.ColumnTimestampz
|
||||
FinishedAt postgres.ColumnTimestampz
|
||||
|
||||
AllColumns postgres.ColumnList
|
||||
MutableColumns postgres.ColumnList
|
||||
DefaultColumns postgres.ColumnList
|
||||
}
|
||||
|
||||
type OperationLogTable struct {
|
||||
operationLogTable
|
||||
|
||||
EXCLUDED operationLogTable
|
||||
}
|
||||
|
||||
// AS creates new OperationLogTable with assigned alias
|
||||
func (a OperationLogTable) AS(alias string) *OperationLogTable {
|
||||
return newOperationLogTable(a.SchemaName(), a.TableName(), alias)
|
||||
}
|
||||
|
||||
// Schema creates new OperationLogTable with assigned schema name
|
||||
func (a OperationLogTable) FromSchema(schemaName string) *OperationLogTable {
|
||||
return newOperationLogTable(schemaName, a.TableName(), a.Alias())
|
||||
}
|
||||
|
||||
// WithPrefix creates new OperationLogTable with assigned table prefix
|
||||
func (a OperationLogTable) WithPrefix(prefix string) *OperationLogTable {
|
||||
return newOperationLogTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
|
||||
}
|
||||
|
||||
// WithSuffix creates new OperationLogTable with assigned table suffix
|
||||
func (a OperationLogTable) WithSuffix(suffix string) *OperationLogTable {
|
||||
return newOperationLogTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
|
||||
}
|
||||
|
||||
func newOperationLogTable(schemaName, tableName, alias string) *OperationLogTable {
|
||||
return &OperationLogTable{
|
||||
operationLogTable: newOperationLogTableImpl(schemaName, tableName, alias),
|
||||
EXCLUDED: newOperationLogTableImpl("", "excluded", ""),
|
||||
}
|
||||
}
|
||||
|
||||
func newOperationLogTableImpl(schemaName, tableName, alias string) operationLogTable {
|
||||
var (
|
||||
IDColumn = postgres.IntegerColumn("id")
|
||||
GameIDColumn = postgres.StringColumn("game_id")
|
||||
OpKindColumn = postgres.StringColumn("op_kind")
|
||||
OpSourceColumn = postgres.StringColumn("op_source")
|
||||
SourceRefColumn = postgres.StringColumn("source_ref")
|
||||
ImageRefColumn = postgres.StringColumn("image_ref")
|
||||
ContainerIDColumn = postgres.StringColumn("container_id")
|
||||
OutcomeColumn = postgres.StringColumn("outcome")
|
||||
ErrorCodeColumn = postgres.StringColumn("error_code")
|
||||
ErrorMessageColumn = postgres.StringColumn("error_message")
|
||||
StartedAtColumn = postgres.TimestampzColumn("started_at")
|
||||
FinishedAtColumn = postgres.TimestampzColumn("finished_at")
|
||||
allColumns = postgres.ColumnList{IDColumn, GameIDColumn, OpKindColumn, OpSourceColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, OutcomeColumn, ErrorCodeColumn, ErrorMessageColumn, StartedAtColumn, FinishedAtColumn}
|
||||
mutableColumns = postgres.ColumnList{GameIDColumn, OpKindColumn, OpSourceColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, OutcomeColumn, ErrorCodeColumn, ErrorMessageColumn, StartedAtColumn, FinishedAtColumn}
|
||||
defaultColumns = postgres.ColumnList{IDColumn, SourceRefColumn, ImageRefColumn, ContainerIDColumn, ErrorCodeColumn, ErrorMessageColumn}
|
||||
)
|
||||
|
||||
return operationLogTable{
|
||||
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
|
||||
|
||||
//Columns
|
||||
ID: IDColumn,
|
||||
GameID: GameIDColumn,
|
||||
OpKind: OpKindColumn,
|
||||
OpSource: OpSourceColumn,
|
||||
SourceRef: SourceRefColumn,
|
||||
ImageRef: ImageRefColumn,
|
||||
ContainerID: ContainerIDColumn,
|
||||
Outcome: OutcomeColumn,
|
||||
ErrorCode: ErrorCodeColumn,
|
||||
ErrorMessage: ErrorMessageColumn,
|
||||
StartedAt: StartedAtColumn,
|
||||
FinishedAt: FinishedAtColumn,
|
||||
|
||||
AllColumns: allColumns,
|
||||
MutableColumns: mutableColumns,
|
||||
DefaultColumns: defaultColumns,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package table
|
||||
|
||||
import (
|
||||
"github.com/go-jet/jet/v2/postgres"
|
||||
)
|
||||
|
||||
var RuntimeRecords = newRuntimeRecordsTable("rtmanager", "runtime_records", "")
|
||||
|
||||
type runtimeRecordsTable struct {
|
||||
postgres.Table
|
||||
|
||||
// Columns
|
||||
GameID postgres.ColumnString
|
||||
Status postgres.ColumnString
|
||||
CurrentContainerID postgres.ColumnString
|
||||
CurrentImageRef postgres.ColumnString
|
||||
EngineEndpoint postgres.ColumnString
|
||||
StatePath postgres.ColumnString
|
||||
DockerNetwork postgres.ColumnString
|
||||
StartedAt postgres.ColumnTimestampz
|
||||
StoppedAt postgres.ColumnTimestampz
|
||||
RemovedAt postgres.ColumnTimestampz
|
||||
LastOpAt postgres.ColumnTimestampz
|
||||
CreatedAt postgres.ColumnTimestampz
|
||||
|
||||
AllColumns postgres.ColumnList
|
||||
MutableColumns postgres.ColumnList
|
||||
DefaultColumns postgres.ColumnList
|
||||
}
|
||||
|
||||
type RuntimeRecordsTable struct {
|
||||
runtimeRecordsTable
|
||||
|
||||
EXCLUDED runtimeRecordsTable
|
||||
}
|
||||
|
||||
// AS creates new RuntimeRecordsTable with assigned alias
|
||||
func (a RuntimeRecordsTable) AS(alias string) *RuntimeRecordsTable {
|
||||
return newRuntimeRecordsTable(a.SchemaName(), a.TableName(), alias)
|
||||
}
|
||||
|
||||
// Schema creates new RuntimeRecordsTable with assigned schema name
|
||||
func (a RuntimeRecordsTable) FromSchema(schemaName string) *RuntimeRecordsTable {
|
||||
return newRuntimeRecordsTable(schemaName, a.TableName(), a.Alias())
|
||||
}
|
||||
|
||||
// WithPrefix creates new RuntimeRecordsTable with assigned table prefix
|
||||
func (a RuntimeRecordsTable) WithPrefix(prefix string) *RuntimeRecordsTable {
|
||||
return newRuntimeRecordsTable(a.SchemaName(), prefix+a.TableName(), a.TableName())
|
||||
}
|
||||
|
||||
// WithSuffix creates new RuntimeRecordsTable with assigned table suffix
|
||||
func (a RuntimeRecordsTable) WithSuffix(suffix string) *RuntimeRecordsTable {
|
||||
return newRuntimeRecordsTable(a.SchemaName(), a.TableName()+suffix, a.TableName())
|
||||
}
|
||||
|
||||
func newRuntimeRecordsTable(schemaName, tableName, alias string) *RuntimeRecordsTable {
|
||||
return &RuntimeRecordsTable{
|
||||
runtimeRecordsTable: newRuntimeRecordsTableImpl(schemaName, tableName, alias),
|
||||
EXCLUDED: newRuntimeRecordsTableImpl("", "excluded", ""),
|
||||
}
|
||||
}
|
||||
|
||||
func newRuntimeRecordsTableImpl(schemaName, tableName, alias string) runtimeRecordsTable {
|
||||
var (
|
||||
GameIDColumn = postgres.StringColumn("game_id")
|
||||
StatusColumn = postgres.StringColumn("status")
|
||||
CurrentContainerIDColumn = postgres.StringColumn("current_container_id")
|
||||
CurrentImageRefColumn = postgres.StringColumn("current_image_ref")
|
||||
EngineEndpointColumn = postgres.StringColumn("engine_endpoint")
|
||||
StatePathColumn = postgres.StringColumn("state_path")
|
||||
DockerNetworkColumn = postgres.StringColumn("docker_network")
|
||||
StartedAtColumn = postgres.TimestampzColumn("started_at")
|
||||
StoppedAtColumn = postgres.TimestampzColumn("stopped_at")
|
||||
RemovedAtColumn = postgres.TimestampzColumn("removed_at")
|
||||
LastOpAtColumn = postgres.TimestampzColumn("last_op_at")
|
||||
CreatedAtColumn = postgres.TimestampzColumn("created_at")
|
||||
allColumns = postgres.ColumnList{GameIDColumn, StatusColumn, CurrentContainerIDColumn, CurrentImageRefColumn, EngineEndpointColumn, StatePathColumn, DockerNetworkColumn, StartedAtColumn, StoppedAtColumn, RemovedAtColumn, LastOpAtColumn, CreatedAtColumn}
|
||||
mutableColumns = postgres.ColumnList{StatusColumn, CurrentContainerIDColumn, CurrentImageRefColumn, EngineEndpointColumn, StatePathColumn, DockerNetworkColumn, StartedAtColumn, StoppedAtColumn, RemovedAtColumn, LastOpAtColumn, CreatedAtColumn}
|
||||
defaultColumns = postgres.ColumnList{}
|
||||
)
|
||||
|
||||
return runtimeRecordsTable{
|
||||
Table: postgres.NewTable(schemaName, tableName, alias, allColumns...),
|
||||
|
||||
//Columns
|
||||
GameID: GameIDColumn,
|
||||
Status: StatusColumn,
|
||||
CurrentContainerID: CurrentContainerIDColumn,
|
||||
CurrentImageRef: CurrentImageRefColumn,
|
||||
EngineEndpoint: EngineEndpointColumn,
|
||||
StatePath: StatePathColumn,
|
||||
DockerNetwork: DockerNetworkColumn,
|
||||
StartedAt: StartedAtColumn,
|
||||
StoppedAt: StoppedAtColumn,
|
||||
RemovedAt: RemovedAtColumn,
|
||||
LastOpAt: LastOpAtColumn,
|
||||
CreatedAt: CreatedAtColumn,
|
||||
|
||||
AllColumns: allColumns,
|
||||
MutableColumns: mutableColumns,
|
||||
DefaultColumns: defaultColumns,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
//
|
||||
// Code generated by go-jet DO NOT EDIT.
|
||||
//
|
||||
// WARNING: Changes to this file may cause incorrect behavior
|
||||
// and will be lost if the code is regenerated
|
||||
//
|
||||
|
||||
package table
|
||||
|
||||
// UseSchema sets a new schema name for all generated table SQL builder types. It is recommended to invoke
|
||||
// this method only once at the beginning of the program.
|
||||
func UseSchema(schema string) {
|
||||
GooseDbVersion = GooseDbVersion.FromSchema(schema)
|
||||
HealthSnapshots = HealthSnapshots.FromSchema(schema)
|
||||
OperationLog = OperationLog.FromSchema(schema)
|
||||
RuntimeRecords = RuntimeRecords.FromSchema(schema)
|
||||
}
|
||||
@@ -0,0 +1,106 @@
|
||||
-- +goose Up
|
||||
-- Initial Runtime Manager PostgreSQL schema.
|
||||
--
|
||||
-- Three tables cover the durable surface of the service:
|
||||
-- * runtime_records — one row per game with the latest known runtime
|
||||
-- status and Docker container binding;
|
||||
-- * operation_log — append-only audit of every start/stop/restart/
|
||||
-- patch/cleanup/reconcile_* operation RTM performed;
|
||||
-- * health_snapshots — latest technical health observation per game.
|
||||
--
|
||||
-- Schema and the matching `rtmanagerservice` role are provisioned
|
||||
-- outside this script (in tests via cmd/jetgen/main.go::provisionRoleAndSchema;
|
||||
-- in production via an ops init script). This migration runs as the
|
||||
-- schema owner with `search_path=rtmanager` and only contains DDL for the
|
||||
-- service-owned tables and indexes. ARCHITECTURE.md §Database topology
|
||||
-- mandates that the per-service role's grants stay restricted to its own
|
||||
-- schema; consequently this file deliberately deviates from PLAN.md
|
||||
-- Stage 09's literal `CREATE SCHEMA IF NOT EXISTS rtmanager;` instruction.
|
||||
|
||||
-- runtime_records holds one durable record per game with the latest
|
||||
-- known runtime status and Docker container binding. The status enum
|
||||
-- (running | stopped | removed) is enforced by a CHECK so domain code
|
||||
-- can rely on it without reading every callsite. The (status, last_op_at)
|
||||
-- index drives the periodic container-cleanup worker that scans
|
||||
-- `status='stopped' AND last_op_at < now() - retention`.
|
||||
CREATE TABLE runtime_records (
|
||||
game_id text PRIMARY KEY,
|
||||
status text NOT NULL,
|
||||
current_container_id text,
|
||||
current_image_ref text,
|
||||
engine_endpoint text NOT NULL,
|
||||
state_path text NOT NULL,
|
||||
docker_network text NOT NULL,
|
||||
started_at timestamptz,
|
||||
stopped_at timestamptz,
|
||||
removed_at timestamptz,
|
||||
last_op_at timestamptz NOT NULL,
|
||||
created_at timestamptz NOT NULL,
|
||||
CONSTRAINT runtime_records_status_chk
|
||||
CHECK (status IN ('running', 'stopped', 'removed'))
|
||||
);
|
||||
|
||||
CREATE INDEX runtime_records_status_last_op_idx
|
||||
ON runtime_records (status, last_op_at);
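
-- For illustration, the cleanup scan this index serves has roughly the
-- following shape; the actual worker query lives in the service layer and
-- the retention interval shown here is a placeholder, not the configured value:
--   SELECT game_id FROM runtime_records
--   WHERE status = 'stopped' AND last_op_at < now() - interval '24 hours'
--   ORDER BY last_op_at;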
|
||||
|
||||
-- operation_log is an append-only audit of every operation Runtime
|
||||
-- Manager performed against a game's runtime. The (game_id, started_at
|
||||
-- DESC) index drives audit reads from the GM/Admin REST surface;
|
||||
-- finished_at is nullable for in-flight rows even though Stage 13+
|
||||
-- always finalises the row in the same transaction. The op_kind /
|
||||
-- op_source / outcome enums are enforced by CHECK constraints to keep
|
||||
-- the audit schema honest without a separate Go validator.
|
||||
CREATE TABLE operation_log (
|
||||
id bigserial PRIMARY KEY,
|
||||
game_id text NOT NULL,
|
||||
op_kind text NOT NULL,
|
||||
op_source text NOT NULL,
|
||||
source_ref text NOT NULL DEFAULT '',
|
||||
image_ref text NOT NULL DEFAULT '',
|
||||
container_id text NOT NULL DEFAULT '',
|
||||
outcome text NOT NULL,
|
||||
error_code text NOT NULL DEFAULT '',
|
||||
error_message text NOT NULL DEFAULT '',
|
||||
started_at timestamptz NOT NULL,
|
||||
finished_at timestamptz,
|
||||
CONSTRAINT operation_log_op_kind_chk
|
||||
CHECK (op_kind IN (
|
||||
'start', 'stop', 'restart', 'patch',
|
||||
'cleanup_container', 'reconcile_adopt', 'reconcile_dispose'
|
||||
)),
|
||||
CONSTRAINT operation_log_op_source_chk
|
||||
CHECK (op_source IN (
|
||||
'lobby_stream', 'gm_rest', 'admin_rest',
|
||||
'auto_ttl', 'auto_reconcile'
|
||||
)),
|
||||
CONSTRAINT operation_log_outcome_chk
|
||||
CHECK (outcome IN ('success', 'failure'))
|
||||
);
|
||||
|
||||
CREATE INDEX operation_log_game_started_idx
|
||||
ON operation_log (game_id, started_at DESC);
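
-- For illustration, the audit read served by this index mirrors the
-- ListByGame query in the operationlogstore adapter (the limit value is
-- arbitrary):
--   SELECT * FROM operation_log
--   WHERE game_id = $1
--   ORDER BY started_at DESC, id DESC
--   LIMIT 50;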
|
||||
|
||||
-- health_snapshots stores the latest technical health observation per
|
||||
-- game. One row per game; later observations overwrite. The status enum
|
||||
-- mirrors the `event_type` vocabulary on `runtime:health_events`
|
||||
-- (collapsed to a flat status column for the latest-observation view).
|
||||
CREATE TABLE health_snapshots (
|
||||
game_id text PRIMARY KEY,
|
||||
container_id text NOT NULL DEFAULT '',
|
||||
status text NOT NULL,
|
||||
source text NOT NULL,
|
||||
details jsonb NOT NULL DEFAULT '{}'::jsonb,
|
||||
observed_at timestamptz NOT NULL,
|
||||
CONSTRAINT health_snapshots_status_chk
|
||||
CHECK (status IN (
|
||||
'healthy', 'probe_failed', 'exited',
|
||||
'oom', 'inspect_unhealthy', 'container_disappeared'
|
||||
)),
|
||||
CONSTRAINT health_snapshots_source_chk
|
||||
CHECK (source IN ('docker_event', 'inspect', 'probe'))
|
||||
);
|
||||
|
||||
-- +goose Down
|
||||
DROP TABLE IF EXISTS health_snapshots;
|
||||
DROP TABLE IF EXISTS operation_log;
|
||||
DROP TABLE IF EXISTS runtime_records;
|
||||
@@ -0,0 +1,19 @@
// Package migrations exposes the embedded goose migration files used by
// Runtime Manager to provision its `rtmanager` schema in PostgreSQL.
//
// The embedded filesystem is consumed by `pkg/postgres.RunMigrations`
// during rtmanager-service startup and by `cmd/jetgen` when regenerating
// the `internal/adapters/postgres/jet/` code against a transient
// PostgreSQL instance.
package migrations

import "embed"

//go:embed *.sql
var fs embed.FS

// FS returns the embedded filesystem containing every numbered goose
// migration shipped with Runtime Manager.
func FS() embed.FS {
	return fs
}
@@ -0,0 +1,235 @@
|
||||
// Package operationlogstore implements the PostgreSQL-backed adapter for
|
||||
// `ports.OperationLogStore`.
|
||||
//
|
||||
// The package owns the on-disk shape of the `operation_log` table defined
|
||||
// in
|
||||
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`
|
||||
// and translates the schema-agnostic `ports.OperationLogStore` interface
|
||||
// declared in `internal/ports/operationlogstore.go` into concrete
|
||||
// go-jet/v2 statements driven by the pgx driver.
|
||||
//
|
||||
// Append uses `INSERT ... RETURNING id` to surface the bigserial id back
|
||||
// to callers; ListByGame is index-driven by `operation_log_game_started_idx`.
|
||||
package operationlogstore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/postgres/internal/sqlx"
|
||||
pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
pg "github.com/go-jet/jet/v2/postgres"
|
||||
)
|
||||
|
||||
// Config configures one PostgreSQL-backed operation-log store instance.
|
||||
type Config struct {
|
||||
// DB stores the connection pool the store uses for every query.
|
||||
DB *sql.DB
|
||||
|
||||
// OperationTimeout bounds one round trip.
|
||||
OperationTimeout time.Duration
|
||||
}
|
||||
|
||||
// Store persists Runtime Manager operation-log entries in PostgreSQL.
|
||||
type Store struct {
|
||||
db *sql.DB
|
||||
operationTimeout time.Duration
|
||||
}
|
||||
|
||||
// New constructs one PostgreSQL-backed operation-log store from cfg.
|
||||
func New(cfg Config) (*Store, error) {
|
||||
if cfg.DB == nil {
|
||||
return nil, errors.New("new postgres operation log store: db must not be nil")
|
||||
}
|
||||
if cfg.OperationTimeout <= 0 {
|
||||
return nil, errors.New("new postgres operation log store: operation timeout must be positive")
|
||||
}
|
||||
return &Store{
|
||||
db: cfg.DB,
|
||||
operationTimeout: cfg.OperationTimeout,
|
||||
}, nil
|
||||
}
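
// A construction sketch matching the wiring used by this package's tests;
// the pool and timeout come from the caller (service runtime or test
// bootstrap), and the values shown are examples only:
//
//	store, err := operationlogstore.New(operationlogstore.Config{
//		DB:               pool,
//		OperationTimeout: 10 * time.Second,
//	})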
|
||||
|
||||
// operationLogSelectColumns is the canonical SELECT list for the
|
||||
// operation_log table, matching scanEntry's column order.
|
||||
var operationLogSelectColumns = pg.ColumnList{
|
||||
pgtable.OperationLog.ID,
|
||||
pgtable.OperationLog.GameID,
|
||||
pgtable.OperationLog.OpKind,
|
||||
pgtable.OperationLog.OpSource,
|
||||
pgtable.OperationLog.SourceRef,
|
||||
pgtable.OperationLog.ImageRef,
|
||||
pgtable.OperationLog.ContainerID,
|
||||
pgtable.OperationLog.Outcome,
|
||||
pgtable.OperationLog.ErrorCode,
|
||||
pgtable.OperationLog.ErrorMessage,
|
||||
pgtable.OperationLog.StartedAt,
|
||||
pgtable.OperationLog.FinishedAt,
|
||||
}
|
||||
|
||||
// Append inserts entry into the operation log and returns the generated
|
||||
// bigserial id. entry is validated through operation.OperationEntry.Validate
|
||||
// before the SQL is issued.
|
||||
func (store *Store) Append(ctx context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
if store == nil || store.db == nil {
|
||||
return 0, errors.New("append operation log entry: nil store")
|
||||
}
|
||||
if err := entry.Validate(); err != nil {
|
||||
return 0, fmt.Errorf("append operation log entry: %w", err)
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "append operation log entry", store.operationTimeout)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
stmt := pgtable.OperationLog.INSERT(
|
||||
pgtable.OperationLog.GameID,
|
||||
pgtable.OperationLog.OpKind,
|
||||
pgtable.OperationLog.OpSource,
|
||||
pgtable.OperationLog.SourceRef,
|
||||
pgtable.OperationLog.ImageRef,
|
||||
pgtable.OperationLog.ContainerID,
|
||||
pgtable.OperationLog.Outcome,
|
||||
pgtable.OperationLog.ErrorCode,
|
||||
pgtable.OperationLog.ErrorMessage,
|
||||
pgtable.OperationLog.StartedAt,
|
||||
pgtable.OperationLog.FinishedAt,
|
||||
).VALUES(
|
||||
entry.GameID,
|
||||
string(entry.OpKind),
|
||||
string(entry.OpSource),
|
||||
entry.SourceRef,
|
||||
entry.ImageRef,
|
||||
entry.ContainerID,
|
||||
string(entry.Outcome),
|
||||
entry.ErrorCode,
|
||||
entry.ErrorMessage,
|
||||
entry.StartedAt.UTC(),
|
||||
sqlx.NullableTimePtr(entry.FinishedAt),
|
||||
).RETURNING(pgtable.OperationLog.ID)
|
||||
|
||||
query, args := stmt.Sql()
|
||||
row := store.db.QueryRowContext(operationCtx, query, args...)
|
||||
var id int64
|
||||
if err := row.Scan(&id); err != nil {
|
||||
return 0, fmt.Errorf("append operation log entry: %w", err)
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// ListByGame returns the most recent entries for gameID, ordered by
|
||||
// started_at descending and capped by limit. The (game_id,
|
||||
// started_at DESC) index drives the read.
|
||||
func (store *Store) ListByGame(ctx context.Context, gameID string, limit int) ([]operation.OperationEntry, error) {
|
||||
if store == nil || store.db == nil {
|
||||
return nil, errors.New("list operation log entries by game: nil store")
|
||||
}
|
||||
if strings.TrimSpace(gameID) == "" {
|
||||
return nil, fmt.Errorf("list operation log entries by game: game id must not be empty")
|
||||
}
|
||||
if limit <= 0 {
|
||||
return nil, fmt.Errorf("list operation log entries by game: limit must be positive, got %d", limit)
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list operation log entries by game", store.operationTimeout)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
stmt := pg.SELECT(operationLogSelectColumns).
|
||||
FROM(pgtable.OperationLog).
|
||||
WHERE(pgtable.OperationLog.GameID.EQ(pg.String(gameID))).
|
||||
ORDER_BY(pgtable.OperationLog.StartedAt.DESC(), pgtable.OperationLog.ID.DESC()).
|
||||
LIMIT(int64(limit))
|
||||
|
||||
query, args := stmt.Sql()
|
||||
rows, err := store.db.QueryContext(operationCtx, query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list operation log entries by game: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
entries := make([]operation.OperationEntry, 0)
|
||||
for rows.Next() {
|
||||
entry, err := scanEntry(rows)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list operation log entries by game: scan: %w", err)
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, fmt.Errorf("list operation log entries by game: %w", err)
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
return entries, nil
|
||||
}
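
// Usage sketch, mirroring the adapter tests; the game id and limit are
// example values:
//
//	entries, err := store.ListByGame(ctx, "game-001", 10)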
|
||||
|
||||
// rowScanner abstracts *sql.Row and *sql.Rows so scanEntry can be shared
|
||||
// across both single-row reads and iterated reads.
|
||||
type rowScanner interface {
|
||||
Scan(dest ...any) error
|
||||
}
|
||||
|
||||
// scanEntry scans one operation_log row from rs.
|
||||
func scanEntry(rs rowScanner) (operation.OperationEntry, error) {
|
||||
var (
|
||||
id int64
|
||||
gameID string
|
||||
opKind string
|
||||
opSource string
|
||||
sourceRef string
|
||||
imageRef string
|
||||
containerID string
|
||||
outcome string
|
||||
errorCode string
|
||||
errorMessage string
|
||||
startedAt time.Time
|
||||
finishedAt sql.NullTime
|
||||
)
|
||||
if err := rs.Scan(
|
||||
&id,
|
||||
&gameID,
|
||||
&opKind,
|
||||
&opSource,
|
||||
&sourceRef,
|
||||
&imageRef,
|
||||
&containerID,
|
||||
&outcome,
|
||||
&errorCode,
|
||||
&errorMessage,
|
||||
&startedAt,
|
||||
&finishedAt,
|
||||
); err != nil {
|
||||
return operation.OperationEntry{}, err
|
||||
}
|
||||
return operation.OperationEntry{
|
||||
ID: id,
|
||||
GameID: gameID,
|
||||
OpKind: operation.OpKind(opKind),
|
||||
OpSource: operation.OpSource(opSource),
|
||||
SourceRef: sourceRef,
|
||||
ImageRef: imageRef,
|
||||
ContainerID: containerID,
|
||||
Outcome: operation.Outcome(outcome),
|
||||
ErrorCode: errorCode,
|
||||
ErrorMessage: errorMessage,
|
||||
StartedAt: startedAt.UTC(),
|
||||
FinishedAt: sqlx.TimePtrFromNullable(finishedAt),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Ensure Store satisfies the ports.OperationLogStore interface at compile
|
||||
// time.
|
||||
var _ ports.OperationLogStore = (*Store)(nil)
|
||||
@@ -0,0 +1,207 @@
|
||||
package operationlogstore_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/postgres/internal/pgtest"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) { pgtest.RunMain(m) }
|
||||
|
||||
func newStore(t *testing.T) *operationlogstore.Store {
|
||||
t.Helper()
|
||||
pgtest.TruncateAll(t)
|
||||
store, err := operationlogstore.New(operationlogstore.Config{
|
||||
DB: pgtest.Ensure(t).Pool(),
|
||||
OperationTimeout: pgtest.OperationTimeout,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return store
|
||||
}
|
||||
|
||||
func successStartEntry(gameID string, startedAt time.Time, sourceRef string) operation.OperationEntry {
|
||||
finishedAt := startedAt.Add(time.Second)
|
||||
return operation.OperationEntry{
|
||||
GameID: gameID,
|
||||
OpKind: operation.OpKindStart,
|
||||
OpSource: operation.OpSourceLobbyStream,
|
||||
SourceRef: sourceRef,
|
||||
ImageRef: "galaxy/game:v1.2.3",
|
||||
ContainerID: "container-1",
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: startedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppendReturnsPositiveIDs(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
startedAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
id1, err := store.Append(ctx, successStartEntry("game-001", startedAt, "1700000000000-0"))
|
||||
require.NoError(t, err)
|
||||
assert.Greater(t, id1, int64(0))
|
||||
|
||||
id2, err := store.Append(ctx, successStartEntry("game-001", startedAt.Add(time.Minute), "1700000000001-0"))
|
||||
require.NoError(t, err)
|
||||
assert.Greater(t, id2, id1)
|
||||
}
|
||||
|
||||
func TestAppendValidatesEntry(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*operation.OperationEntry)
|
||||
}{
|
||||
{"empty game id", func(e *operation.OperationEntry) { e.GameID = "" }},
|
||||
{"unknown op kind", func(e *operation.OperationEntry) { e.OpKind = "exotic" }},
|
||||
{"unknown op source", func(e *operation.OperationEntry) { e.OpSource = "exotic" }},
|
||||
{"unknown outcome", func(e *operation.OperationEntry) { e.Outcome = "exotic" }},
|
||||
{"zero started at", func(e *operation.OperationEntry) { e.StartedAt = time.Time{} }},
|
||||
{"failure without error code", func(e *operation.OperationEntry) {
|
||||
e.Outcome = operation.OutcomeFailure
|
||||
e.ErrorCode = ""
|
||||
}},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
entry := successStartEntry("game-001",
|
||||
time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC), "ref")
|
||||
tt.mutate(&entry)
|
||||
_, err := store.Append(ctx, entry)
|
||||
require.Error(t, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestListByGameReturnsEntriesNewestFirst(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
base := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
for index := range 3 {
|
||||
_, err := store.Append(ctx, successStartEntry("game-001",
|
||||
base.Add(time.Duration(index)*time.Minute),
|
||||
"ref-game-001-"))
|
||||
require.NoError(t, err)
|
||||
}
|
||||
// Foreign-game entry must not appear in the list.
|
||||
_, err := store.Append(ctx, successStartEntry("game-other", base, "ref-other"))
|
||||
require.NoError(t, err)
|
||||
|
||||
entries, err := store.ListByGame(ctx, "game-001", 10)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 3)
|
||||
for index := range 2 {
|
||||
assert.True(t,
|
||||
!entries[index].StartedAt.Before(entries[index+1].StartedAt),
|
||||
"entries must be ordered started_at DESC; got %s before %s",
|
||||
entries[index].StartedAt, entries[index+1].StartedAt,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListByGameRespectsLimit(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
base := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
for index := range 5 {
|
||||
_, err := store.Append(ctx, successStartEntry("game-001",
|
||||
base.Add(time.Duration(index)*time.Minute), "ref"))
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
entries, err := store.ListByGame(ctx, "game-001", 2)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 2)
|
||||
}
|
||||
|
||||
func TestListByGameReturnsEmptyForUnknownGame(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
entries, err := store.ListByGame(ctx, "game-missing", 10)
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, entries)
|
||||
}
|
||||
|
||||
func TestListByGameRejectsInvalidArgs(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
_, err := store.ListByGame(ctx, "", 10)
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = store.ListByGame(ctx, "game-001", 0)
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = store.ListByGame(ctx, "game-001", -3)
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestAppendRoundTripsAllFields(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
startedAt := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
finishedAt := startedAt.Add(2 * time.Second)
|
||||
original := operation.OperationEntry{
|
||||
GameID: "game-001",
|
||||
OpKind: operation.OpKindStop,
|
||||
OpSource: operation.OpSourceGMRest,
|
||||
SourceRef: "request-7",
|
||||
ImageRef: "galaxy/game:v2.0.0",
|
||||
ContainerID: "container-X",
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: "container_start_failed",
|
||||
ErrorMessage: "stop deadline exceeded",
|
||||
StartedAt: startedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
}
|
||||
id, err := store.Append(ctx, original)
|
||||
require.NoError(t, err)
|
||||
|
||||
entries, err := store.ListByGame(ctx, "game-001", 10)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, entries, 1)
|
||||
|
||||
got := entries[0]
|
||||
assert.Equal(t, id, got.ID)
|
||||
assert.Equal(t, original.GameID, got.GameID)
|
||||
assert.Equal(t, original.OpKind, got.OpKind)
|
||||
assert.Equal(t, original.OpSource, got.OpSource)
|
||||
assert.Equal(t, original.SourceRef, got.SourceRef)
|
||||
assert.Equal(t, original.ImageRef, got.ImageRef)
|
||||
assert.Equal(t, original.ContainerID, got.ContainerID)
|
||||
assert.Equal(t, original.Outcome, got.Outcome)
|
||||
assert.Equal(t, original.ErrorCode, got.ErrorCode)
|
||||
assert.Equal(t, original.ErrorMessage, got.ErrorMessage)
|
||||
assert.True(t, original.StartedAt.Equal(got.StartedAt))
|
||||
require.NotNil(t, got.FinishedAt)
|
||||
assert.True(t, original.FinishedAt.Equal(*got.FinishedAt))
|
||||
assert.Equal(t, time.UTC, got.StartedAt.Location())
|
||||
assert.Equal(t, time.UTC, got.FinishedAt.Location())
|
||||
}
|
||||
|
||||
func TestNewRejectsNilDB(t *testing.T) {
|
||||
_, err := operationlogstore.New(operationlogstore.Config{OperationTimeout: time.Second})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestNewRejectsNonPositiveTimeout(t *testing.T) {
|
||||
_, err := operationlogstore.New(operationlogstore.Config{
|
||||
DB: pgtest.Ensure(t).Pool(),
|
||||
})
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,500 @@
|
||||
// Package runtimerecordstore implements the PostgreSQL-backed adapter for
|
||||
// `ports.RuntimeRecordStore`.
|
||||
//
|
||||
// The package owns the on-disk shape of the `runtime_records` table
|
||||
// defined in
|
||||
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`
|
||||
// and translates the schema-agnostic `ports.RuntimeRecordStore` interface
|
||||
// declared in `internal/ports/runtimerecordstore.go` into concrete
|
||||
// go-jet/v2 statements driven by the pgx driver.
|
||||
//
|
||||
// Lifecycle transitions (UpdateStatus) use compare-and-swap on
|
||||
// `(status, current_container_id)` rather than holding a SELECT ... FOR
|
||||
// UPDATE lock across the caller's logic, mirroring the pattern used by
|
||||
// `lobby/internal/adapters/postgres/gamestore`.
|
||||
package runtimerecordstore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/postgres/internal/sqlx"
|
||||
pgtable "galaxy/rtmanager/internal/adapters/postgres/jet/rtmanager/table"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
pg "github.com/go-jet/jet/v2/postgres"
|
||||
)
|
||||
|
||||
// Config configures one PostgreSQL-backed runtime-record store instance.
|
||||
// The store does not own the underlying *sql.DB lifecycle: the caller
|
||||
// (typically the service runtime) opens, instruments, migrates, and
|
||||
// closes the pool.
|
||||
type Config struct {
|
||||
// DB stores the connection pool the store uses for every query.
|
||||
DB *sql.DB
|
||||
|
||||
// OperationTimeout bounds one round trip. The store creates a
|
||||
// derived context for each operation so callers cannot starve the
|
||||
// pool with an unbounded ctx.
|
||||
OperationTimeout time.Duration
|
||||
}
|
||||
|
||||
// Store persists Runtime Manager runtime records in PostgreSQL.
|
||||
type Store struct {
|
||||
db *sql.DB
|
||||
operationTimeout time.Duration
|
||||
}
|
||||
|
||||
// New constructs one PostgreSQL-backed runtime-record store from cfg.
|
||||
func New(cfg Config) (*Store, error) {
|
||||
if cfg.DB == nil {
|
||||
return nil, errors.New("new postgres runtime record store: db must not be nil")
|
||||
}
|
||||
if cfg.OperationTimeout <= 0 {
|
||||
return nil, errors.New("new postgres runtime record store: operation timeout must be positive")
|
||||
}
|
||||
return &Store{
|
||||
db: cfg.DB,
|
||||
operationTimeout: cfg.OperationTimeout,
|
||||
}, nil
|
||||
}
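
// A construction sketch under the same wiring convention as the sibling
// stores; the pool is opened, migrated, and closed by the caller, and the
// timeout shown is an example value:
//
//	store, err := runtimerecordstore.New(runtimerecordstore.Config{
//		DB:               pool,
//		OperationTimeout: 10 * time.Second,
//	})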
|
||||
|
||||
// runtimeSelectColumns is the canonical SELECT list for the runtime_records
|
||||
// table, matching scanRecord's column order.
|
||||
var runtimeSelectColumns = pg.ColumnList{
|
||||
pgtable.RuntimeRecords.GameID,
|
||||
pgtable.RuntimeRecords.Status,
|
||||
pgtable.RuntimeRecords.CurrentContainerID,
|
||||
pgtable.RuntimeRecords.CurrentImageRef,
|
||||
pgtable.RuntimeRecords.EngineEndpoint,
|
||||
pgtable.RuntimeRecords.StatePath,
|
||||
pgtable.RuntimeRecords.DockerNetwork,
|
||||
pgtable.RuntimeRecords.StartedAt,
|
||||
pgtable.RuntimeRecords.StoppedAt,
|
||||
pgtable.RuntimeRecords.RemovedAt,
|
||||
pgtable.RuntimeRecords.LastOpAt,
|
||||
pgtable.RuntimeRecords.CreatedAt,
|
||||
}
|
||||
|
||||
// Get returns the record identified by gameID. It returns
|
||||
// runtime.ErrNotFound when no record exists.
|
||||
func (store *Store) Get(ctx context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
if store == nil || store.db == nil {
|
||||
return runtime.RuntimeRecord{}, errors.New("get runtime record: nil store")
|
||||
}
|
||||
if strings.TrimSpace(gameID) == "" {
|
||||
return runtime.RuntimeRecord{}, fmt.Errorf("get runtime record: game id must not be empty")
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "get runtime record", store.operationTimeout)
|
||||
if err != nil {
|
||||
return runtime.RuntimeRecord{}, err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
stmt := pg.SELECT(runtimeSelectColumns).
|
||||
FROM(pgtable.RuntimeRecords).
|
||||
WHERE(pgtable.RuntimeRecords.GameID.EQ(pg.String(gameID)))
|
||||
|
||||
query, args := stmt.Sql()
|
||||
row := store.db.QueryRowContext(operationCtx, query, args...)
|
||||
record, err := scanRecord(row)
|
||||
if sqlx.IsNoRows(err) {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
if err != nil {
|
||||
return runtime.RuntimeRecord{}, fmt.Errorf("get runtime record: %w", err)
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
// Upsert inserts record when no row exists for record.GameID and
|
||||
// otherwise overwrites every mutable column verbatim. created_at is
|
||||
// preserved across upserts so the "first time RTM saw the game"
|
||||
// timestamp stays stable.
|
||||
func (store *Store) Upsert(ctx context.Context, record runtime.RuntimeRecord) error {
|
||||
if store == nil || store.db == nil {
|
||||
return errors.New("upsert runtime record: nil store")
|
||||
}
|
||||
if err := record.Validate(); err != nil {
|
||||
return fmt.Errorf("upsert runtime record: %w", err)
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "upsert runtime record", store.operationTimeout)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
stmt := pgtable.RuntimeRecords.INSERT(
|
||||
pgtable.RuntimeRecords.GameID,
|
||||
pgtable.RuntimeRecords.Status,
|
||||
pgtable.RuntimeRecords.CurrentContainerID,
|
||||
pgtable.RuntimeRecords.CurrentImageRef,
|
||||
pgtable.RuntimeRecords.EngineEndpoint,
|
||||
pgtable.RuntimeRecords.StatePath,
|
||||
pgtable.RuntimeRecords.DockerNetwork,
|
||||
pgtable.RuntimeRecords.StartedAt,
|
||||
pgtable.RuntimeRecords.StoppedAt,
|
||||
pgtable.RuntimeRecords.RemovedAt,
|
||||
pgtable.RuntimeRecords.LastOpAt,
|
||||
pgtable.RuntimeRecords.CreatedAt,
|
||||
).VALUES(
|
||||
record.GameID,
|
||||
string(record.Status),
|
||||
sqlx.NullableString(record.CurrentContainerID),
|
||||
sqlx.NullableString(record.CurrentImageRef),
|
||||
record.EngineEndpoint,
|
||||
record.StatePath,
|
||||
record.DockerNetwork,
|
||||
sqlx.NullableTimePtr(record.StartedAt),
|
||||
sqlx.NullableTimePtr(record.StoppedAt),
|
||||
sqlx.NullableTimePtr(record.RemovedAt),
|
||||
record.LastOpAt.UTC(),
|
||||
record.CreatedAt.UTC(),
|
||||
).ON_CONFLICT(pgtable.RuntimeRecords.GameID).DO_UPDATE(
|
||||
pg.SET(
|
||||
pgtable.RuntimeRecords.Status.SET(pgtable.RuntimeRecords.EXCLUDED.Status),
|
||||
pgtable.RuntimeRecords.CurrentContainerID.SET(pgtable.RuntimeRecords.EXCLUDED.CurrentContainerID),
|
||||
pgtable.RuntimeRecords.CurrentImageRef.SET(pgtable.RuntimeRecords.EXCLUDED.CurrentImageRef),
|
||||
pgtable.RuntimeRecords.EngineEndpoint.SET(pgtable.RuntimeRecords.EXCLUDED.EngineEndpoint),
|
||||
pgtable.RuntimeRecords.StatePath.SET(pgtable.RuntimeRecords.EXCLUDED.StatePath),
|
||||
pgtable.RuntimeRecords.DockerNetwork.SET(pgtable.RuntimeRecords.EXCLUDED.DockerNetwork),
|
||||
pgtable.RuntimeRecords.StartedAt.SET(pgtable.RuntimeRecords.EXCLUDED.StartedAt),
|
||||
pgtable.RuntimeRecords.StoppedAt.SET(pgtable.RuntimeRecords.EXCLUDED.StoppedAt),
|
||||
pgtable.RuntimeRecords.RemovedAt.SET(pgtable.RuntimeRecords.EXCLUDED.RemovedAt),
|
||||
pgtable.RuntimeRecords.LastOpAt.SET(pgtable.RuntimeRecords.EXCLUDED.LastOpAt),
|
||||
),
|
||||
)
|
||||
|
||||
query, args := stmt.Sql()
|
||||
if _, err := store.db.ExecContext(operationCtx, query, args...); err != nil {
|
||||
return fmt.Errorf("upsert runtime record: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
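// For orientation, the upsert above compiles to SQL roughly of this
// shape (argument placeholders elided); created_at is deliberately
// absent from the DO UPDATE list, which is what keeps the original
// value intact on conflict:
//
//	INSERT INTO runtime_records (game_id, status, ..., created_at)
//	VALUES (...)
//	ON CONFLICT (game_id) DO UPDATE SET
//	    status = EXCLUDED.status,
//	    ...,
//	    last_op_at = EXCLUDED.last_op_at;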
|
||||
|
||||
// UpdateStatus applies one status transition with a compare-and-swap
|
||||
// guard on (status, current_container_id). Validate is invoked before
|
||||
// any SQL is executed.
|
||||
func (store *Store) UpdateStatus(ctx context.Context, input ports.UpdateStatusInput) error {
|
||||
if store == nil || store.db == nil {
|
||||
return errors.New("update runtime status: nil store")
|
||||
}
|
||||
if err := input.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "update runtime status", store.operationTimeout)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
now := input.Now.UTC()
|
||||
stmt, err := buildUpdateStatusStatement(input, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
query, args := stmt.Sql()
|
||||
result, err := store.db.ExecContext(operationCtx, query, args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update runtime status: %w", err)
|
||||
}
|
||||
affected, err := result.RowsAffected()
|
||||
if err != nil {
|
||||
return fmt.Errorf("update runtime status: rows affected: %w", err)
|
||||
}
|
||||
if affected == 0 {
|
||||
return store.classifyMissingUpdate(operationCtx, input.GameID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// classifyMissingUpdate distinguishes ErrNotFound from ErrConflict after
|
||||
// an UPDATE that affected zero rows. A row that is absent yields
|
||||
// ErrNotFound; a row whose status or container_id does not match the
|
||||
// CAS predicate yields ErrConflict.
|
||||
func (store *Store) classifyMissingUpdate(ctx context.Context, gameID string) error {
|
||||
probe := pg.SELECT(pgtable.RuntimeRecords.Status).
|
||||
FROM(pgtable.RuntimeRecords).
|
||||
WHERE(pgtable.RuntimeRecords.GameID.EQ(pg.String(gameID)))
|
||||
probeQuery, probeArgs := probe.Sql()
|
||||
|
||||
var current string
|
||||
row := store.db.QueryRowContext(ctx, probeQuery, probeArgs...)
|
||||
if err := row.Scan(¤t); err != nil {
|
||||
if sqlx.IsNoRows(err) {
|
||||
return runtime.ErrNotFound
|
||||
}
|
||||
return fmt.Errorf("update runtime status: probe: %w", err)
|
||||
}
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
|
||||
// buildUpdateStatusStatement assembles the UPDATE statement applied for
|
||||
// one runtime-status transition.
|
||||
//
|
||||
// status and last_op_at are always updated. The remaining columns are
|
||||
// driven by the destination:
|
||||
//
|
||||
// - StatusStopped: stopped_at is captured at Now.
|
||||
// - StatusRemoved: removed_at is captured at Now and current_container_id
|
||||
// is NULLed (the container is gone; the prior id remains observable
|
||||
// through operation_log).
|
||||
// - StatusRunning: only status + last_op_at change. Fresh started_at
|
||||
// and current_container_id are installed via Upsert before any
|
||||
// stopped → running transition reaches this path; the path exists
|
||||
// so runtime.AllowedTransitions stays one-to-one with the adapter
|
||||
// capability matrix even though v1 services use Upsert for this
|
||||
// case.
|
||||
func buildUpdateStatusStatement(input ports.UpdateStatusInput, now time.Time) (pg.UpdateStatement, error) {
|
||||
statusValue := pg.String(string(input.To))
|
||||
nowValue := pg.TimestampzT(now)
|
||||
|
||||
var stmt pg.UpdateStatement
|
||||
switch input.To {
|
||||
case runtime.StatusStopped:
|
||||
stmt = pgtable.RuntimeRecords.UPDATE(
|
||||
pgtable.RuntimeRecords.Status,
|
||||
pgtable.RuntimeRecords.LastOpAt,
|
||||
pgtable.RuntimeRecords.StoppedAt,
|
||||
).SET(
|
||||
statusValue,
|
||||
nowValue,
|
||||
nowValue,
|
||||
)
|
||||
case runtime.StatusRemoved:
|
||||
stmt = pgtable.RuntimeRecords.UPDATE(
|
||||
pgtable.RuntimeRecords.Status,
|
||||
pgtable.RuntimeRecords.LastOpAt,
|
||||
pgtable.RuntimeRecords.RemovedAt,
|
||||
pgtable.RuntimeRecords.CurrentContainerID,
|
||||
).SET(
|
||||
statusValue,
|
||||
nowValue,
|
||||
nowValue,
|
||||
pg.NULL,
|
||||
)
|
||||
case runtime.StatusRunning:
|
||||
stmt = pgtable.RuntimeRecords.UPDATE(
|
||||
pgtable.RuntimeRecords.Status,
|
||||
pgtable.RuntimeRecords.LastOpAt,
|
||||
).SET(
|
||||
statusValue,
|
||||
nowValue,
|
||||
)
|
||||
default:
|
||||
return nil, fmt.Errorf("update runtime status: destination status %q is unsupported", input.To)
|
||||
}
|
||||
|
||||
whereExpr := pg.AND(
|
||||
pgtable.RuntimeRecords.GameID.EQ(pg.String(input.GameID)),
|
||||
pgtable.RuntimeRecords.Status.EQ(pg.String(string(input.ExpectedFrom))),
|
||||
)
|
||||
if input.ExpectedContainerID != "" {
|
||||
whereExpr = pg.AND(
|
||||
whereExpr,
|
||||
pgtable.RuntimeRecords.CurrentContainerID.EQ(pg.String(input.ExpectedContainerID)),
|
||||
)
|
||||
}
|
||||
return stmt.WHERE(whereExpr), nil
|
||||
}
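// As an illustration, the StatusStopped branch above produces an UPDATE
// roughly equivalent to (argument placeholders elided):
//
//	UPDATE runtime_records
//	SET status = 'stopped', last_op_at = $now, stopped_at = $now
//	WHERE game_id = $id
//	  AND status = 'running'
//	  AND current_container_id = $cid  -- only when an expected id is supplied
//
// Zero affected rows then falls through to classifyMissingUpdate, which
// turns the miss into runtime.ErrNotFound or runtime.ErrConflict.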
|
||||
|
||||
// ListByStatus returns every record currently indexed under status.
|
||||
// Ordering is last_op_at DESC, game_id ASC — the direction the
|
||||
// `runtime_records_status_last_op_idx` index is built in.
|
||||
func (store *Store) ListByStatus(ctx context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
if store == nil || store.db == nil {
|
||||
return nil, errors.New("list runtime records by status: nil store")
|
||||
}
|
||||
if !status.IsKnown() {
|
||||
return nil, fmt.Errorf("list runtime records by status: status %q is unsupported", status)
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list runtime records by status", store.operationTimeout)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
stmt := pg.SELECT(runtimeSelectColumns).
|
||||
FROM(pgtable.RuntimeRecords).
|
||||
WHERE(pgtable.RuntimeRecords.Status.EQ(pg.String(string(status)))).
|
||||
ORDER_BY(pgtable.RuntimeRecords.LastOpAt.DESC(), pgtable.RuntimeRecords.GameID.ASC())
|
||||
|
||||
query, args := stmt.Sql()
|
||||
rows, err := store.db.QueryContext(operationCtx, query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list runtime records by status: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
records := make([]runtime.RuntimeRecord, 0)
|
||||
for rows.Next() {
|
||||
record, err := scanRecord(rows)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list runtime records by status: scan: %w", err)
|
||||
}
|
||||
records = append(records, record)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, fmt.Errorf("list runtime records by status: %w", err)
|
||||
}
|
||||
if len(records) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
return records, nil
|
||||
}
|
||||
|
||||
// List returns every runtime record currently stored. Ordering matches
|
||||
// ListByStatus — last_op_at DESC, game_id ASC — so the REST list
|
||||
// endpoint sees the freshest activity first.
|
||||
func (store *Store) List(ctx context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
if store == nil || store.db == nil {
|
||||
return nil, errors.New("list runtime records: nil store")
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "list runtime records", store.operationTimeout)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
stmt := pg.SELECT(runtimeSelectColumns).
|
||||
FROM(pgtable.RuntimeRecords).
|
||||
ORDER_BY(pgtable.RuntimeRecords.LastOpAt.DESC(), pgtable.RuntimeRecords.GameID.ASC())
|
||||
|
||||
query, args := stmt.Sql()
|
||||
rows, err := store.db.QueryContext(operationCtx, query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list runtime records: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
records := make([]runtime.RuntimeRecord, 0)
|
||||
for rows.Next() {
|
||||
record, err := scanRecord(rows)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list runtime records: scan: %w", err)
|
||||
}
|
||||
records = append(records, record)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, fmt.Errorf("list runtime records: %w", err)
|
||||
}
|
||||
if len(records) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
return records, nil
|
||||
}
|
||||
|
||||
// CountByStatus returns the number of records indexed under each status.
|
||||
// Statuses with zero records are present in the result with a zero
|
||||
// count so callers (e.g. the telemetry gauge) can publish a stable
|
||||
// label set on every reading.
|
||||
func (store *Store) CountByStatus(ctx context.Context) (map[runtime.Status]int, error) {
|
||||
if store == nil || store.db == nil {
|
||||
return nil, errors.New("count runtime records by status: nil store")
|
||||
}
|
||||
|
||||
operationCtx, cancel, err := sqlx.WithTimeout(ctx, "count runtime records by status", store.operationTimeout)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
countAlias := pg.COUNT(pg.STAR).AS("count")
|
||||
stmt := pg.SELECT(pgtable.RuntimeRecords.Status, countAlias).
|
||||
FROM(pgtable.RuntimeRecords).
|
||||
GROUP_BY(pgtable.RuntimeRecords.Status)
|
||||
|
||||
query, args := stmt.Sql()
|
||||
rows, err := store.db.QueryContext(operationCtx, query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("count runtime records by status: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
counts := make(map[runtime.Status]int, len(runtime.AllStatuses()))
|
||||
for _, status := range runtime.AllStatuses() {
|
||||
counts[status] = 0
|
||||
}
|
||||
for rows.Next() {
|
||||
var status string
|
||||
var count int
|
||||
if err := rows.Scan(&status, &count); err != nil {
|
||||
return nil, fmt.Errorf("count runtime records by status: scan: %w", err)
|
||||
}
|
||||
counts[runtime.Status(status)] = count
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, fmt.Errorf("count runtime records by status: %w", err)
|
||||
}
|
||||
return counts, nil
|
||||
}
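// Example: on an empty table CountByStatus still returns every bucket,
// assuming AllStatuses enumerates the running, stopped, and removed
// states used elsewhere in this file:
//
//	map[runtime.Status]int{
//		runtime.StatusRunning: 0,
//		runtime.StatusStopped: 0,
//		runtime.StatusRemoved: 0,
//	}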
|
||||
|
||||
// rowScanner abstracts *sql.Row and *sql.Rows so scanRecord can be shared
|
||||
// across both single-row reads and iterated reads.
|
||||
type rowScanner interface {
|
||||
Scan(dest ...any) error
|
||||
}
|
||||
|
||||
// scanRecord scans one runtime_records row from rs. Returns sql.ErrNoRows
|
||||
// verbatim so callers can distinguish "no row" from a hard error.
|
||||
func scanRecord(rs rowScanner) (runtime.RuntimeRecord, error) {
|
||||
var (
|
||||
gameID string
|
||||
status string
|
||||
currentContainerID sql.NullString
|
||||
currentImageRef sql.NullString
|
||||
engineEndpoint string
|
||||
statePath string
|
||||
dockerNetwork string
|
||||
startedAt sql.NullTime
|
||||
stoppedAt sql.NullTime
|
||||
removedAt sql.NullTime
|
||||
lastOpAt time.Time
|
||||
createdAt time.Time
|
||||
)
|
||||
if err := rs.Scan(
|
||||
&gameID,
|
||||
&status,
|
||||
¤tContainerID,
|
||||
¤tImageRef,
|
||||
&engineEndpoint,
|
||||
&statePath,
|
||||
&dockerNetwork,
|
||||
&startedAt,
|
||||
&stoppedAt,
|
||||
&removedAt,
|
||||
&lastOpAt,
|
||||
&createdAt,
|
||||
); err != nil {
|
||||
return runtime.RuntimeRecord{}, err
|
||||
}
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: gameID,
|
||||
Status: runtime.Status(status),
|
||||
CurrentContainerID: sqlx.StringFromNullable(currentContainerID),
|
||||
CurrentImageRef: sqlx.StringFromNullable(currentImageRef),
|
||||
EngineEndpoint: engineEndpoint,
|
||||
StatePath: statePath,
|
||||
DockerNetwork: dockerNetwork,
|
||||
StartedAt: sqlx.TimePtrFromNullable(startedAt),
|
||||
StoppedAt: sqlx.TimePtrFromNullable(stoppedAt),
|
||||
RemovedAt: sqlx.TimePtrFromNullable(removedAt),
|
||||
LastOpAt: lastOpAt.UTC(),
|
||||
CreatedAt: createdAt.UTC(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Ensure Store satisfies the ports.RuntimeRecordStore interface at
|
||||
// compile time.
|
||||
var _ ports.RuntimeRecordStore = (*Store)(nil)
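// The sketch below shows how a service-layer caller is expected to drive
// the CAS transition; stopGame and its collaborators are hypothetical
// names for illustration, not part of this commit:
//
//	func stopGame(ctx context.Context, store ports.RuntimeRecordStore, gameID string, now time.Time) error {
//		record, err := store.Get(ctx, gameID)
//		if err != nil {
//			return err // runtime.ErrNotFound surfaces unchanged
//		}
//		err = store.UpdateStatus(ctx, ports.UpdateStatusInput{
//			GameID:              gameID,
//			ExpectedFrom:        record.Status,
//			ExpectedContainerID: record.CurrentContainerID,
//			To:                  runtime.StatusStopped,
//			Now:                 now,
//		})
//		if errors.Is(err, runtime.ErrConflict) {
//			// another caller moved the record first; re-read and decide
//		}
//		return err
//	}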
|
||||
@@ -0,0 +1,420 @@
|
||||
package runtimerecordstore_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/postgres/internal/pgtest"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) { pgtest.RunMain(m) }
|
||||
|
||||
func newStore(t *testing.T) *runtimerecordstore.Store {
|
||||
t.Helper()
|
||||
pgtest.TruncateAll(t)
|
||||
store, err := runtimerecordstore.New(runtimerecordstore.Config{
|
||||
DB: pgtest.Ensure(t).Pool(),
|
||||
OperationTimeout: pgtest.OperationTimeout,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return store
|
||||
}
|
||||
|
||||
func runningRecord(t *testing.T, gameID, containerID, imageRef string) runtime.RuntimeRecord {
|
||||
t.Helper()
|
||||
now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
started := now
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: gameID,
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: containerID,
|
||||
CurrentImageRef: imageRef,
|
||||
EngineEndpoint: "http://galaxy-game-" + gameID + ":8080",
|
||||
StatePath: "/var/lib/galaxy/games/" + gameID,
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &started,
|
||||
LastOpAt: now,
|
||||
CreatedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpsertAndGetRoundTrip(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, record))
|
||||
|
||||
got, err := store.Get(ctx, record.GameID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, record.GameID, got.GameID)
|
||||
assert.Equal(t, record.Status, got.Status)
|
||||
assert.Equal(t, record.CurrentContainerID, got.CurrentContainerID)
|
||||
assert.Equal(t, record.CurrentImageRef, got.CurrentImageRef)
|
||||
assert.Equal(t, record.EngineEndpoint, got.EngineEndpoint)
|
||||
assert.Equal(t, record.StatePath, got.StatePath)
|
||||
assert.Equal(t, record.DockerNetwork, got.DockerNetwork)
|
||||
require.NotNil(t, got.StartedAt)
|
||||
assert.True(t, record.StartedAt.Equal(*got.StartedAt))
|
||||
assert.Equal(t, time.UTC, got.StartedAt.Location())
|
||||
assert.Equal(t, time.UTC, got.LastOpAt.Location())
|
||||
assert.Equal(t, time.UTC, got.CreatedAt.Location())
|
||||
assert.Nil(t, got.StoppedAt)
|
||||
assert.Nil(t, got.RemovedAt)
|
||||
}
|
||||
|
||||
func TestGetReturnsNotFound(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
_, err := store.Get(ctx, "game-missing")
|
||||
require.ErrorIs(t, err, runtime.ErrNotFound)
|
||||
}
|
||||
|
||||
func TestUpsertOverwritesMutableColumnsPreservesCreatedAt(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
original := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, original))
|
||||
|
||||
updated := original
|
||||
updated.CurrentContainerID = "container-2"
|
||||
updated.CurrentImageRef = "galaxy/game:v1.2.4"
|
||||
newStarted := original.LastOpAt.Add(time.Minute)
|
||||
updated.StartedAt = &newStarted
|
||||
updated.LastOpAt = newStarted
|
||||
// Fresh CreatedAt simulates a caller passing "now"; the store must
|
||||
// preserve the original CreatedAt value on conflict.
|
||||
updated.CreatedAt = newStarted
|
||||
|
||||
require.NoError(t, store.Upsert(ctx, updated))
|
||||
|
||||
got, err := store.Get(ctx, original.GameID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "container-2", got.CurrentContainerID)
|
||||
assert.Equal(t, "galaxy/game:v1.2.4", got.CurrentImageRef)
|
||||
assert.True(t, got.LastOpAt.Equal(newStarted))
|
||||
assert.True(t, got.CreatedAt.Equal(original.CreatedAt),
|
||||
"created_at must be preserved across upserts: got %s, want %s",
|
||||
got.CreatedAt, original.CreatedAt)
|
||||
}
|
||||
|
||||
func TestUpdateStatusRunningToStopped(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, record))
|
||||
|
||||
now := record.LastOpAt.Add(2 * time.Minute)
|
||||
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: record.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
ExpectedContainerID: record.CurrentContainerID,
|
||||
To: runtime.StatusStopped,
|
||||
Now: now,
|
||||
}))
|
||||
|
||||
got, err := store.Get(ctx, record.GameID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, runtime.StatusStopped, got.Status)
|
||||
require.NotNil(t, got.StoppedAt)
|
||||
assert.True(t, now.Equal(*got.StoppedAt))
|
||||
assert.True(t, now.Equal(got.LastOpAt))
|
||||
// container id is preserved on stop; cleanup later NULLs it.
|
||||
assert.Equal(t, record.CurrentContainerID, got.CurrentContainerID)
|
||||
}
|
||||
|
||||
func TestUpdateStatusRunningToRemovedClearsContainerID(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, record))
|
||||
|
||||
now := record.LastOpAt.Add(time.Minute)
|
||||
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: record.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusRemoved,
|
||||
Now: now,
|
||||
}))
|
||||
|
||||
got, err := store.Get(ctx, record.GameID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, runtime.StatusRemoved, got.Status)
|
||||
require.NotNil(t, got.RemovedAt)
|
||||
assert.True(t, now.Equal(*got.RemovedAt))
|
||||
assert.True(t, now.Equal(got.LastOpAt))
|
||||
assert.Empty(t, got.CurrentContainerID, "current_container_id must be NULL after removal")
|
||||
}
|
||||
|
||||
func TestUpdateStatusStoppedToRemoved(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, record))
|
||||
|
||||
stopAt := record.LastOpAt.Add(time.Minute)
|
||||
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: record.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusStopped,
|
||||
Now: stopAt,
|
||||
}))
|
||||
|
||||
removeAt := stopAt.Add(time.Hour)
|
||||
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: record.GameID,
|
||||
ExpectedFrom: runtime.StatusStopped,
|
||||
To: runtime.StatusRemoved,
|
||||
Now: removeAt,
|
||||
}))
|
||||
|
||||
got, err := store.Get(ctx, record.GameID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, runtime.StatusRemoved, got.Status)
|
||||
require.NotNil(t, got.RemovedAt)
|
||||
assert.True(t, removeAt.Equal(*got.RemovedAt))
|
||||
assert.True(t, removeAt.Equal(got.LastOpAt))
|
||||
require.NotNil(t, got.StoppedAt, "stopped_at must remain populated through removal")
|
||||
assert.True(t, stopAt.Equal(*got.StoppedAt))
|
||||
assert.Empty(t, got.CurrentContainerID)
|
||||
}
|
||||
|
||||
func TestUpdateStatusReturnsConflictOnFromMismatch(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, record))
|
||||
|
||||
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: record.GameID,
|
||||
ExpectedFrom: runtime.StatusStopped, // wrong
|
||||
To: runtime.StatusRemoved,
|
||||
Now: record.LastOpAt.Add(time.Minute),
|
||||
})
|
||||
require.ErrorIs(t, err, runtime.ErrConflict)
|
||||
}
|
||||
|
||||
func TestUpdateStatusReturnsConflictOnContainerIDMismatch(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, record))
|
||||
|
||||
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: record.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
ExpectedContainerID: "container-other",
|
||||
To: runtime.StatusStopped,
|
||||
Now: record.LastOpAt.Add(time.Minute),
|
||||
})
|
||||
require.ErrorIs(t, err, runtime.ErrConflict)
|
||||
}
|
||||
|
||||
func TestUpdateStatusReturnsNotFoundForMissing(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: "game-missing",
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusStopped,
|
||||
Now: time.Now().UTC(),
|
||||
})
|
||||
require.ErrorIs(t, err, runtime.ErrNotFound)
|
||||
}
|
||||
|
||||
func TestUpdateStatusValidatesInputBeforeStore(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
err := store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: "game-001",
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusStopped,
|
||||
// Now intentionally zero — validation must reject.
|
||||
})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
// TestUpdateStatusConcurrentCAS asserts the CAS guard: when two callers
|
||||
// race to apply the running → stopped transition on the same row,
|
||||
// exactly one wins (returns nil) and the other observes
|
||||
// runtime.ErrConflict.
|
||||
func TestUpdateStatusConcurrentCAS(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
record := runningRecord(t, "game-001", "container-1", "galaxy/game:v1.2.3")
|
||||
require.NoError(t, store.Upsert(ctx, record))
|
||||
|
||||
const concurrency = 8
|
||||
results := make([]error, concurrency)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(concurrency)
|
||||
for index := range concurrency {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results[index] = store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: record.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
ExpectedContainerID: record.CurrentContainerID,
|
||||
To: runtime.StatusStopped,
|
||||
Now: record.LastOpAt.Add(time.Duration(index+1) * time.Second),
|
||||
})
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
wins, conflicts := 0, 0
|
||||
for _, err := range results {
|
||||
switch {
|
||||
case err == nil:
|
||||
wins++
|
||||
case errors.Is(err, runtime.ErrConflict):
|
||||
conflicts++
|
||||
default:
|
||||
t.Errorf("unexpected error from concurrent UpdateStatus: %v", err)
|
||||
}
|
||||
}
|
||||
assert.Equal(t, 1, wins, "exactly one caller must win the CAS race")
|
||||
assert.Equal(t, concurrency-1, conflicts, "the rest must observe runtime.ErrConflict")
|
||||
}
|
||||
|
||||
func TestListByStatusReturnsExpectedRecords(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
a := runningRecord(t, "game-aaa", "container-a", "galaxy/game:v1.2.3")
|
||||
b := runningRecord(t, "game-bbb", "container-b", "galaxy/game:v1.2.3")
|
||||
c := runningRecord(t, "game-ccc", "container-c", "galaxy/game:v1.2.3")
|
||||
for _, r := range []runtime.RuntimeRecord{a, b, c} {
|
||||
require.NoError(t, store.Upsert(ctx, r))
|
||||
}
|
||||
|
||||
stopAt := a.LastOpAt.Add(time.Minute)
|
||||
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: b.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusStopped,
|
||||
Now: stopAt,
|
||||
}))
|
||||
|
||||
running, err := store.ListByStatus(ctx, runtime.StatusRunning)
|
||||
require.NoError(t, err)
|
||||
gotIDs := map[string]struct{}{}
|
||||
for _, r := range running {
|
||||
gotIDs[r.GameID] = struct{}{}
|
||||
}
|
||||
assert.Contains(t, gotIDs, a.GameID)
|
||||
assert.Contains(t, gotIDs, c.GameID)
|
||||
assert.NotContains(t, gotIDs, b.GameID)
|
||||
|
||||
stopped, err := store.ListByStatus(ctx, runtime.StatusStopped)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, stopped, 1)
|
||||
assert.Equal(t, b.GameID, stopped[0].GameID)
|
||||
}
|
||||
|
||||
func TestListByStatusRejectsUnknown(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
_, err := store.ListByStatus(ctx, runtime.Status("exotic"))
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestListReturnsEveryStatus(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
a := runningRecord(t, "game-aaa", "container-a", "galaxy/game:v1.2.3")
|
||||
b := runningRecord(t, "game-bbb", "container-b", "galaxy/game:v1.2.3")
|
||||
c := runningRecord(t, "game-ccc", "container-c", "galaxy/game:v1.2.3")
|
||||
for _, r := range []runtime.RuntimeRecord{a, b, c} {
|
||||
require.NoError(t, store.Upsert(ctx, r))
|
||||
}
|
||||
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: b.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusStopped,
|
||||
Now: b.LastOpAt.Add(time.Minute),
|
||||
}))
|
||||
|
||||
all, err := store.List(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, all, 3)
|
||||
|
||||
gotIDs := map[string]runtime.Status{}
|
||||
for _, r := range all {
|
||||
gotIDs[r.GameID] = r.Status
|
||||
}
|
||||
assert.Equal(t, runtime.StatusRunning, gotIDs[a.GameID])
|
||||
assert.Equal(t, runtime.StatusStopped, gotIDs[b.GameID])
|
||||
assert.Equal(t, runtime.StatusRunning, gotIDs[c.GameID])
|
||||
}
|
||||
|
||||
func TestListReturnsNilWhenEmpty(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
all, err := store.List(ctx)
|
||||
require.NoError(t, err)
|
||||
assert.Nil(t, all)
|
||||
}
|
||||
|
||||
func TestCountByStatusReturnsAllBuckets(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
store := newStore(t)
|
||||
|
||||
a := runningRecord(t, "game-1", "container-1", "galaxy/game:v1.2.3")
|
||||
b := runningRecord(t, "game-2", "container-2", "galaxy/game:v1.2.3")
|
||||
c := runningRecord(t, "game-3", "container-3", "galaxy/game:v1.2.3")
|
||||
for _, r := range []runtime.RuntimeRecord{a, b, c} {
|
||||
require.NoError(t, store.Upsert(ctx, r))
|
||||
}
|
||||
require.NoError(t, store.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: b.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
To: runtime.StatusStopped,
|
||||
Now: b.LastOpAt.Add(time.Minute),
|
||||
}))
|
||||
|
||||
counts, err := store.CountByStatus(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
for _, status := range runtime.AllStatuses() {
|
||||
_, ok := counts[status]
|
||||
assert.True(t, ok, "status %q must appear in counts even when zero", status)
|
||||
}
|
||||
assert.Equal(t, 2, counts[runtime.StatusRunning])
|
||||
assert.Equal(t, 1, counts[runtime.StatusStopped])
|
||||
assert.Equal(t, 0, counts[runtime.StatusRemoved])
|
||||
}
|
||||
|
||||
func TestNewRejectsNilDB(t *testing.T) {
|
||||
_, err := runtimerecordstore.New(runtimerecordstore.Config{OperationTimeout: time.Second})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestNewRejectsNonPositiveTimeout(t *testing.T) {
|
||||
_, err := runtimerecordstore.New(runtimerecordstore.Config{
|
||||
DB: pgtest.Ensure(t).Pool(),
|
||||
})
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
// Package gamelease implements the Redis-backed adapter for
|
||||
// `ports.GameLeaseStore`.
|
||||
//
|
||||
// The lease guards every lifecycle operation Runtime Manager runs
|
||||
// against one game (start, stop, restart, patch, cleanup, plus the
|
||||
// reconciler's drift mutations). Acquisition uses `SET NX PX <ttl>`
|
||||
// with a random caller token; release runs a Lua compare-and-delete
|
||||
// so a holder that lost the lease through TTL expiry cannot wipe
|
||||
// another caller's claim.
|
||||
package gamelease
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/redisstate"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// releaseScript removes the per-game lease only when the supplied token
|
||||
// still owns it. Compare-and-delete prevents a TTL-expired holder from
|
||||
// clearing another caller's claim.
|
||||
var releaseScript = redis.NewScript(`
|
||||
if redis.call("GET", KEYS[1]) == ARGV[1] then
|
||||
return redis.call("DEL", KEYS[1])
|
||||
end
|
||||
return 0
|
||||
`)
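// Running GET and DEL inside one script matters: a client-side GET
// followed by a separate DEL would leave a window in which the key can
// expire and be re-acquired by another caller, whose fresh lease the
// stale DEL would then remove.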
|
||||
|
||||
// Config configures one Redis-backed game lease store instance. The
|
||||
// store does not own the redis client lifecycle; the caller (typically
|
||||
// the service runtime) opens and closes it.
|
||||
type Config struct {
|
||||
// Client stores the Redis client the store uses for every command.
|
||||
Client *redis.Client
|
||||
}
|
||||
|
||||
// Store persists the per-game lifecycle lease in Redis.
|
||||
type Store struct {
|
||||
client *redis.Client
|
||||
keys redisstate.Keyspace
|
||||
}
|
||||
|
||||
// New constructs one Redis-backed game lease store from cfg.
|
||||
func New(cfg Config) (*Store, error) {
|
||||
if cfg.Client == nil {
|
||||
return nil, errors.New("new rtmanager game lease store: nil redis client")
|
||||
}
|
||||
return &Store{
|
||||
client: cfg.Client,
|
||||
keys: redisstate.Keyspace{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// TryAcquire attempts to acquire the per-game lease for gameID owned by
|
||||
// token for ttl. The returned acquired flag is true on a successful claim and
|
||||
// false when another caller still owns the lease. A non-nil error
|
||||
// reports a transport failure and must not be confused with a missed
|
||||
// lease.
|
||||
func (store *Store) TryAcquire(ctx context.Context, gameID, token string, ttl time.Duration) (bool, error) {
|
||||
if store == nil || store.client == nil {
|
||||
return false, errors.New("try acquire game lease: nil store")
|
||||
}
|
||||
if ctx == nil {
|
||||
return false, errors.New("try acquire game lease: nil context")
|
||||
}
|
||||
if strings.TrimSpace(gameID) == "" {
|
||||
return false, errors.New("try acquire game lease: game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(token) == "" {
|
||||
return false, errors.New("try acquire game lease: token must not be empty")
|
||||
}
|
||||
if ttl <= 0 {
|
||||
return false, errors.New("try acquire game lease: ttl must be positive")
|
||||
}
|
||||
|
||||
acquired, err := store.client.SetNX(ctx, store.keys.GameLease(gameID), token, ttl).Result()
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("try acquire game lease: %w", err)
|
||||
}
|
||||
return acquired, nil
|
||||
}
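// For reference, the SetNX call above is the classic single-key lock
// acquisition; with a one-minute ttl it is roughly equivalent to this
// redis-cli invocation (key shown unencoded for readability):
//
//	SET rtmanager:game_lease:<gameID> <token> NX PX 60000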
|
||||
|
||||
// Release removes the per-game lease for gameID only when token still
|
||||
// matches the stored owner value. A token mismatch is a silent no-op.
|
||||
func (store *Store) Release(ctx context.Context, gameID, token string) error {
|
||||
if store == nil || store.client == nil {
|
||||
return errors.New("release game lease: nil store")
|
||||
}
|
||||
if ctx == nil {
|
||||
return errors.New("release game lease: nil context")
|
||||
}
|
||||
if strings.TrimSpace(gameID) == "" {
|
||||
return errors.New("release game lease: game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(token) == "" {
|
||||
return errors.New("release game lease: token must not be empty")
|
||||
}
|
||||
|
||||
if err := releaseScript.Run(
|
||||
ctx,
|
||||
store.client,
|
||||
[]string{store.keys.GameLease(gameID)},
|
||||
token,
|
||||
).Err(); err != nil {
|
||||
return fmt.Errorf("release game lease: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Compile-time assertion: Store implements ports.GameLeaseStore.
|
||||
var _ ports.GameLeaseStore = (*Store)(nil)
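// Typical call pattern (sketch only; newLeaseToken, runStop, and the
// ttl value are illustrative, not part of this commit): every lifecycle
// service is expected to wrap its mutation in TryAcquire / Release, and
// to treat a lost claim as "another operation is in flight":
//
//	token := newLeaseToken() // hypothetical helper returning a random string
//	acquired, err := leases.TryAcquire(ctx, gameID, token, 30*time.Second)
//	if err != nil {
//		return err
//	}
//	if !acquired {
//		return runtime.ErrConflict // hypothetical mapping chosen by the caller
//	}
//	defer func() { _ = leases.Release(ctx, gameID, token) }()
//	return runStop(ctx, gameID)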
|
||||
@@ -0,0 +1,133 @@
|
||||
package gamelease_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/redisstate"
|
||||
"galaxy/rtmanager/internal/adapters/redisstate/gamelease"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newLeaseStore(t *testing.T) (*gamelease.Store, *miniredis.Miniredis) {
|
||||
t.Helper()
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
store, err := gamelease.New(gamelease.Config{Client: client})
|
||||
require.NoError(t, err)
|
||||
return store, server
|
||||
}
|
||||
|
||||
func TestNewRejectsNilClient(t *testing.T) {
|
||||
_, err := gamelease.New(gamelease.Config{})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestTryAcquireSetsKeyAndTTL(t *testing.T) {
|
||||
store, server := newLeaseStore(t)
|
||||
|
||||
acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
|
||||
require.NoError(t, err)
|
||||
assert.True(t, acquired)
|
||||
|
||||
key := redisstate.Keyspace{}.GameLease("game-1")
|
||||
assert.True(t, server.Exists(key), "key %q must exist after TryAcquire", key)
|
||||
|
||||
stored, err := server.Get(key)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "token-A", stored)
|
||||
|
||||
// TTL must be positive (miniredis returns the remaining duration).
|
||||
ttl := server.TTL(key)
|
||||
assert.Greater(t, ttl, time.Duration(0))
|
||||
}
|
||||
|
||||
func TestTryAcquireReturnsFalseWhenAlreadyHeld(t *testing.T) {
|
||||
store, _ := newLeaseStore(t)
|
||||
|
||||
acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
|
||||
require.NoError(t, err)
|
||||
require.True(t, acquired)
|
||||
|
||||
acquired, err = store.TryAcquire(context.Background(), "game-1", "token-B", time.Minute)
|
||||
require.NoError(t, err)
|
||||
assert.False(t, acquired)
|
||||
}
|
||||
|
||||
func TestReleaseRemovesKeyForOwnerToken(t *testing.T) {
|
||||
store, server := newLeaseStore(t)
|
||||
|
||||
_, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, store.Release(context.Background(), "game-1", "token-A"))
|
||||
|
||||
key := redisstate.Keyspace{}.GameLease("game-1")
|
||||
assert.False(t, server.Exists(key), "key %q must be deleted after Release", key)
|
||||
}
|
||||
|
||||
func TestReleaseIsNoOpForForeignToken(t *testing.T) {
|
||||
store, server := newLeaseStore(t)
|
||||
|
||||
_, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, store.Release(context.Background(), "game-1", "token-B"))
|
||||
|
||||
key := redisstate.Keyspace{}.GameLease("game-1")
|
||||
assert.True(t, server.Exists(key), "key %q must still exist when foreign token is released", key)
|
||||
|
||||
stored, err := server.Get(key)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "token-A", stored)
|
||||
}
|
||||
|
||||
func TestTryAcquireSucceedsAfterTTLExpiry(t *testing.T) {
|
||||
store, server := newLeaseStore(t)
|
||||
|
||||
acquired, err := store.TryAcquire(context.Background(), "game-1", "token-A", time.Minute)
|
||||
require.NoError(t, err)
|
||||
require.True(t, acquired)
|
||||
|
||||
server.FastForward(2 * time.Minute)
|
||||
|
||||
acquired, err = store.TryAcquire(context.Background(), "game-1", "token-B", time.Minute)
|
||||
require.NoError(t, err)
|
||||
assert.True(t, acquired)
|
||||
}
|
||||
|
||||
func TestTryAcquireRejectsInvalidArguments(t *testing.T) {
|
||||
store, _ := newLeaseStore(t)
|
||||
|
||||
_, err := store.TryAcquire(context.Background(), "", "token", time.Minute)
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = store.TryAcquire(context.Background(), "game-1", "", time.Minute)
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = store.TryAcquire(context.Background(), "game-1", "token", 0)
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestReleaseRejectsInvalidArguments(t *testing.T) {
|
||||
store, _ := newLeaseStore(t)
|
||||
|
||||
require.Error(t, store.Release(context.Background(), "", "token"))
|
||||
require.Error(t, store.Release(context.Background(), "game-1", ""))
|
||||
}
|
||||
|
||||
func TestKeyspaceGameLeaseIsPrefixedAndEncoded(t *testing.T) {
|
||||
key := redisstate.Keyspace{}.GameLease("game with spaces")
|
||||
assert.NotEmpty(t, key)
|
||||
assert.Contains(t, key, "rtmanager:game_lease:")
|
||||
suffix := key[len("rtmanager:game_lease:"):]
|
||||
// base64url-encoded suffix must not contain the original spaces.
|
||||
assert.NotContains(t, suffix, " ")
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
// Package redisstate hosts the Runtime Manager Redis adapters that share
|
||||
// a single keyspace. Each sibling subpackage (e.g. `streamoffsets`)
|
||||
// implements one port and uses Keyspace to compose its keys, so the
|
||||
// Redis namespace stays documented in one place, under one prefix.
|
||||
//
|
||||
// The package itself only declares the keyspace; concrete stores live in
|
||||
// nested packages so dependencies (testcontainers, miniredis) stay out
|
||||
// of consumer build graphs that do not need them.
|
||||
package redisstate
|
||||
|
||||
import "encoding/base64"
|
||||
|
||||
// defaultPrefix is the mandatory `rtmanager:` namespace prefix shared by
|
||||
// every Runtime Manager Redis key.
|
||||
const defaultPrefix = "rtmanager:"
|
||||
|
||||
// Keyspace builds the Runtime Manager Redis keys. The namespace covers
|
||||
// the stream consumer offsets and the per-game lifecycle lease in v1.
|
||||
//
|
||||
// Dynamic key segments are encoded with base64url so raw key structure
|
||||
// does not depend on caller-provided characters; this matches the
|
||||
// encoding chosen by `lobby/internal/adapters/redisstate.Keyspace`.
|
||||
type Keyspace struct{}
|
||||
|
||||
// StreamOffset returns the Redis key that stores the last successfully
|
||||
// processed entry id for one Redis Stream consumer. The streamLabel is
|
||||
// the short logical identifier of the consumer (e.g. `start_jobs`,
|
||||
// `stop_jobs`), not the full stream name; it stays stable when the
|
||||
// underlying stream key is renamed.
|
||||
func (Keyspace) StreamOffset(streamLabel string) string {
|
||||
return defaultPrefix + "stream_offsets:" + encodeKeyComponent(streamLabel)
|
||||
}
|
||||
|
||||
// GameLease returns the Redis key that stores the per-game lifecycle
|
||||
// lease guarding start / stop / restart / patch / cleanup operations
|
||||
// against the same game. The gameID is base64url-encoded so callers can
|
||||
// pass any opaque identifier without escaping raw key characters.
|
||||
func (Keyspace) GameLease(gameID string) string {
|
||||
return defaultPrefix + "game_lease:" + encodeKeyComponent(gameID)
|
||||
}
|
||||
|
||||
func encodeKeyComponent(value string) string {
|
||||
return base64.RawURLEncoding.EncodeToString([]byte(value))
|
||||
}
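// Example key (unpadded base64url, computed for illustration):
//
//	Keyspace{}.GameLease("game-42") // → "rtmanager:game_lease:Z2FtZS00Mg"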
|
||||
@@ -0,0 +1,94 @@
|
||||
// Package streamoffsets implements the Redis-backed adapter for
|
||||
// `ports.StreamOffsetStore`.
|
||||
//
|
||||
// The start-jobs and stop-jobs consumers call Load on startup to
|
||||
// resume from the persisted offset and Save after each successfully
// handled message. Keys are produced by
|
||||
// `redisstate.Keyspace.StreamOffset`, mirroring the lobby pattern.
|
||||
package streamoffsets
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/redisstate"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// Config configures one Redis-backed stream-offset store instance. The
|
||||
// store does not own the redis client lifecycle; the caller (typically
|
||||
// the service runtime) opens and closes it.
|
||||
type Config struct {
|
||||
// Client stores the Redis client the store uses for every command.
|
||||
Client *redis.Client
|
||||
}
|
||||
|
||||
// Store persists Runtime Manager stream consumer offsets in Redis.
|
||||
type Store struct {
|
||||
client *redis.Client
|
||||
keys redisstate.Keyspace
|
||||
}
|
||||
|
||||
// New constructs one Redis-backed stream-offset store from cfg.
|
||||
func New(cfg Config) (*Store, error) {
|
||||
if cfg.Client == nil {
|
||||
return nil, errors.New("new rtmanager stream offset store: nil redis client")
|
||||
}
|
||||
return &Store{
|
||||
client: cfg.Client,
|
||||
keys: redisstate.Keyspace{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Load returns the last processed entry id for streamLabel when one is
|
||||
// stored. A missing key returns ("", false, nil).
|
||||
func (store *Store) Load(ctx context.Context, streamLabel string) (string, bool, error) {
|
||||
if store == nil || store.client == nil {
|
||||
return "", false, errors.New("load rtmanager stream offset: nil store")
|
||||
}
|
||||
if ctx == nil {
|
||||
return "", false, errors.New("load rtmanager stream offset: nil context")
|
||||
}
|
||||
if strings.TrimSpace(streamLabel) == "" {
|
||||
return "", false, errors.New("load rtmanager stream offset: stream label must not be empty")
|
||||
}
|
||||
|
||||
value, err := store.client.Get(ctx, store.keys.StreamOffset(streamLabel)).Result()
|
||||
switch {
|
||||
case errors.Is(err, redis.Nil):
|
||||
return "", false, nil
|
||||
case err != nil:
|
||||
return "", false, fmt.Errorf("load rtmanager stream offset: %w", err)
|
||||
}
|
||||
return value, true, nil
|
||||
}
|
||||
|
||||
// Save stores entryID as the new offset for streamLabel. The key has no
|
||||
// TTL — offsets are durable and only overwritten by subsequent Saves.
|
||||
func (store *Store) Save(ctx context.Context, streamLabel, entryID string) error {
|
||||
if store == nil || store.client == nil {
|
||||
return errors.New("save rtmanager stream offset: nil store")
|
||||
}
|
||||
if ctx == nil {
|
||||
return errors.New("save rtmanager stream offset: nil context")
|
||||
}
|
||||
if strings.TrimSpace(streamLabel) == "" {
|
||||
return errors.New("save rtmanager stream offset: stream label must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(entryID) == "" {
|
||||
return errors.New("save rtmanager stream offset: entry id must not be empty")
|
||||
}
|
||||
|
||||
if err := store.client.Set(ctx, store.keys.StreamOffset(streamLabel), entryID, 0).Err(); err != nil {
|
||||
return fmt.Errorf("save rtmanager stream offset: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Ensure Store satisfies the ports.StreamOffsetStore interface at
|
||||
// compile time.
|
||||
var _ ports.StreamOffsetStore = (*Store)(nil)
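// Expected consumer shape (sketch only; readNext and handle are
// hypothetical stand-ins for the Redis Stream read and the job handler):
//
//	offset, found, err := offsets.Load(ctx, "start_jobs")
//	if err != nil {
//		return err
//	}
//	if !found {
//		offset = "0-0" // start from the beginning of the stream
//	}
//	for {
//		entryID, msg, err := readNext(ctx, offset)
//		if err != nil {
//			return err
//		}
//		if err := handle(ctx, msg); err != nil {
//			return err
//		}
//		if err := offsets.Save(ctx, "start_jobs", entryID); err != nil {
//			return err
//		}
//		offset = entryID
//	}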
|
||||
@@ -0,0 +1,86 @@
|
||||
package streamoffsets_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/redisstate"
|
||||
"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newOffsetStore(t *testing.T) (*streamoffsets.Store, *miniredis.Miniredis) {
|
||||
t.Helper()
|
||||
server := miniredis.RunT(t)
|
||||
client := redis.NewClient(&redis.Options{Addr: server.Addr()})
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
store, err := streamoffsets.New(streamoffsets.Config{Client: client})
|
||||
require.NoError(t, err)
|
||||
return store, server
|
||||
}
|
||||
|
||||
func TestNewRejectsNilClient(t *testing.T) {
|
||||
_, err := streamoffsets.New(streamoffsets.Config{})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestLoadMissingReturnsNotFound(t *testing.T) {
|
||||
store, _ := newOffsetStore(t)
|
||||
|
||||
id, found, err := store.Load(context.Background(), "start_jobs")
|
||||
require.NoError(t, err)
|
||||
assert.False(t, found)
|
||||
assert.Empty(t, id)
|
||||
}
|
||||
|
||||
func TestSaveLoadRoundTrip(t *testing.T) {
|
||||
store, server := newOffsetStore(t)
|
||||
|
||||
require.NoError(t, store.Save(context.Background(), "start_jobs", "1700000000000-0"))
|
||||
|
||||
id, found, err := store.Load(context.Background(), "start_jobs")
|
||||
require.NoError(t, err)
|
||||
assert.True(t, found)
|
||||
assert.Equal(t, "1700000000000-0", id)
|
||||
|
||||
// The persisted key must follow the rtmanager keyspace prefix.
|
||||
expectedKey := redisstate.Keyspace{}.StreamOffset("start_jobs")
|
||||
assert.True(t, server.Exists(expectedKey),
|
||||
"key %q must exist after Save", expectedKey)
|
||||
}
|
||||
|
||||
func TestSaveOverwritesPriorValue(t *testing.T) {
|
||||
store, _ := newOffsetStore(t)
|
||||
|
||||
require.NoError(t, store.Save(context.Background(), "start_jobs", "100-0"))
|
||||
require.NoError(t, store.Save(context.Background(), "start_jobs", "200-0"))
|
||||
|
||||
id, found, err := store.Load(context.Background(), "start_jobs")
|
||||
require.NoError(t, err)
|
||||
assert.True(t, found)
|
||||
assert.Equal(t, "200-0", id)
|
||||
}
|
||||
|
||||
func TestLoadAndSaveRejectInvalidArguments(t *testing.T) {
|
||||
store, _ := newOffsetStore(t)
|
||||
|
||||
require.Error(t, store.Save(context.Background(), "", "100-0"))
|
||||
require.Error(t, store.Save(context.Background(), "start_jobs", ""))
|
||||
|
||||
_, _, err := store.Load(context.Background(), "")
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestKeyspaceStreamOffsetIsPrefixed(t *testing.T) {
|
||||
key := redisstate.Keyspace{}.StreamOffset("start_jobs")
|
||||
assert.NotEmpty(t, key)
|
||||
assert.Contains(t, key, "rtmanager:stream_offsets:")
|
||||
// base64url-encoded label must not contain raw colons or spaces.
|
||||
suffix := key[len("rtmanager:stream_offsets:"):]
|
||||
assert.NotContains(t, suffix, ":")
|
||||
}
|
||||
@@ -0,0 +1,367 @@
|
||||
package internalhttp
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/api/internalhttp/handlers"
|
||||
domainruntime "galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/cleanupcontainer"
|
||||
"galaxy/rtmanager/internal/service/patchruntime"
|
||||
"galaxy/rtmanager/internal/service/restartruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
|
||||
"github.com/getkin/kin-openapi/openapi3"
|
||||
"github.com/getkin/kin-openapi/openapi3filter"
|
||||
"github.com/getkin/kin-openapi/routers"
|
||||
"github.com/getkin/kin-openapi/routers/legacy"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestInternalRESTConformance loads the OpenAPI specification, drives
|
||||
// every runtime operation against the live internal HTTP listener
|
||||
// backed by stub services, and validates each response body against
|
||||
// the spec via `openapi3filter.ValidateResponse`. The test catches
|
||||
// drift between the wire shape produced by the handler layer and the
|
||||
// frozen contract; failure-path response shapes are validated by the
|
||||
// per-handler tests in `handlers/<op>_test.go`.
|
||||
func TestInternalRESTConformance(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
doc := loadConformanceSpec(t)
|
||||
|
||||
router, err := legacy.NewRouter(doc)
|
||||
require.NoError(t, err)
|
||||
|
||||
deps := newConformanceDeps(t)
|
||||
server, err := NewServer(newConformanceConfig(), Dependencies{
|
||||
Logger: nil,
|
||||
Telemetry: nil,
|
||||
Readiness: nil,
|
||||
RuntimeRecords: deps.records,
|
||||
StartRuntime: deps.start,
|
||||
StopRuntime: deps.stop,
|
||||
RestartRuntime: deps.restart,
|
||||
PatchRuntime: deps.patch,
|
||||
CleanupContainer: deps.cleanup,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
cases := []conformanceCase{
|
||||
{
|
||||
name: "internalListRuntimes",
|
||||
method: http.MethodGet,
|
||||
path: "/api/v1/internal/runtimes",
|
||||
},
|
||||
{
|
||||
name: "internalGetRuntime",
|
||||
method: http.MethodGet,
|
||||
path: "/api/v1/internal/runtimes/" + conformanceGameID,
|
||||
},
|
||||
{
|
||||
name: "internalStartRuntime",
|
||||
method: http.MethodPost,
|
||||
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/start",
|
||||
contentType: "application/json",
|
||||
body: `{"image_ref":"galaxy/game:v1.2.3"}`,
|
||||
},
|
||||
{
|
||||
name: "internalStopRuntime",
|
||||
method: http.MethodPost,
|
||||
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/stop",
|
||||
contentType: "application/json",
|
||||
body: `{"reason":"admin_request"}`,
|
||||
},
|
||||
{
|
||||
name: "internalRestartRuntime",
|
||||
method: http.MethodPost,
|
||||
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/restart",
|
||||
},
|
||||
{
|
||||
name: "internalPatchRuntime",
|
||||
method: http.MethodPost,
|
||||
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/patch",
|
||||
contentType: "application/json",
|
||||
body: `{"image_ref":"galaxy/game:v1.2.4"}`,
|
||||
},
|
||||
{
|
||||
name: "internalCleanupRuntimeContainer",
|
||||
method: http.MethodDelete,
|
||||
path: "/api/v1/internal/runtimes/" + conformanceGameID + "/container",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
runConformanceCase(t, server.handler, router, tc)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// conformanceGameID is the path variable used for every per-game
|
||||
// conformance request.
|
||||
const conformanceGameID = "game-conformance"
|
||||
|
||||
// conformanceServerURL mirrors the canonical `servers[0].url` entry in
|
||||
// `rtmanager/api/internal-openapi.yaml`. The legacy router matches
|
||||
// requests against this prefix; updating the spec's server URL
|
||||
// requires updating this constant.
|
||||
const conformanceServerURL = "http://localhost:8096"
|
||||
|
||||
// conformanceCase describes one request the conformance test drives.
|
||||
type conformanceCase struct {
|
||||
name string
|
||||
method string
|
||||
path string
|
||||
contentType string
|
||||
body string
|
||||
}
|
||||
|
||||
func runConformanceCase(t *testing.T, handler http.Handler, router routers.Router, tc conformanceCase) {
|
||||
t.Helper()
|
||||
|
||||
// Drive the handler with the path-only form so the listener's
|
||||
// http.ServeMux matches the registered routes (which use raw paths,
|
||||
// without the OpenAPI server URL prefix).
|
||||
var bodyReader io.Reader
|
||||
if tc.body != "" {
|
||||
bodyReader = strings.NewReader(tc.body)
|
||||
}
|
||||
request := httptest.NewRequest(tc.method, tc.path, bodyReader)
|
||||
if tc.contentType != "" {
|
||||
request.Header.Set("Content-Type", tc.contentType)
|
||||
}
|
||||
request.Header.Set("X-Galaxy-Caller", "admin")
|
||||
|
||||
recorder := httptest.NewRecorder()
|
||||
handler.ServeHTTP(recorder, request)
|
||||
require.Equalf(t, http.StatusOK, recorder.Code, "operation %s returned %d: %s", tc.name, recorder.Code, recorder.Body.String())
|
||||
|
||||
// kin-openapi's legacy router requires the request URL to match a
|
||||
// `servers[].url` entry; rebuild the validation request with the
|
||||
// canonical local server URL declared in the spec.
|
||||
validationURL := conformanceServerURL + tc.path
|
||||
validationRequest := httptest.NewRequest(tc.method, validationURL, bodyReaderFor(tc.body))
|
||||
if tc.contentType != "" {
|
||||
validationRequest.Header.Set("Content-Type", tc.contentType)
|
||||
}
|
||||
validationRequest.Header.Set("X-Galaxy-Caller", "admin")
|
||||
|
||||
route, pathParams, err := router.FindRoute(validationRequest)
|
||||
require.NoError(t, err)
|
||||
|
||||
requestInput := &openapi3filter.RequestValidationInput{
|
||||
Request: validationRequest,
|
||||
PathParams: pathParams,
|
||||
Route: route,
|
||||
Options: &openapi3filter.Options{
|
||||
IncludeResponseStatus: true,
|
||||
},
|
||||
}
|
||||
require.NoError(t, openapi3filter.ValidateRequest(context.Background(), requestInput))
|
||||
|
||||
responseInput := &openapi3filter.ResponseValidationInput{
|
||||
RequestValidationInput: requestInput,
|
||||
Status: recorder.Code,
|
||||
Header: recorder.Header(),
|
||||
Options: &openapi3filter.Options{
|
||||
IncludeResponseStatus: true,
|
||||
},
|
||||
}
|
||||
responseInput.SetBodyBytes(recorder.Body.Bytes())
|
||||
require.NoError(t, openapi3filter.ValidateResponse(context.Background(), responseInput))
|
||||
}
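
// The same logical request is therefore expressed twice; for the restart case
// the two forms look like this (values taken from the case table above):
//
//	// served by the listener's mux (raw path, no server URL prefix)
//	httptest.NewRequest(http.MethodPost, "/api/v1/internal/runtimes/"+conformanceGameID+"/restart", nil)
//	// handed to the kin-openapi router for validation (spec server URL prefix)
//	httptest.NewRequest(http.MethodPost, conformanceServerURL+"/api/v1/internal/runtimes/"+conformanceGameID+"/restart", nil)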
|
||||
|
||||
func loadConformanceSpec(t *testing.T) *openapi3.T {
|
||||
t.Helper()
|
||||
|
||||
_, thisFile, _, ok := runtime.Caller(0)
|
||||
require.True(t, ok)
|
||||
|
||||
specPath := filepath.Join(filepath.Dir(thisFile), "..", "..", "..", "api", "internal-openapi.yaml")
|
||||
loader := openapi3.NewLoader()
|
||||
doc, err := loader.LoadFromFile(specPath)
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, doc.Validate(context.Background()))
|
||||
return doc
|
||||
}
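
// Building the routers.Router handed to runConformanceCase is then a thin
// wrapper over the loaded spec. A minimal sketch, assuming kin-openapi's
// legacy router package is the one in use (the import alias is an assumption):
//
//	doc := loadConformanceSpec(t)
//	router, err := legacyrouter.NewRouter(doc) // legacyrouter = "github.com/getkin/kin-openapi/routers/legacy"
//	require.NoError(t, err)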
|
||||
|
||||
func bodyReaderFor(raw string) io.Reader {
|
||||
if raw == "" {
|
||||
return http.NoBody
|
||||
}
|
||||
return bytes.NewBufferString(raw)
|
||||
}
|
||||
|
||||
// conformanceDeps groups the stub collaborators handed to the listener.
|
||||
type conformanceDeps struct {
|
||||
records *conformanceRecords
|
||||
start *conformanceStart
|
||||
stop *conformanceStop
|
||||
restart *conformanceRestart
|
||||
patch *conformancePatch
|
||||
cleanup *conformanceCleanup
|
||||
}
|
||||
|
||||
func newConformanceDeps(t *testing.T) *conformanceDeps {
|
||||
t.Helper()
|
||||
return &conformanceDeps{
|
||||
records: newConformanceRecords(),
|
||||
start: &conformanceStart{},
|
||||
stop: &conformanceStop{},
|
||||
restart: &conformanceRestart{},
|
||||
patch: &conformancePatch{},
|
||||
cleanup: &conformanceCleanup{},
|
||||
}
|
||||
}
|
||||
|
||||
func newConformanceConfig() Config {
|
||||
return Config{
|
||||
Addr: ":0",
|
||||
ReadHeaderTimeout: time.Second,
|
||||
ReadTimeout: time.Second,
|
||||
WriteTimeout: time.Second,
|
||||
IdleTimeout: time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
// conformanceRecord builds a canonical running record used by every
|
||||
// stub service.
|
||||
func conformanceRecord() domainruntime.RuntimeRecord {
|
||||
started := time.Date(2026, 4, 26, 13, 0, 0, 0, time.UTC)
|
||||
return domainruntime.RuntimeRecord{
|
||||
GameID: conformanceGameID,
|
||||
Status: domainruntime.StatusRunning,
|
||||
CurrentContainerID: "container-conformance",
|
||||
CurrentImageRef: "galaxy/game:v1.2.3",
|
||||
EngineEndpoint: "http://galaxy-game-" + conformanceGameID + ":8080",
|
||||
StatePath: "/var/lib/galaxy/" + conformanceGameID,
|
||||
DockerNetwork: "galaxy-engine",
|
||||
StartedAt: &started,
|
||||
LastOpAt: started,
|
||||
CreatedAt: started,
|
||||
}
|
||||
}
|
||||
|
||||
// conformanceRecords is an in-memory record store seeded with one
|
||||
// canonical record so the get / list endpoints have something to
|
||||
// return.
|
||||
type conformanceRecords struct {
|
||||
mu sync.Mutex
|
||||
stored map[string]domainruntime.RuntimeRecord
|
||||
}
|
||||
|
||||
func newConformanceRecords() *conformanceRecords {
|
||||
return &conformanceRecords{
|
||||
stored: map[string]domainruntime.RuntimeRecord{
|
||||
conformanceGameID: conformanceRecord(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *conformanceRecords) Get(_ context.Context, gameID string) (domainruntime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return domainruntime.RuntimeRecord{}, domainruntime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *conformanceRecords) Upsert(_ context.Context, _ domainruntime.RuntimeRecord) error {
|
||||
return errors.New("not used in conformance test")
|
||||
}
|
||||
|
||||
func (s *conformanceRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
|
||||
return errors.New("not used in conformance test")
|
||||
}
|
||||
|
||||
func (s *conformanceRecords) ListByStatus(_ context.Context, _ domainruntime.Status) ([]domainruntime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in conformance test")
|
||||
}
|
||||
|
||||
func (s *conformanceRecords) List(_ context.Context) ([]domainruntime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
out := make([]domainruntime.RuntimeRecord, 0, len(s.stored))
|
||||
for _, record := range s.stored {
|
||||
out = append(out, record)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// conformanceStart is the stub StartService used by the conformance
|
||||
// test. Every Handle call returns the canonical record.
|
||||
type conformanceStart struct{}
|
||||
|
||||
func (s *conformanceStart) Handle(_ context.Context, _ startruntime.Input) (startruntime.Result, error) {
|
||||
return startruntime.Result{
|
||||
Record: conformanceRecord(),
|
||||
Outcome: "success",
|
||||
}, nil
|
||||
}
|
||||
|
||||
type conformanceStop struct{}
|
||||
|
||||
func (s *conformanceStop) Handle(_ context.Context, _ stopruntime.Input) (stopruntime.Result, error) {
|
||||
rec := conformanceRecord()
|
||||
rec.Status = domainruntime.StatusStopped
|
||||
stopped := rec.LastOpAt.Add(time.Second)
|
||||
rec.StoppedAt = &stopped
|
||||
rec.LastOpAt = stopped
|
||||
return stopruntime.Result{Record: rec, Outcome: "success"}, nil
|
||||
}
|
||||
|
||||
type conformanceRestart struct{}
|
||||
|
||||
func (s *conformanceRestart) Handle(_ context.Context, _ restartruntime.Input) (restartruntime.Result, error) {
|
||||
return restartruntime.Result{Record: conformanceRecord(), Outcome: "success"}, nil
|
||||
}
|
||||
|
||||
type conformancePatch struct{}
|
||||
|
||||
func (s *conformancePatch) Handle(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) {
|
||||
rec := conformanceRecord()
|
||||
if in.NewImageRef != "" {
|
||||
rec.CurrentImageRef = in.NewImageRef
|
||||
}
|
||||
return patchruntime.Result{Record: rec, Outcome: "success"}, nil
|
||||
}
|
||||
|
||||
type conformanceCleanup struct{}
|
||||
|
||||
func (s *conformanceCleanup) Handle(_ context.Context, _ cleanupcontainer.Input) (cleanupcontainer.Result, error) {
|
||||
rec := conformanceRecord()
|
||||
rec.Status = domainruntime.StatusRemoved
|
||||
rec.CurrentContainerID = ""
|
||||
removed := rec.LastOpAt.Add(time.Minute)
|
||||
rec.RemovedAt = &removed
|
||||
rec.LastOpAt = removed
|
||||
return cleanupcontainer.Result{Record: rec, Outcome: "success"}, nil
|
||||
}
|
||||
|
||||
// Compile-time guards: the stubs must satisfy the handler-level
|
||||
// service ports plus ports.RuntimeRecordStore so the listener accepts
|
||||
// them.
|
||||
var (
|
||||
_ handlers.StartService = (*conformanceStart)(nil)
|
||||
_ handlers.StopService = (*conformanceStop)(nil)
|
||||
_ handlers.RestartService = (*conformanceRestart)(nil)
|
||||
_ handlers.PatchService = (*conformancePatch)(nil)
|
||||
_ handlers.CleanupService = (*conformanceCleanup)(nil)
|
||||
_ ports.RuntimeRecordStore = (*conformanceRecords)(nil)
|
||||
)
|
||||
@@ -0,0 +1,55 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/service/cleanupcontainer"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
)
|
||||
|
||||
// newCleanupHandler returns the handler for
|
||||
// `DELETE /api/v1/internal/runtimes/{game_id}/container`. The OpenAPI
|
||||
// spec declares no request body for this operation; any client-provided
|
||||
// body is ignored.
|
||||
func newCleanupHandler(deps Dependencies) http.HandlerFunc {
|
||||
logger := loggerFor(deps.Logger, "internal_rest.cleanup")
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if deps.CleanupContainer == nil {
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"cleanup container service is not wired",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
gameID, ok := extractGameID(writer, request)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
result, err := deps.CleanupContainer.Handle(request.Context(), cleanupcontainer.Input{
|
||||
GameID: gameID,
|
||||
OpSource: resolveOpSource(request),
|
||||
SourceRef: requestSourceRef(request),
|
||||
})
|
||||
if err != nil {
|
||||
logger.ErrorContext(request.Context(), "cleanup container service errored",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"cleanup container service failed",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
if result.Outcome == operation.OutcomeFailure {
|
||||
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,238 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
)
|
||||
|
||||
// JSONContentType is the Content-Type used by every internal REST
|
||||
// response. Exported so the listener-level tests can match it without
|
||||
// re-declaring the constant.
|
||||
const JSONContentType = "application/json; charset=utf-8"
|
||||
|
||||
// gameIDPathParam is the name of the {game_id} path variable shared by
|
||||
// every per-game runtime endpoint.
|
||||
const gameIDPathParam = "game_id"
|
||||
|
||||
// callerHeader is the HTTP header that distinguishes Game Master from
|
||||
// Admin Service in the operation log. Documented in
|
||||
// `rtmanager/api/internal-openapi.yaml` and
|
||||
// `rtmanager/docs/services.md` §18.
|
||||
const callerHeader = "X-Galaxy-Caller"
|
||||
|
||||
// errorCodeDockerUnavailable mirrors the OpenAPI error code value. The
|
||||
// lifecycle services do not currently emit it (they use
|
||||
// `service_unavailable` for Docker daemon failures); the handler layer
|
||||
// maps it to 503 anyway so future producers do not require a handler
|
||||
// change.
|
||||
const errorCodeDockerUnavailable = "docker_unavailable"
|
||||
|
||||
// errorBody mirrors the `error` element of the OpenAPI ErrorResponse
|
||||
// schema.
|
||||
type errorBody struct {
|
||||
Code string `json:"code"`
|
||||
Message string `json:"message"`
|
||||
}
|
||||
|
||||
// errorResponse mirrors the OpenAPI ErrorResponse envelope.
|
||||
type errorResponse struct {
|
||||
Error errorBody `json:"error"`
|
||||
}
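
// Serialised by writeError below, the envelope looks like, for example:
//
//	{"error":{"code":"not_found","message":"runtime record not found"}}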
|
||||
|
||||
// runtimeRecordResponse mirrors the OpenAPI RuntimeRecord schema.
|
||||
// Required fields use plain strings; nullable fields use pointers so an
|
||||
// absent value encodes as the JSON literal `null` (matches the
|
||||
// `nullable: true` declaration in the spec). Times are RFC3339 (with
// nanosecond precision) in UTC.
|
||||
type runtimeRecordResponse struct {
|
||||
GameID string `json:"game_id"`
|
||||
Status string `json:"status"`
|
||||
CurrentContainerID *string `json:"current_container_id"`
|
||||
CurrentImageRef *string `json:"current_image_ref"`
|
||||
EngineEndpoint *string `json:"engine_endpoint"`
|
||||
StatePath string `json:"state_path"`
|
||||
DockerNetwork string `json:"docker_network"`
|
||||
StartedAt *string `json:"started_at"`
|
||||
StoppedAt *string `json:"stopped_at"`
|
||||
RemovedAt *string `json:"removed_at"`
|
||||
LastOpAt string `json:"last_op_at"`
|
||||
CreatedAt string `json:"created_at"`
|
||||
}
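
// For a running record that has never been stopped or removed, the wire form
// looks like this (illustrative values, taken from the test fixtures in this
// package; unset nullable fields encode as JSON null):
//
//	{
//	  "game_id": "game-test",
//	  "status": "running",
//	  "current_container_id": "container-test",
//	  "current_image_ref": "galaxy/game:v1.2.3",
//	  "engine_endpoint": "http://galaxy-game-game-test:8080",
//	  "state_path": "/var/lib/galaxy/game-test",
//	  "docker_network": "galaxy-engine",
//	  "started_at": "2026-04-26T13:00:00Z",
//	  "stopped_at": null,
//	  "removed_at": null,
//	  "last_op_at": "2026-04-26T13:00:00Z",
//	  "created_at": "2026-04-26T13:00:00Z"
//	}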
|
||||
|
||||
// runtimesListResponse mirrors the OpenAPI RuntimesList schema. Items
|
||||
// is always non-nil so the JSON form carries `[]` rather than `null`
|
||||
// for an empty result.
|
||||
type runtimesListResponse struct {
|
||||
Items []runtimeRecordResponse `json:"items"`
|
||||
}
|
||||
|
||||
// encodeRuntimeRecord turns a domain RuntimeRecord into its wire shape.
|
||||
func encodeRuntimeRecord(record runtime.RuntimeRecord) runtimeRecordResponse {
|
||||
resp := runtimeRecordResponse{
|
||||
GameID: record.GameID,
|
||||
Status: string(record.Status),
|
||||
StatePath: record.StatePath,
|
||||
DockerNetwork: record.DockerNetwork,
|
||||
LastOpAt: record.LastOpAt.UTC().Format(time.RFC3339Nano),
|
||||
CreatedAt: record.CreatedAt.UTC().Format(time.RFC3339Nano),
|
||||
}
|
||||
if record.CurrentContainerID != "" {
|
||||
v := record.CurrentContainerID
|
||||
resp.CurrentContainerID = &v
|
||||
}
|
||||
if record.CurrentImageRef != "" {
|
||||
v := record.CurrentImageRef
|
||||
resp.CurrentImageRef = &v
|
||||
}
|
||||
if record.EngineEndpoint != "" {
|
||||
v := record.EngineEndpoint
|
||||
resp.EngineEndpoint = &v
|
||||
}
|
||||
if record.StartedAt != nil {
|
||||
v := record.StartedAt.UTC().Format(time.RFC3339Nano)
|
||||
resp.StartedAt = &v
|
||||
}
|
||||
if record.StoppedAt != nil {
|
||||
v := record.StoppedAt.UTC().Format(time.RFC3339Nano)
|
||||
resp.StoppedAt = &v
|
||||
}
|
||||
if record.RemovedAt != nil {
|
||||
v := record.RemovedAt.UTC().Format(time.RFC3339Nano)
|
||||
resp.RemovedAt = &v
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// encodeRuntimesList builds the wire shape returned by the list handler.
|
||||
// records may be nil (empty store); the result still carries an empty
|
||||
// items slice so the JSON form is `{"items":[]}`.
|
||||
func encodeRuntimesList(records []runtime.RuntimeRecord) runtimesListResponse {
|
||||
resp := runtimesListResponse{
|
||||
Items: make([]runtimeRecordResponse, 0, len(records)),
|
||||
}
|
||||
for _, record := range records {
|
||||
resp.Items = append(resp.Items, encodeRuntimeRecord(record))
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// writeJSON writes payload as a JSON response with the given status code.
|
||||
func writeJSON(writer http.ResponseWriter, statusCode int, payload any) {
|
||||
writer.Header().Set("Content-Type", JSONContentType)
|
||||
writer.WriteHeader(statusCode)
|
||||
_ = json.NewEncoder(writer).Encode(payload)
|
||||
}
|
||||
|
||||
// writeError writes the canonical error envelope at statusCode.
|
||||
func writeError(writer http.ResponseWriter, statusCode int, code, message string) {
|
||||
writeJSON(writer, statusCode, errorResponse{
|
||||
Error: errorBody{Code: code, Message: message},
|
||||
})
|
||||
}
|
||||
|
||||
// writeFailure writes the canonical error envelope using the HTTP
|
||||
// status mapped from code. Used by every lifecycle handler when its
|
||||
// service returns `Outcome=failure`.
|
||||
func writeFailure(writer http.ResponseWriter, code, message string) {
|
||||
writeError(writer, mapErrorCodeToStatus(code), code, message)
|
||||
}
|
||||
|
||||
// mapErrorCodeToStatus maps a stable error code to the HTTP status
|
||||
// declared by `rtmanager/api/internal-openapi.yaml`. Unknown codes
|
||||
// degrade to 500 so a future error code that ships ahead of its
|
||||
// handler-layer mapping still produces a structurally valid response.
|
||||
func mapErrorCodeToStatus(code string) int {
|
||||
switch code {
|
||||
case startruntime.ErrorCodeInvalidRequest,
|
||||
startruntime.ErrorCodeStartConfigInvalid,
|
||||
startruntime.ErrorCodeImageRefNotSemver:
|
||||
return http.StatusBadRequest
|
||||
case startruntime.ErrorCodeNotFound:
|
||||
return http.StatusNotFound
|
||||
case startruntime.ErrorCodeConflict,
|
||||
startruntime.ErrorCodeSemverPatchOnly:
|
||||
return http.StatusConflict
|
||||
case startruntime.ErrorCodeServiceUnavailable,
|
||||
errorCodeDockerUnavailable:
|
||||
return http.StatusServiceUnavailable
|
||||
case startruntime.ErrorCodeImagePullFailed,
|
||||
startruntime.ErrorCodeContainerStartFailed,
|
||||
startruntime.ErrorCodeInternal:
|
||||
return http.StatusInternalServerError
|
||||
default:
|
||||
return http.StatusInternalServerError
|
||||
}
|
||||
}
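
// Summarised, the mapping above reads:
//
//	ErrorCodeInvalidRequest, ErrorCodeStartConfigInvalid, ErrorCodeImageRefNotSemver -> 400
//	ErrorCodeNotFound                                                                -> 404
//	ErrorCodeConflict, ErrorCodeSemverPatchOnly                                      -> 409
//	ErrorCodeServiceUnavailable, errorCodeDockerUnavailable                          -> 503
//	everything else (incl. ErrorCodeImagePullFailed, ErrorCodeContainerStartFailed,
//	ErrorCodeInternal and unknown codes)                                             -> 500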
|
||||
|
||||
// decodeStrictJSON decodes one request body into target with strict
|
||||
// JSON semantics: unknown fields are rejected and trailing content is
|
||||
// rejected. Mirrors the helper used by lobby's internal HTTP layer.
|
||||
func decodeStrictJSON(body io.Reader, target any) error {
|
||||
decoder := json.NewDecoder(body)
|
||||
decoder.DisallowUnknownFields()
|
||||
if err := decoder.Decode(target); err != nil {
|
||||
return err
|
||||
}
|
||||
if decoder.More() {
|
||||
return errors.New("unexpected trailing content after JSON body")
|
||||
}
|
||||
return nil
|
||||
}
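
// Behaviour in the two strict cases, as exercised by the handler tests in this
// package (error wording for the unknown-field case comes from encoding/json):
//
//	var in struct {
//		ImageRef string `json:"image_ref"`
//	}
//	_ = decodeStrictJSON(strings.NewReader(`{"image_ref":"x","extra":"y"}`), &in) // error: unknown field "extra"
//	_ = decodeStrictJSON(strings.NewReader(`{"image_ref":"x"} {}`), &in)          // error: unexpected trailing content after JSON body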
|
||||
|
||||
// extractGameID pulls the {game_id} path variable from request. An empty
|
||||
// or whitespace-only value writes a `400 invalid_request` and returns
|
||||
// ok=false so callers can short-circuit.
|
||||
func extractGameID(writer http.ResponseWriter, request *http.Request) (string, bool) {
|
||||
raw := request.PathValue(gameIDPathParam)
|
||||
if strings.TrimSpace(raw) == "" {
|
||||
writeError(writer, http.StatusBadRequest,
|
||||
startruntime.ErrorCodeInvalidRequest,
|
||||
"game id is required",
|
||||
)
|
||||
return "", false
|
||||
}
|
||||
return raw, true
|
||||
}
|
||||
|
||||
// resolveOpSource maps the X-Galaxy-Caller header to an
|
||||
// `operation.OpSource`. Missing or unknown values default to
|
||||
// `OpSourceAdminRest`, matching the contract documented in
|
||||
// `rtmanager/api/internal-openapi.yaml`.
|
||||
func resolveOpSource(request *http.Request) operation.OpSource {
|
||||
switch strings.ToLower(strings.TrimSpace(request.Header.Get(callerHeader))) {
|
||||
case "gm":
|
||||
return operation.OpSourceGMRest
|
||||
default:
|
||||
return operation.OpSourceAdminRest
|
||||
}
|
||||
}
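
// Illustrative mappings (mirrored by the handler tests):
//
//	resolveOpSource on "gm" or "GM"         -> operation.OpSourceGMRest
//	resolveOpSource on "admin", "" or other -> operation.OpSourceAdminRest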
|
||||
|
||||
// requestSourceRef returns an opaque per-request reference recorded in
|
||||
// `operation_log.source_ref`. v1 reads the `X-Request-ID` header when
|
||||
// present so callers may correlate REST requests with audit rows; the
|
||||
// listener does not currently install a request-id middleware so the
|
||||
// header path is the only source.
|
||||
func requestSourceRef(request *http.Request) string {
|
||||
if v := strings.TrimSpace(request.Header.Get("X-Request-ID")); v != "" {
|
||||
return v
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// loggerFor returns a logger annotated with the operation tag. Each
|
||||
// handler scopes its logs by op so operators filtering on
|
||||
// `op=internal_rest.start` see exactly the lifecycle they care about.
|
||||
func loggerFor(parent *slog.Logger, op string) *slog.Logger {
|
||||
if parent == nil {
|
||||
parent = slog.Default()
|
||||
}
|
||||
return parent.With("component", "internal_http.handlers", "op", op)
|
||||
}
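
// With slog's text handler a line emitted through such a logger comes out
// roughly as follows (exact ordering and timestamp format depend on the
// handler configuration; values are illustrative):
//
//	level=ERROR msg="list runtime records" component=internal_http.handlers op=internal_rest.list err="..."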
|
||||
@@ -0,0 +1,197 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// fixedClock is the wall-clock used to build canonical sample records
|
||||
// across the handler tests. UTC Sunday 1pm 2026-04-26 is far enough in
|
||||
// the future to be obvious in test output.
|
||||
var fixedClock = time.Date(2026, 4, 26, 13, 0, 0, 0, time.UTC)
|
||||
|
||||
// sampleRunningRecord returns a canonical running record used by every
|
||||
// happy-path test in this package.
|
||||
func sampleRunningRecord(t *testing.T) runtime.RuntimeRecord {
|
||||
t.Helper()
|
||||
started := fixedClock
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-test",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "container-test",
|
||||
CurrentImageRef: "galaxy/game:v1.2.3",
|
||||
EngineEndpoint: "http://galaxy-game-game-test:8080",
|
||||
StatePath: "/var/lib/galaxy/game-test",
|
||||
DockerNetwork: "galaxy-engine",
|
||||
StartedAt: &started,
|
||||
LastOpAt: fixedClock,
|
||||
CreatedAt: fixedClock,
|
||||
}
|
||||
}
|
||||
|
||||
// sampleStoppedRecord returns a canonical stopped record useful for
|
||||
// cleanup-handler and list-handler tests.
|
||||
func sampleStoppedRecord(t *testing.T) runtime.RuntimeRecord {
|
||||
t.Helper()
|
||||
started := fixedClock
|
||||
stopped := fixedClock.Add(time.Minute)
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-stopped",
|
||||
Status: runtime.StatusStopped,
|
||||
CurrentContainerID: "container-stopped",
|
||||
CurrentImageRef: "galaxy/game:v1.2.3",
|
||||
EngineEndpoint: "http://galaxy-game-game-stopped:8080",
|
||||
StatePath: "/var/lib/galaxy/game-stopped",
|
||||
DockerNetwork: "galaxy-engine",
|
||||
StartedAt: &started,
|
||||
StoppedAt: &stopped,
|
||||
LastOpAt: stopped,
|
||||
CreatedAt: fixedClock,
|
||||
}
|
||||
}
|
||||
|
||||
// drive routes one request through a full mux configured by Register.
|
||||
// It returns the captured ResponseRecorder so tests can assert on
|
||||
// status, headers, and body.
|
||||
func drive(t *testing.T, deps Dependencies, method, path string, headers http.Header, body io.Reader) *httptest.ResponseRecorder {
|
||||
t.Helper()
|
||||
|
||||
mux := http.NewServeMux()
|
||||
Register(mux, deps)
|
||||
|
||||
request := httptest.NewRequest(method, path, body)
|
||||
for key, values := range headers {
|
||||
for _, value := range values {
|
||||
request.Header.Add(key, value)
|
||||
}
|
||||
}
|
||||
|
||||
recorder := httptest.NewRecorder()
|
||||
mux.ServeHTTP(recorder, request)
|
||||
return recorder
|
||||
}
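
// Typical call shape, lifted from the mutating-handler tests in this package:
//
//	rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
//		"/api/v1/internal/runtimes/game-test/start",
//		jsonHeaders(),
//		strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
//	)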
|
||||
|
||||
// decodeRecordResponse asserts that the response carried a 200 with
|
||||
// the canonical content type and decodes the record body.
|
||||
func decodeRecordResponse(t *testing.T, rec *httptest.ResponseRecorder) runtimeRecordResponse {
|
||||
t.Helper()
|
||||
require.Equalf(t, http.StatusOK, rec.Code, "expected 200, got body: %s", rec.Body.String())
|
||||
require.Equal(t, JSONContentType, rec.Header().Get("Content-Type"))
|
||||
|
||||
var resp runtimeRecordResponse
|
||||
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
|
||||
return resp
|
||||
}
|
||||
|
||||
// decodeErrorBody asserts the canonical error envelope and decodes it.
|
||||
func decodeErrorBody(t *testing.T, rec *httptest.ResponseRecorder, wantStatus int) errorBody {
|
||||
t.Helper()
|
||||
require.Equalf(t, wantStatus, rec.Code, "expected %d, got body: %s", wantStatus, rec.Body.String())
|
||||
require.Equal(t, JSONContentType, rec.Header().Get("Content-Type"))
|
||||
|
||||
var resp errorResponse
|
||||
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
|
||||
return resp.Error
|
||||
}
|
||||
|
||||
// fakeRuntimeRecords is an in-memory ports.RuntimeRecordStore used by
|
||||
// list / get tests. It is intentionally minimal — services use their
|
||||
// own fakes in `internal/service/<op>/service_test.go` and do not
|
||||
// share this helper.
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
listErr error
|
||||
getErr error
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) put(record runtime.RuntimeRecord) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.stored[record.GameID] = record
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
|
||||
return errors.New("not used in handler tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
|
||||
return errors.New("not used in handler tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in handler tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.listErr != nil {
|
||||
return nil, s.listErr
|
||||
}
|
||||
if len(s.stored) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
records := make([]runtime.RuntimeRecord, 0, len(s.stored))
|
||||
for _, record := range s.stored {
|
||||
records = append(records, record)
|
||||
}
|
||||
return records, nil
|
||||
}
|
||||
|
||||
// jsonHeaders returns the default headers used by tests that send a
|
||||
// JSON body.
|
||||
func jsonHeaders() http.Header {
|
||||
h := http.Header{}
|
||||
h.Set("Content-Type", "application/json")
|
||||
return h
|
||||
}
|
||||
|
||||
// withCaller adds the X-Galaxy-Caller header to h and returns h. The
|
||||
// helper exists to keep test cases readable when the header is the
|
||||
// only difference between two table rows.
|
||||
func withCaller(h http.Header, value string) http.Header {
|
||||
if h == nil {
|
||||
h = http.Header{}
|
||||
}
|
||||
h.Set(callerHeader, value)
|
||||
return h
|
||||
}
|
||||
|
||||
// strReader builds an io.Reader from raw JSON.
|
||||
func strReader(raw string) io.Reader {
|
||||
return strings.NewReader(raw)
|
||||
}
|
||||
|
||||
// Compile-time assertion that the in-memory fake satisfies the port.
|
||||
var _ ports.RuntimeRecordStore = (*fakeRuntimeRecords)(nil)
|
||||
@@ -0,0 +1,55 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
)
|
||||
|
||||
// newGetHandler returns the handler for
|
||||
// `GET /api/v1/internal/runtimes/{game_id}`. The handler reads
|
||||
// directly from the runtime record store and translates
|
||||
// `runtime.ErrNotFound` to `404 not_found`. Like list, it does not
|
||||
// run through the service layer and does not produce an operation_log
|
||||
// row.
|
||||
func newGetHandler(deps Dependencies) http.HandlerFunc {
|
||||
logger := loggerFor(deps.Logger, "internal_rest.get")
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if deps.RuntimeRecords == nil {
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"runtime records store is not wired",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
gameID, ok := extractGameID(writer, request)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
record, err := deps.RuntimeRecords.Get(request.Context(), gameID)
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
writeError(writer, http.StatusNotFound,
|
||||
startruntime.ErrorCodeNotFound,
|
||||
"runtime record not found",
|
||||
)
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
logger.ErrorContext(request.Context(), "get runtime record",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"failed to read runtime record",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(record))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
)
|
||||
|
||||
// Route paths registered by Register. The values match the paths frozen by
// `rtmanager/api/internal-openapi.yaml` and
|
||||
// `rtmanager/contract_openapi_test.go`.
|
||||
const (
|
||||
listRuntimesPath = "/api/v1/internal/runtimes"
|
||||
getRuntimePath = "/api/v1/internal/runtimes/{game_id}"
|
||||
startRuntimePath = "/api/v1/internal/runtimes/{game_id}/start"
|
||||
stopRuntimePath = "/api/v1/internal/runtimes/{game_id}/stop"
|
||||
restartRuntimePath = "/api/v1/internal/runtimes/{game_id}/restart"
|
||||
patchRuntimePath = "/api/v1/internal/runtimes/{game_id}/patch"
|
||||
cleanupRuntimePath = "/api/v1/internal/runtimes/{game_id}/container"
|
||||
)
|
||||
|
||||
// Dependencies bundles the collaborators required to serve the GM/Admin
|
||||
// REST surface. Any service may be nil for tests that exercise a
|
||||
// subset of the surface; in that case the unwired routes return
|
||||
// `500 internal_error` (mirrors lobby's "service is not wired"
|
||||
// pattern).
|
||||
type Dependencies struct {
|
||||
// Logger receives structured logs scoped per handler. nil falls back
|
||||
// to slog.Default.
|
||||
Logger *slog.Logger
|
||||
|
||||
// RuntimeRecords backs the read-only list and get handlers. They do
|
||||
// not produce operation_log rows because they do not mutate state.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// StartRuntime executes the start lifecycle operation. Production
|
||||
// wiring passes `*startruntime.Service` (the concrete service
|
||||
// satisfies StartService).
|
||||
StartRuntime StartService
|
||||
|
||||
// StopRuntime executes the stop lifecycle operation.
|
||||
StopRuntime StopService
|
||||
|
||||
// RestartRuntime executes the restart lifecycle operation.
|
||||
RestartRuntime RestartService
|
||||
|
||||
// PatchRuntime executes the patch lifecycle operation.
|
||||
PatchRuntime PatchService
|
||||
|
||||
// CleanupContainer executes the cleanup_container lifecycle
|
||||
// operation.
|
||||
CleanupContainer CleanupService
|
||||
}
|
||||
|
||||
// Register attaches every internal REST route to mux using deps. Each
|
||||
// route reads its dependency lazily so a partially-wired Dependencies
|
||||
// (e.g., a probe-only listener test) does not crash; missing
|
||||
// dependencies surface as `500 internal_error`. Routes use Go 1.22
|
||||
// method-aware mux patterns.
|
||||
func Register(mux *http.ServeMux, deps Dependencies) {
|
||||
mux.HandleFunc("GET "+listRuntimesPath, newListHandler(deps))
|
||||
mux.HandleFunc("GET "+getRuntimePath, newGetHandler(deps))
|
||||
mux.HandleFunc("POST "+startRuntimePath, newStartHandler(deps))
|
||||
mux.HandleFunc("POST "+stopRuntimePath, newStopHandler(deps))
|
||||
mux.HandleFunc("POST "+restartRuntimePath, newRestartHandler(deps))
|
||||
mux.HandleFunc("POST "+patchRuntimePath, newPatchHandler(deps))
|
||||
mux.HandleFunc("DELETE "+cleanupRuntimePath, newCleanupHandler(deps))
|
||||
}
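
// A minimal wiring sketch from the listener's side (the concrete service and
// store constructors are assumed to exist elsewhere in the Runtime Manager;
// only the shape of the call matters here):
//
//	mux := http.NewServeMux()
//	handlers.Register(mux, handlers.Dependencies{
//		Logger:           logger,
//		RuntimeRecords:   recordStore,
//		StartRuntime:     startService,
//		StopRuntime:      stopService,
//		RestartRuntime:   restartService,
//		PatchRuntime:     patchService,
//		CleanupContainer: cleanupService,
//	})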
|
||||
@@ -0,0 +1,610 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"galaxy/rtmanager/internal/api/internalhttp/handlers/mocks"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/service/cleanupcontainer"
|
||||
"galaxy/rtmanager/internal/service/patchruntime"
|
||||
"galaxy/rtmanager/internal/service/restartruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// Tests for the mutating handlers (start, stop, restart, patch,
|
||||
// cleanup). Each handler delegates to one lifecycle service through a
|
||||
// narrow `mockgen`-backed interface; the handler layer is responsible
|
||||
// for input parsing, the `X-Galaxy-Caller` → `op_source` mapping, and
|
||||
// the canonical `ErrorCode` → HTTP status table documented in
|
||||
// `rtmanager/docs/services.md` §18.
|
||||
|
||||
// --- start ---
|
||||
|
||||
func TestStartHandlerReturnsRecordOnSuccess(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
record := sampleRunningRecord(t)
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) {
|
||||
assert.Equal(t, "game-test", in.GameID)
|
||||
assert.Equal(t, "galaxy/game:v1.2.3", in.ImageRef)
|
||||
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
|
||||
return startruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
deps := Dependencies{StartRuntime: mock}
|
||||
rec := drive(t, deps, http.MethodPost, "/api/v1/internal/runtimes/game-test/start",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
|
||||
)
|
||||
|
||||
resp := decodeRecordResponse(t, rec)
|
||||
assert.Equal(t, "game-test", resp.GameID)
|
||||
assert.Equal(t, "running", resp.Status)
|
||||
}
|
||||
|
||||
func TestStartHandlerReturnsRecordOnReplayNoOp(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
record := sampleRunningRecord(t)
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.Any()).
|
||||
Return(startruntime.Result{
|
||||
Record: record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: startruntime.ErrorCodeReplayNoOp,
|
||||
}, nil)
|
||||
|
||||
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
|
||||
)
|
||||
|
||||
resp := decodeRecordResponse(t, rec)
|
||||
assert.Equal(t, "game-test", resp.GameID)
|
||||
}
|
||||
|
||||
func TestStartHandlerMapsServiceFailures(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
errorCode string
|
||||
wantStatus int
|
||||
}{
|
||||
{"start_config_invalid", startruntime.ErrorCodeStartConfigInvalid, http.StatusBadRequest},
|
||||
{"image_pull_failed", startruntime.ErrorCodeImagePullFailed, http.StatusInternalServerError},
|
||||
{"container_start_failed", startruntime.ErrorCodeContainerStartFailed, http.StatusInternalServerError},
|
||||
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
|
||||
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
|
||||
{"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.Any()).
|
||||
Return(startruntime.Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: tc.errorCode,
|
||||
ErrorMessage: "synthetic " + tc.name,
|
||||
}, nil)
|
||||
|
||||
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
|
||||
)
|
||||
|
||||
body := decodeErrorBody(t, rec, tc.wantStatus)
|
||||
assert.Equal(t, tc.errorCode, body.Code)
|
||||
assert.Equal(t, "synthetic "+tc.name, body.Message)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStartHandlerRejectsUnknownJSONFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"x","extra":"y"}`),
|
||||
)
|
||||
|
||||
body := decodeErrorBody(t, rec, http.StatusBadRequest)
|
||||
assert.Equal(t, "invalid_request", body.Code)
|
||||
}
|
||||
|
||||
func TestStartHandlerRejectsMalformedJSON(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":`),
|
||||
)
|
||||
|
||||
body := decodeErrorBody(t, rec, http.StatusBadRequest)
|
||||
assert.Equal(t, "invalid_request", body.Code)
|
||||
}
|
||||
|
||||
func TestStartHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
header string
|
||||
want operation.OpSource
|
||||
hdrLabel string
|
||||
}{
|
||||
{"gm", operation.OpSourceGMRest, "gm"},
|
||||
{"GM", operation.OpSourceGMRest, "uppercase gm"},
|
||||
{"admin", operation.OpSourceAdminRest, "admin"},
|
||||
{"unknown", operation.OpSourceAdminRest, "unknown value"},
|
||||
{"", operation.OpSourceAdminRest, "missing header"},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.hdrLabel, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
record := sampleRunningRecord(t)
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) {
|
||||
assert.Equal(t, tc.want, in.OpSource)
|
||||
return startruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
headers := jsonHeaders()
|
||||
if tc.header != "" {
|
||||
headers = withCaller(headers, tc.header)
|
||||
}
|
||||
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
headers,
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
|
||||
)
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStartHandlerForwardsXRequestIDAsSourceRef(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(startruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in startruntime.Input) (startruntime.Result, error) {
|
||||
assert.Equal(t, "req-42", in.SourceRef)
|
||||
return startruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
headers := jsonHeaders()
|
||||
headers.Set("X-Request-ID", "req-42")
|
||||
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
headers,
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
|
||||
)
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
}
|
||||
|
||||
func TestStartHandlerReturnsInternalErrorWhenServiceErrors(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStartService(ctrl)
|
||||
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.Any()).
|
||||
Return(startruntime.Result{}, assert.AnError)
|
||||
|
||||
rec := drive(t, Dependencies{StartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
|
||||
)
|
||||
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
func TestStartHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/start",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.3"}`),
|
||||
)
|
||||
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
// --- stop ---
|
||||
|
||||
func TestStopHandlerReturnsRecordOnSuccess(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStopService(ctrl)
|
||||
|
||||
record := sampleStoppedRecord(t)
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(stopruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in stopruntime.Input) (stopruntime.Result, error) {
|
||||
assert.Equal(t, "game-test", in.GameID)
|
||||
assert.Equal(t, stopruntime.StopReasonAdminRequest, in.Reason)
|
||||
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
|
||||
return stopruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/stop",
|
||||
jsonHeaders(),
|
||||
strReader(`{"reason":"admin_request"}`),
|
||||
)
|
||||
|
||||
resp := decodeRecordResponse(t, rec)
|
||||
assert.Equal(t, "stopped", resp.Status)
|
||||
}
|
||||
|
||||
func TestStopHandlerMapsServiceFailures(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
name string
|
||||
errorCode string
|
||||
wantStatus int
|
||||
}{
|
||||
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
|
||||
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
|
||||
{"invalid_request", startruntime.ErrorCodeInvalidRequest, http.StatusBadRequest},
|
||||
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
|
||||
{"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStopService(ctrl)
|
||||
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(stopruntime.Result{
|
||||
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
|
||||
}, nil)
|
||||
|
||||
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/stop",
|
||||
jsonHeaders(),
|
||||
strReader(`{"reason":"admin_request"}`),
|
||||
)
|
||||
body := decodeErrorBody(t, rec, tc.wantStatus)
|
||||
assert.Equal(t, tc.errorCode, body.Code)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStopHandlerRejectsUnknownJSONFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStopService(ctrl)
|
||||
|
||||
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/stop",
|
||||
jsonHeaders(),
|
||||
strReader(`{"reason":"admin_request","extra":1}`),
|
||||
)
|
||||
body := decodeErrorBody(t, rec, http.StatusBadRequest)
|
||||
assert.Equal(t, "invalid_request", body.Code)
|
||||
}
|
||||
|
||||
func TestStopHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockStopService(ctrl)
|
||||
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(stopruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in stopruntime.Input) (stopruntime.Result, error) {
|
||||
assert.Equal(t, operation.OpSourceGMRest, in.OpSource)
|
||||
return stopruntime.Result{Record: sampleStoppedRecord(t), Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
rec := drive(t, Dependencies{StopRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/stop",
|
||||
withCaller(jsonHeaders(), "gm"),
|
||||
strReader(`{"reason":"cancelled"}`),
|
||||
)
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
}
|
||||
|
||||
func TestStopHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/stop",
|
||||
jsonHeaders(),
|
||||
strReader(`{"reason":"admin_request"}`),
|
||||
)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
// --- restart ---
|
||||
|
||||
func TestRestartHandlerReturnsRecordOnSuccess(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockRestartService(ctrl)
|
||||
|
||||
record := sampleRunningRecord(t)
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(restartruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in restartruntime.Input) (restartruntime.Result, error) {
|
||||
assert.Equal(t, "game-test", in.GameID)
|
||||
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
|
||||
return restartruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/restart", nil, nil,
|
||||
)
|
||||
resp := decodeRecordResponse(t, rec)
|
||||
assert.Equal(t, "running", resp.Status)
|
||||
}
|
||||
|
||||
func TestRestartHandlerMapsServiceFailures(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
name string
|
||||
errorCode string
|
||||
wantStatus int
|
||||
}{
|
||||
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
|
||||
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
|
||||
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
|
||||
{"internal_error", startruntime.ErrorCodeInternal, http.StatusInternalServerError},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockRestartService(ctrl)
|
||||
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(restartruntime.Result{
|
||||
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
|
||||
}, nil)
|
||||
|
||||
rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/restart", nil, nil,
|
||||
)
|
||||
body := decodeErrorBody(t, rec, tc.wantStatus)
|
||||
assert.Equal(t, tc.errorCode, body.Code)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRestartHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockRestartService(ctrl)
|
||||
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(restartruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in restartruntime.Input) (restartruntime.Result, error) {
|
||||
assert.Equal(t, operation.OpSourceGMRest, in.OpSource)
|
||||
return restartruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
rec := drive(t, Dependencies{RestartRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/restart",
|
||||
withCaller(http.Header{}, "gm"), nil,
|
||||
)
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
}
|
||||
|
||||
func TestRestartHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/restart", nil, nil,
|
||||
)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
// --- patch ---
|
||||
|
||||
func TestPatchHandlerReturnsRecordOnSuccess(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockPatchService(ctrl)
|
||||
|
||||
record := sampleRunningRecord(t)
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(patchruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) {
|
||||
assert.Equal(t, "game-test", in.GameID)
|
||||
assert.Equal(t, "galaxy/game:v1.2.4", in.NewImageRef)
|
||||
return patchruntime.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/patch",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
|
||||
)
|
||||
resp := decodeRecordResponse(t, rec)
|
||||
assert.Equal(t, "running", resp.Status)
|
||||
}
|
||||
|
||||
func TestPatchHandlerMapsServiceFailures(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
name string
|
||||
errorCode string
|
||||
wantStatus int
|
||||
}{
|
||||
{"image_ref_not_semver", startruntime.ErrorCodeImageRefNotSemver, http.StatusBadRequest},
|
||||
{"semver_patch_only", startruntime.ErrorCodeSemverPatchOnly, http.StatusConflict},
|
||||
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
|
||||
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
|
||||
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockPatchService(ctrl)
|
||||
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(patchruntime.Result{
|
||||
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
|
||||
}, nil)
|
||||
|
||||
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/patch",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
|
||||
)
|
||||
body := decodeErrorBody(t, rec, tc.wantStatus)
|
||||
assert.Equal(t, tc.errorCode, body.Code)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchHandlerRejectsUnknownJSONFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockPatchService(ctrl)
|
||||
|
||||
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/patch",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"x","unexpected":true}`),
|
||||
)
|
||||
body := decodeErrorBody(t, rec, http.StatusBadRequest)
|
||||
assert.Equal(t, "invalid_request", body.Code)
|
||||
}
|
||||
|
||||
func TestPatchHandlerHonoursXGalaxyCallerHeader(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockPatchService(ctrl)
|
||||
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(patchruntime.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in patchruntime.Input) (patchruntime.Result, error) {
|
||||
assert.Equal(t, operation.OpSourceGMRest, in.OpSource)
|
||||
return patchruntime.Result{Record: sampleRunningRecord(t), Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
rec := drive(t, Dependencies{PatchRuntime: mock}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/patch",
|
||||
withCaller(jsonHeaders(), "gm"),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
|
||||
)
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
}
|
||||
|
||||
func TestPatchHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{}, http.MethodPost,
|
||||
"/api/v1/internal/runtimes/game-test/patch",
|
||||
jsonHeaders(),
|
||||
strReader(`{"image_ref":"galaxy/game:v1.2.4"}`),
|
||||
)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
// --- cleanup ---
|
||||
|
||||
func TestCleanupHandlerReturnsRecordOnSuccess(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockCleanupService(ctrl)
|
||||
|
||||
record := sampleStoppedRecord(t)
|
||||
record.Status = runtime.StatusRemoved
|
||||
record.CurrentContainerID = ""
|
||||
removed := record.LastOpAt
|
||||
record.RemovedAt = &removed
|
||||
|
||||
mock.EXPECT().
|
||||
Handle(gomock.Any(), gomock.AssignableToTypeOf(cleanupcontainer.Input{})).
|
||||
DoAndReturn(func(_ context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) {
|
||||
assert.Equal(t, "game-stopped", in.GameID)
|
||||
assert.Equal(t, operation.OpSourceAdminRest, in.OpSource)
|
||||
return cleanupcontainer.Result{Record: record, Outcome: operation.OutcomeSuccess}, nil
|
||||
})
|
||||
|
||||
rec := drive(t, Dependencies{CleanupContainer: mock}, http.MethodDelete,
|
||||
"/api/v1/internal/runtimes/game-stopped/container", nil, nil,
|
||||
)
|
||||
resp := decodeRecordResponse(t, rec)
|
||||
assert.Equal(t, "removed", resp.Status)
|
||||
assert.Nil(t, resp.CurrentContainerID, "container id must be null after cleanup")
|
||||
}
|
||||
|
||||
func TestCleanupHandlerMapsServiceFailures(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
name string
|
||||
errorCode string
|
||||
wantStatus int
|
||||
}{
|
||||
{"not_found", startruntime.ErrorCodeNotFound, http.StatusNotFound},
|
||||
{"conflict", startruntime.ErrorCodeConflict, http.StatusConflict},
|
||||
{"service_unavailable", startruntime.ErrorCodeServiceUnavailable, http.StatusServiceUnavailable},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
ctrl := gomock.NewController(t)
|
||||
mock := mocks.NewMockCleanupService(ctrl)
|
||||
mock.EXPECT().Handle(gomock.Any(), gomock.Any()).Return(cleanupcontainer.Result{
|
||||
Outcome: operation.OutcomeFailure, ErrorCode: tc.errorCode, ErrorMessage: tc.name,
|
||||
}, nil)
|
||||
|
||||
rec := drive(t, Dependencies{CleanupContainer: mock}, http.MethodDelete,
|
||||
"/api/v1/internal/runtimes/game-test/container", nil, nil,
|
||||
)
|
||||
body := decodeErrorBody(t, rec, tc.wantStatus)
|
||||
assert.Equal(t, tc.errorCode, body.Code)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCleanupHandlerReturnsInternalErrorWhenServiceNotWired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{}, http.MethodDelete,
|
||||
"/api/v1/internal/runtimes/game-test/container", nil, nil,
|
||||
)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// Tests for the read-only handlers (`internalListRuntimes`,
|
||||
// `internalGetRuntime`). These bypass the service layer and read
|
||||
// directly from `ports.RuntimeRecordStore` — see
|
||||
// `rtmanager/docs/services.md` §18.
|
||||
|
||||
func TestListHandlerReturnsEmptyItemsForEmptyStore(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
deps := Dependencies{RuntimeRecords: newFakeRuntimeRecords()}
|
||||
rec := drive(t, deps, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
|
||||
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
require.Equal(t, JSONContentType, rec.Header().Get("Content-Type"))
|
||||
|
||||
var resp runtimesListResponse
|
||||
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
|
||||
require.NotNil(t, resp.Items, "items must never be nil")
|
||||
assert.Empty(t, resp.Items)
|
||||
}
|
||||
|
||||
func TestListHandlerReturnsEveryStoredRecord(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
store := newFakeRuntimeRecords()
|
||||
store.put(sampleRunningRecord(t))
|
||||
store.put(sampleStoppedRecord(t))
|
||||
|
||||
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
|
||||
var resp runtimesListResponse
|
||||
require.NoError(t, json.NewDecoder(rec.Body).Decode(&resp))
|
||||
require.Len(t, resp.Items, 2)
|
||||
|
||||
gotIDs := map[string]string{}
|
||||
for _, item := range resp.Items {
|
||||
gotIDs[item.GameID] = item.Status
|
||||
}
|
||||
assert.Equal(t, "running", gotIDs["game-test"])
|
||||
assert.Equal(t, "stopped", gotIDs["game-stopped"])
|
||||
}
|
||||
|
||||
func TestListHandlerReturnsInternalErrorWhenStoreFails(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
store := newFakeRuntimeRecords()
|
||||
store.listErr = errors.New("postgres exploded")
|
||||
|
||||
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
func TestListHandlerReturnsInternalErrorWhenStoreNotWired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{}, http.MethodGet, "/api/v1/internal/runtimes", nil, nil)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
func TestGetHandlerReturnsTheRecord(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
store := newFakeRuntimeRecords()
|
||||
record := sampleRunningRecord(t)
|
||||
store.put(record)
|
||||
|
||||
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil)
|
||||
resp := decodeRecordResponse(t, rec)
|
||||
assert.Equal(t, "game-test", resp.GameID)
|
||||
assert.Equal(t, "running", resp.Status)
|
||||
if assert.NotNil(t, resp.CurrentImageRef) {
|
||||
assert.Equal(t, "galaxy/game:v1.2.3", *resp.CurrentImageRef)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetHandlerReturnsNotFoundForMissingRecord(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{RuntimeRecords: newFakeRuntimeRecords()}, http.MethodGet, "/api/v1/internal/runtimes/game-missing", nil, nil)
|
||||
body := decodeErrorBody(t, rec, http.StatusNotFound)
|
||||
assert.Equal(t, "not_found", body.Code)
|
||||
}
|
||||
|
||||
func TestGetHandlerReturnsInternalErrorWhenStoreFails(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
store := newFakeRuntimeRecords()
|
||||
store.getErr = errors.New("transport blew up")
|
||||
|
||||
rec := drive(t, Dependencies{RuntimeRecords: store}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
|
||||
func TestGetHandlerReturnsInternalErrorWhenStoreNotWired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
rec := drive(t, Dependencies{}, http.MethodGet, "/api/v1/internal/runtimes/game-test", nil, nil)
|
||||
body := decodeErrorBody(t, rec, http.StatusInternalServerError)
|
||||
assert.Equal(t, "internal_error", body.Code)
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
)
|
||||
|
||||
// newListHandler returns the handler for `GET /api/v1/internal/runtimes`.
|
||||
// The handler reads directly from `ports.RuntimeRecordStore.List` —
|
||||
// this surface is read-only and does not produce operation_log rows
|
||||
// (rationale: see `rtmanager/docs/services.md` §18).
|
||||
func newListHandler(deps Dependencies) http.HandlerFunc {
|
||||
logger := loggerFor(deps.Logger, "internal_rest.list")
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if deps.RuntimeRecords == nil {
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"runtime records store is not wired",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
records, err := deps.RuntimeRecords.List(request.Context())
|
||||
if err != nil {
|
||||
logger.ErrorContext(request.Context(), "list runtime records",
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"failed to list runtime records",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(writer, http.StatusOK, encodeRuntimesList(records))
|
||||
}
|
||||
}
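// For reference, a successful list response produced by encodeRuntimesList is
// expected to look roughly like the sketch below. The JSON field names are an
// assumption based on the handler tests; the authoritative shape lives in the
// OpenAPI spec.
//
//	{
//	  "items": [
//	    {"game_id": "game-test", "status": "running", "current_image_ref": "galaxy/game:v1.2.3"}
//	  ]
//	}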
@@ -0,0 +1,217 @@
|
||||
// Code generated by MockGen. DO NOT EDIT.
|
||||
// Source: galaxy/rtmanager/internal/api/internalhttp/handlers (interfaces: StartService,StopService,RestartService,PatchService,CleanupService)
|
||||
//
|
||||
// Generated by this command:
|
||||
//
|
||||
// mockgen -destination=mocks/mock_services.go -package=mocks galaxy/rtmanager/internal/api/internalhttp/handlers StartService,StopService,RestartService,PatchService,CleanupService
|
||||
//
|
||||
|
||||
// Package mocks is a generated GoMock package.
|
||||
package mocks
|
||||
|
||||
import (
|
||||
context "context"
|
||||
cleanupcontainer "galaxy/rtmanager/internal/service/cleanupcontainer"
|
||||
patchruntime "galaxy/rtmanager/internal/service/patchruntime"
|
||||
restartruntime "galaxy/rtmanager/internal/service/restartruntime"
|
||||
startruntime "galaxy/rtmanager/internal/service/startruntime"
|
||||
stopruntime "galaxy/rtmanager/internal/service/stopruntime"
|
||||
reflect "reflect"
|
||||
|
||||
gomock "go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// MockStartService is a mock of StartService interface.
|
||||
type MockStartService struct {
|
||||
ctrl *gomock.Controller
|
||||
recorder *MockStartServiceMockRecorder
|
||||
isgomock struct{}
|
||||
}
|
||||
|
||||
// MockStartServiceMockRecorder is the mock recorder for MockStartService.
|
||||
type MockStartServiceMockRecorder struct {
|
||||
mock *MockStartService
|
||||
}
|
||||
|
||||
// NewMockStartService creates a new mock instance.
|
||||
func NewMockStartService(ctrl *gomock.Controller) *MockStartService {
|
||||
mock := &MockStartService{ctrl: ctrl}
|
||||
mock.recorder = &MockStartServiceMockRecorder{mock}
|
||||
return mock
|
||||
}
|
||||
|
||||
// EXPECT returns an object that allows the caller to indicate expected use.
|
||||
func (m *MockStartService) EXPECT() *MockStartServiceMockRecorder {
|
||||
return m.recorder
|
||||
}
|
||||
|
||||
// Handle mocks base method.
|
||||
func (m *MockStartService) Handle(ctx context.Context, in startruntime.Input) (startruntime.Result, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "Handle", ctx, in)
|
||||
ret0, _ := ret[0].(startruntime.Result)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// Handle indicates an expected call of Handle.
|
||||
func (mr *MockStartServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockStartService)(nil).Handle), ctx, in)
|
||||
}
|
||||
|
||||
// MockStopService is a mock of StopService interface.
|
||||
type MockStopService struct {
|
||||
ctrl *gomock.Controller
|
||||
recorder *MockStopServiceMockRecorder
|
||||
isgomock struct{}
|
||||
}
|
||||
|
||||
// MockStopServiceMockRecorder is the mock recorder for MockStopService.
|
||||
type MockStopServiceMockRecorder struct {
|
||||
mock *MockStopService
|
||||
}
|
||||
|
||||
// NewMockStopService creates a new mock instance.
|
||||
func NewMockStopService(ctrl *gomock.Controller) *MockStopService {
|
||||
mock := &MockStopService{ctrl: ctrl}
|
||||
mock.recorder = &MockStopServiceMockRecorder{mock}
|
||||
return mock
|
||||
}
|
||||
|
||||
// EXPECT returns an object that allows the caller to indicate expected use.
|
||||
func (m *MockStopService) EXPECT() *MockStopServiceMockRecorder {
|
||||
return m.recorder
|
||||
}
|
||||
|
||||
// Handle mocks base method.
|
||||
func (m *MockStopService) Handle(ctx context.Context, in stopruntime.Input) (stopruntime.Result, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "Handle", ctx, in)
|
||||
ret0, _ := ret[0].(stopruntime.Result)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// Handle indicates an expected call of Handle.
|
||||
func (mr *MockStopServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockStopService)(nil).Handle), ctx, in)
|
||||
}
|
||||
|
||||
// MockRestartService is a mock of RestartService interface.
|
||||
type MockRestartService struct {
|
||||
ctrl *gomock.Controller
|
||||
recorder *MockRestartServiceMockRecorder
|
||||
isgomock struct{}
|
||||
}
|
||||
|
||||
// MockRestartServiceMockRecorder is the mock recorder for MockRestartService.
|
||||
type MockRestartServiceMockRecorder struct {
|
||||
mock *MockRestartService
|
||||
}
|
||||
|
||||
// NewMockRestartService creates a new mock instance.
|
||||
func NewMockRestartService(ctrl *gomock.Controller) *MockRestartService {
|
||||
mock := &MockRestartService{ctrl: ctrl}
|
||||
mock.recorder = &MockRestartServiceMockRecorder{mock}
|
||||
return mock
|
||||
}
|
||||
|
||||
// EXPECT returns an object that allows the caller to indicate expected use.
|
||||
func (m *MockRestartService) EXPECT() *MockRestartServiceMockRecorder {
|
||||
return m.recorder
|
||||
}
|
||||
|
||||
// Handle mocks base method.
|
||||
func (m *MockRestartService) Handle(ctx context.Context, in restartruntime.Input) (restartruntime.Result, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "Handle", ctx, in)
|
||||
ret0, _ := ret[0].(restartruntime.Result)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// Handle indicates an expected call of Handle.
|
||||
func (mr *MockRestartServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockRestartService)(nil).Handle), ctx, in)
|
||||
}
|
||||
|
||||
// MockPatchService is a mock of PatchService interface.
|
||||
type MockPatchService struct {
|
||||
ctrl *gomock.Controller
|
||||
recorder *MockPatchServiceMockRecorder
|
||||
isgomock struct{}
|
||||
}
|
||||
|
||||
// MockPatchServiceMockRecorder is the mock recorder for MockPatchService.
|
||||
type MockPatchServiceMockRecorder struct {
|
||||
mock *MockPatchService
|
||||
}
|
||||
|
||||
// NewMockPatchService creates a new mock instance.
|
||||
func NewMockPatchService(ctrl *gomock.Controller) *MockPatchService {
|
||||
mock := &MockPatchService{ctrl: ctrl}
|
||||
mock.recorder = &MockPatchServiceMockRecorder{mock}
|
||||
return mock
|
||||
}
|
||||
|
||||
// EXPECT returns an object that allows the caller to indicate expected use.
|
||||
func (m *MockPatchService) EXPECT() *MockPatchServiceMockRecorder {
|
||||
return m.recorder
|
||||
}
|
||||
|
||||
// Handle mocks base method.
|
||||
func (m *MockPatchService) Handle(ctx context.Context, in patchruntime.Input) (patchruntime.Result, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "Handle", ctx, in)
|
||||
ret0, _ := ret[0].(patchruntime.Result)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// Handle indicates an expected call of Handle.
|
||||
func (mr *MockPatchServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockPatchService)(nil).Handle), ctx, in)
|
||||
}
|
||||
|
||||
// MockCleanupService is a mock of CleanupService interface.
|
||||
type MockCleanupService struct {
|
||||
ctrl *gomock.Controller
|
||||
recorder *MockCleanupServiceMockRecorder
|
||||
isgomock struct{}
|
||||
}
|
||||
|
||||
// MockCleanupServiceMockRecorder is the mock recorder for MockCleanupService.
|
||||
type MockCleanupServiceMockRecorder struct {
|
||||
mock *MockCleanupService
|
||||
}
|
||||
|
||||
// NewMockCleanupService creates a new mock instance.
|
||||
func NewMockCleanupService(ctrl *gomock.Controller) *MockCleanupService {
|
||||
mock := &MockCleanupService{ctrl: ctrl}
|
||||
mock.recorder = &MockCleanupServiceMockRecorder{mock}
|
||||
return mock
|
||||
}
|
||||
|
||||
// EXPECT returns an object that allows the caller to indicate expected use.
|
||||
func (m *MockCleanupService) EXPECT() *MockCleanupServiceMockRecorder {
|
||||
return m.recorder
|
||||
}
|
||||
|
||||
// Handle mocks base method.
|
||||
func (m *MockCleanupService) Handle(ctx context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "Handle", ctx, in)
|
||||
ret0, _ := ret[0].(cleanupcontainer.Result)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// Handle indicates an expected call of Handle.
|
||||
func (mr *MockCleanupServiceMockRecorder) Handle(ctx, in any) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Handle", reflect.TypeOf((*MockCleanupService)(nil).Handle), ctx, in)
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/service/patchruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
)
|
||||
|
||||
// patchRequestBody mirrors the OpenAPI PatchRequest schema. The
|
||||
// service layer validates `image_ref` shape (semver, distribution
|
||||
// reference) and surfaces `image_ref_not_semver` /
|
||||
// `semver_patch_only` as needed.
|
||||
type patchRequestBody struct {
|
||||
ImageRef string `json:"image_ref"`
|
||||
}
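// A well-formed request body is therefore a single-field JSON object, for
// example (image reference value illustrative only):
//
//	{"image_ref": "galaxy/game:v1.2.4"}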
|
||||
// newPatchHandler returns the handler for
|
||||
// `POST /api/v1/internal/runtimes/{game_id}/patch`.
|
||||
func newPatchHandler(deps Dependencies) http.HandlerFunc {
|
||||
logger := loggerFor(deps.Logger, "internal_rest.patch")
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if deps.PatchRuntime == nil {
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"patch runtime service is not wired",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
gameID, ok := extractGameID(writer, request)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
var body patchRequestBody
|
||||
if err := decodeStrictJSON(request.Body, &body); err != nil {
|
||||
writeError(writer, http.StatusBadRequest,
|
||||
startruntime.ErrorCodeInvalidRequest,
|
||||
err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
result, err := deps.PatchRuntime.Handle(request.Context(), patchruntime.Input{
|
||||
GameID: gameID,
|
||||
NewImageRef: body.ImageRef,
|
||||
OpSource: resolveOpSource(request),
|
||||
SourceRef: requestSourceRef(request),
|
||||
})
|
||||
if err != nil {
|
||||
logger.ErrorContext(request.Context(), "patch runtime service errored",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"patch runtime service failed",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
if result.Outcome == operation.OutcomeFailure {
|
||||
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/service/restartruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
)
|
||||
|
||||
// newRestartHandler returns the handler for
|
||||
// `POST /api/v1/internal/runtimes/{game_id}/restart`. The OpenAPI spec
|
||||
// declares no request body for this operation; any client-provided
|
||||
// body is ignored.
|
||||
func newRestartHandler(deps Dependencies) http.HandlerFunc {
|
||||
logger := loggerFor(deps.Logger, "internal_rest.restart")
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if deps.RestartRuntime == nil {
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"restart runtime service is not wired",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
gameID, ok := extractGameID(writer, request)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
result, err := deps.RestartRuntime.Handle(request.Context(), restartruntime.Input{
|
||||
GameID: gameID,
|
||||
OpSource: resolveOpSource(request),
|
||||
SourceRef: requestSourceRef(request),
|
||||
})
|
||||
if err != nil {
|
||||
logger.ErrorContext(request.Context(), "restart runtime service errored",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"restart runtime service failed",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
if result.Outcome == operation.OutcomeFailure {
|
||||
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
// Package handlers ships the GM/Admin-facing internal REST surface of
|
||||
// Runtime Manager. The package is consumed by
|
||||
// `galaxy/rtmanager/internal/api/internalhttp`; each handler delegates
|
||||
// to one of the lifecycle services in `internal/service/`
|
||||
// (`startruntime`, `stopruntime`, `restartruntime`, `patchruntime`,
|
||||
// `cleanupcontainer`) or reads directly from `ports.RuntimeRecordStore`
|
||||
// (list / get).
|
||||
//
|
||||
// The interfaces declared in this file mirror the single `Handle`
|
||||
// method exposed by every concrete lifecycle service. Production wiring
|
||||
// passes the concrete service pointers; tests pass `mockgen`-generated
|
||||
// mocks. The narrow shape keeps the handler layer free of service
|
||||
// internals (lease tokens, telemetry, durable side effects) and matches
|
||||
// the repo-wide `mockgen` convention for wide / recorder ports.
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"galaxy/rtmanager/internal/service/cleanupcontainer"
|
||||
"galaxy/rtmanager/internal/service/patchruntime"
|
||||
"galaxy/rtmanager/internal/service/restartruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
)
|
||||
|
||||
//go:generate go run go.uber.org/mock/mockgen -destination=mocks/mock_services.go -package=mocks galaxy/rtmanager/internal/api/internalhttp/handlers StartService,StopService,RestartService,PatchService,CleanupService
|
||||
|
||||
// StartService is the narrow port the start handler depends on. It
|
||||
// matches the public Handle method of `startruntime.Service`; the
|
||||
// concrete service satisfies the interface implicitly.
|
||||
type StartService interface {
|
||||
Handle(ctx context.Context, in startruntime.Input) (startruntime.Result, error)
|
||||
}
|
||||
|
||||
// StopService is the narrow port the stop handler depends on.
|
||||
type StopService interface {
|
||||
Handle(ctx context.Context, in stopruntime.Input) (stopruntime.Result, error)
|
||||
}
|
||||
|
||||
// RestartService is the narrow port the restart handler depends on.
|
||||
type RestartService interface {
|
||||
Handle(ctx context.Context, in restartruntime.Input) (restartruntime.Result, error)
|
||||
}
|
||||
|
||||
// PatchService is the narrow port the patch handler depends on.
|
||||
type PatchService interface {
|
||||
Handle(ctx context.Context, in patchruntime.Input) (patchruntime.Result, error)
|
||||
}
|
||||
|
||||
// CleanupService is the narrow port the cleanup handler depends on.
|
||||
type CleanupService interface {
|
||||
Handle(ctx context.Context, in cleanupcontainer.Input) (cleanupcontainer.Result, error)
|
||||
}
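// If the implicit satisfaction should also be checked at compile time, wiring
// code can add assertions along the lines of the sketch below (assuming the
// concrete services are exposed as `*startruntime.Service` and friends):
//
//	var _ StartService = (*startruntime.Service)(nil)
//	var _ StopService = (*stopruntime.Service)(nil)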
@@ -0,0 +1,71 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
)
|
||||
|
||||
// startRequestBody mirrors the OpenAPI StartRequest schema. Only
|
||||
// `image_ref` is accepted; unknown fields are rejected by
|
||||
// decodeStrictJSON.
|
||||
type startRequestBody struct {
|
||||
ImageRef string `json:"image_ref"`
|
||||
}
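// A minimal valid request body is, for example (value illustrative only),
//
//	{"image_ref": "galaxy/game:v1.2.3"}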
|
||||
// newStartHandler returns the handler for
|
||||
// `POST /api/v1/internal/runtimes/{game_id}/start`. The handler
|
||||
// delegates the entire lifecycle to `startruntime.Service`; failure
|
||||
// codes are mapped to HTTP statuses via mapErrorCodeToStatus.
|
||||
func newStartHandler(deps Dependencies) http.HandlerFunc {
|
||||
logger := loggerFor(deps.Logger, "internal_rest.start")
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if deps.StartRuntime == nil {
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"start runtime service is not wired",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
gameID, ok := extractGameID(writer, request)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
var body startRequestBody
|
||||
if err := decodeStrictJSON(request.Body, &body); err != nil {
|
||||
writeError(writer, http.StatusBadRequest,
|
||||
startruntime.ErrorCodeInvalidRequest,
|
||||
err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
result, err := deps.StartRuntime.Handle(request.Context(), startruntime.Input{
|
||||
GameID: gameID,
|
||||
ImageRef: body.ImageRef,
|
||||
OpSource: resolveOpSource(request),
|
||||
SourceRef: requestSourceRef(request),
|
||||
})
|
||||
if err != nil {
|
||||
logger.ErrorContext(request.Context(), "start runtime service errored",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"start runtime service failed",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
if result.Outcome == operation.OutcomeFailure {
|
||||
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
)
|
||||
|
||||
// stopRequestBody mirrors the OpenAPI StopRequest schema. The reason
|
||||
// enum is validated at the service layer (`stopruntime.Input.Validate`);
|
||||
// unknown values surface as `invalid_request`.
|
||||
type stopRequestBody struct {
|
||||
Reason string `json:"reason"`
|
||||
}
|
||||
|
||||
// newStopHandler returns the handler for
|
||||
// `POST /api/v1/internal/runtimes/{game_id}/stop`.
|
||||
func newStopHandler(deps Dependencies) http.HandlerFunc {
|
||||
logger := loggerFor(deps.Logger, "internal_rest.stop")
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if deps.StopRuntime == nil {
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"stop runtime service is not wired",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
gameID, ok := extractGameID(writer, request)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
var body stopRequestBody
|
||||
if err := decodeStrictJSON(request.Body, &body); err != nil {
|
||||
writeError(writer, http.StatusBadRequest,
|
||||
startruntime.ErrorCodeInvalidRequest,
|
||||
err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
result, err := deps.StopRuntime.Handle(request.Context(), stopruntime.Input{
|
||||
GameID: gameID,
|
||||
Reason: stopruntime.StopReason(body.Reason),
|
||||
OpSource: resolveOpSource(request),
|
||||
SourceRef: requestSourceRef(request),
|
||||
})
|
||||
if err != nil {
|
||||
logger.ErrorContext(request.Context(), "stop runtime service errored",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeError(writer, http.StatusInternalServerError,
|
||||
startruntime.ErrorCodeInternal,
|
||||
"stop runtime service failed",
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
if result.Outcome == operation.OutcomeFailure {
|
||||
writeFailure(writer, result.ErrorCode, result.ErrorMessage)
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(writer, http.StatusOK, encodeRuntimeRecord(result.Record))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,363 @@
|
||||
// Package internalhttp provides the trusted internal HTTP listener used
|
||||
// by the runnable Runtime Manager process. It exposes `/healthz` and
|
||||
// `/readyz` plus the GM/Admin REST surface backed by the lifecycle
|
||||
// services in `internal/service/`.
|
||||
package internalhttp
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/api/internalhttp/handlers"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
const jsonContentType = "application/json; charset=utf-8"
|
||||
|
||||
// errorCodeServiceUnavailable mirrors the stable error code declared in
|
||||
// `rtmanager/api/internal-openapi.yaml §Error Model`.
|
||||
const errorCodeServiceUnavailable = "service_unavailable"
|
||||
|
||||
// HealthzPath and ReadyzPath are the internal probe routes documented in
|
||||
// `rtmanager/api/internal-openapi.yaml`.
|
||||
const (
|
||||
HealthzPath = "/healthz"
|
||||
ReadyzPath = "/readyz"
|
||||
)
|
||||
|
||||
// ReadinessProbe reports whether the dependencies the listener guards
|
||||
// (PostgreSQL, Redis, Docker) are reachable. A non-nil error is reported
|
||||
// to the caller as `503 service_unavailable` with the wrapped message.
|
||||
type ReadinessProbe interface {
|
||||
Check(ctx context.Context) error
|
||||
}
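// A production probe typically fans out to the guarded dependencies. A
// minimal sketch of that shape (names illustrative, not the actual wiring
// helper):
//
//	type compositeProbe struct {
//		checks []func(context.Context) error
//	}
//
//	func (probe compositeProbe) Check(ctx context.Context) error {
//		for _, check := range probe.checks {
//			if err := check(ctx); err != nil {
//				return err
//			}
//		}
//		return nil
//	}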
|
||||
// Config describes the trusted internal HTTP listener owned by Runtime
|
||||
// Manager.
|
||||
type Config struct {
|
||||
// Addr is the TCP listen address used by the internal HTTP server.
|
||||
Addr string
|
||||
|
||||
// ReadHeaderTimeout bounds how long the listener may spend reading
|
||||
// request headers before the server rejects the connection.
|
||||
ReadHeaderTimeout time.Duration
|
||||
|
||||
// ReadTimeout bounds how long the listener may spend reading one
|
||||
// request.
|
||||
ReadTimeout time.Duration
|
||||
|
||||
// WriteTimeout bounds how long the listener may spend writing one
|
||||
// response.
|
||||
WriteTimeout time.Duration
|
||||
|
||||
// IdleTimeout bounds how long the listener keeps an idle keep-alive
|
||||
// connection open.
|
||||
IdleTimeout time.Duration
|
||||
}
|
||||
|
||||
// Validate reports whether cfg contains a usable internal HTTP listener
|
||||
// configuration.
|
||||
func (cfg Config) Validate() error {
|
||||
switch {
|
||||
case cfg.Addr == "":
|
||||
return errors.New("internal HTTP addr must not be empty")
|
||||
case cfg.ReadHeaderTimeout <= 0:
|
||||
return errors.New("internal HTTP read header timeout must be positive")
|
||||
case cfg.ReadTimeout <= 0:
|
||||
return errors.New("internal HTTP read timeout must be positive")
|
||||
case cfg.WriteTimeout <= 0:
|
||||
return errors.New("internal HTTP write timeout must be positive")
|
||||
case cfg.IdleTimeout <= 0:
|
||||
return errors.New("internal HTTP idle timeout must be positive")
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Dependencies describes the collaborators used by the internal HTTP
|
||||
// transport layer. The listener still works when the lifecycle service
|
||||
// fields are zero — handlers register but each returns
|
||||
// `500 internal_error` until the runtime wires the real services.
|
||||
type Dependencies struct {
|
||||
// Logger writes structured listener lifecycle logs. When nil,
|
||||
// slog.Default is used.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Telemetry records low-cardinality probe metrics and lifecycle
|
||||
// events.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Readiness reports whether PG / Redis / Docker are reachable. A
|
||||
// nil readiness probe makes `/readyz` always answer `200`; the
|
||||
// runtime always supplies a real probe in production wiring.
|
||||
Readiness ReadinessProbe
|
||||
|
||||
// RuntimeRecords backs the read-only list/get handlers. When nil
|
||||
// those routes return `500 internal_error`.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// StartRuntime, StopRuntime, RestartRuntime, PatchRuntime, and
|
||||
// CleanupContainer back the lifecycle handlers. Each accepts a
|
||||
// narrow interface so tests can pass `mockgen`-generated mocks;
|
||||
// production wiring passes the concrete `*<lifecycle>.Service`
|
||||
// pointer.
|
||||
StartRuntime handlers.StartService
|
||||
StopRuntime handlers.StopService
|
||||
RestartRuntime handlers.RestartService
|
||||
PatchRuntime handlers.PatchService
|
||||
CleanupContainer handlers.CleanupService
|
||||
}
|
||||
|
||||
// Server owns the trusted internal HTTP listener exposed by Runtime
|
||||
// Manager.
|
||||
type Server struct {
|
||||
cfg Config
|
||||
|
||||
handler http.Handler
|
||||
logger *slog.Logger
|
||||
metrics *telemetry.Runtime
|
||||
|
||||
stateMu sync.RWMutex
|
||||
server *http.Server
|
||||
listener net.Listener
|
||||
}
|
||||
|
||||
// NewServer constructs one trusted internal HTTP server for cfg and deps.
|
||||
func NewServer(cfg Config, deps Dependencies) (*Server, error) {
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new internal HTTP server: %w", err)
|
||||
}
|
||||
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
return &Server{
|
||||
cfg: cfg,
|
||||
handler: newHandler(deps, logger),
|
||||
logger: logger.With("component", "internal_http"),
|
||||
metrics: deps.Telemetry,
|
||||
}, nil
|
||||
}
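// In production wiring the server is expected to be driven as a component of
// the process lifecycle; driven directly, the lifecycle looks like the sketch
// below (signal handling and dependency construction are assumptions, not
// part of this package):
//
//	server, err := internalhttp.NewServer(cfg, deps)
//	if err != nil {
//		return err
//	}
//	go func() { _ = server.Run(ctx) }()
//	// ... on shutdown:
//	shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
//	defer cancel()
//	_ = server.Shutdown(shutdownCtx)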
|
||||
// Addr returns the currently bound listener address after Run is called.
|
||||
// It returns an empty string if the server has not yet bound a listener.
|
||||
func (server *Server) Addr() string {
|
||||
server.stateMu.RLock()
|
||||
defer server.stateMu.RUnlock()
|
||||
if server.listener == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
return server.listener.Addr().String()
|
||||
}
|
||||
|
||||
// Run binds the configured listener and serves the internal HTTP surface
|
||||
// until Shutdown closes the server.
|
||||
func (server *Server) Run(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("run internal HTTP server: nil context")
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
listener, err := net.Listen("tcp", server.cfg.Addr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("run internal HTTP server: listen on %q: %w", server.cfg.Addr, err)
|
||||
}
|
||||
|
||||
httpServer := &http.Server{
|
||||
Handler: server.handler,
|
||||
ReadHeaderTimeout: server.cfg.ReadHeaderTimeout,
|
||||
ReadTimeout: server.cfg.ReadTimeout,
|
||||
WriteTimeout: server.cfg.WriteTimeout,
|
||||
IdleTimeout: server.cfg.IdleTimeout,
|
||||
}
|
||||
|
||||
server.stateMu.Lock()
|
||||
server.server = httpServer
|
||||
server.listener = listener
|
||||
server.stateMu.Unlock()
|
||||
|
||||
server.logger.Info("rtmanager internal HTTP server started", "addr", listener.Addr().String())
|
||||
|
||||
defer func() {
|
||||
server.stateMu.Lock()
|
||||
server.server = nil
|
||||
server.listener = nil
|
||||
server.stateMu.Unlock()
|
||||
}()
|
||||
|
||||
err = httpServer.Serve(listener)
|
||||
switch {
|
||||
case err == nil:
|
||||
return nil
|
||||
case errors.Is(err, http.ErrServerClosed):
|
||||
server.logger.Info("rtmanager internal HTTP server stopped")
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("run internal HTTP server: serve on %q: %w", server.cfg.Addr, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown gracefully stops the internal HTTP server within ctx.
|
||||
func (server *Server) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown internal HTTP server: nil context")
|
||||
}
|
||||
|
||||
server.stateMu.RLock()
|
||||
httpServer := server.server
|
||||
server.stateMu.RUnlock()
|
||||
|
||||
if httpServer == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := httpServer.Shutdown(ctx); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
return fmt.Errorf("shutdown internal HTTP server: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func newHandler(deps Dependencies, logger *slog.Logger) http.Handler {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("GET "+HealthzPath, handleHealthz)
|
||||
mux.HandleFunc("GET "+ReadyzPath, handleReadyz(deps.Readiness, logger))
|
||||
|
||||
handlers.Register(mux, handlers.Dependencies{
|
||||
Logger: logger,
|
||||
RuntimeRecords: deps.RuntimeRecords,
|
||||
StartRuntime: deps.StartRuntime,
|
||||
StopRuntime: deps.StopRuntime,
|
||||
RestartRuntime: deps.RestartRuntime,
|
||||
PatchRuntime: deps.PatchRuntime,
|
||||
CleanupContainer: deps.CleanupContainer,
|
||||
})
|
||||
|
||||
metrics := deps.Telemetry
|
||||
options := []otelhttp.Option{}
|
||||
if metrics != nil {
|
||||
options = append(options,
|
||||
otelhttp.WithTracerProvider(metrics.TracerProvider()),
|
||||
otelhttp.WithMeterProvider(metrics.MeterProvider()),
|
||||
)
|
||||
}
|
||||
|
||||
return otelhttp.NewHandler(withObservability(mux, metrics), "rtmanager.internal_http", options...)
|
||||
}
|
||||
|
||||
func withObservability(next http.Handler, metrics *telemetry.Runtime) http.Handler {
|
||||
return http.HandlerFunc(func(writer http.ResponseWriter, request *http.Request) {
|
||||
startedAt := time.Now()
|
||||
recorder := &statusRecorder{
|
||||
ResponseWriter: writer,
|
||||
statusCode: http.StatusOK,
|
||||
}
|
||||
|
||||
next.ServeHTTP(recorder, request)
|
||||
|
||||
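// Collapse unmatched and disallowed requests into fixed route labels so
// arbitrary request paths cannot inflate metric cardinality.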
route := request.Pattern
|
||||
switch recorder.statusCode {
|
||||
case http.StatusMethodNotAllowed:
|
||||
route = "method_not_allowed"
|
||||
case http.StatusNotFound:
|
||||
route = "not_found"
|
||||
case 0:
|
||||
route = "unmatched"
|
||||
}
|
||||
if route == "" {
|
||||
route = "unmatched"
|
||||
}
|
||||
|
||||
if metrics != nil {
|
||||
metrics.RecordInternalHTTPRequest(
|
||||
request.Context(),
|
||||
[]attribute.KeyValue{
|
||||
attribute.String("route", route),
|
||||
attribute.String("method", request.Method),
|
||||
attribute.String("status_code", strconv.Itoa(recorder.statusCode)),
|
||||
},
|
||||
time.Since(startedAt),
|
||||
)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func handleHealthz(writer http.ResponseWriter, _ *http.Request) {
|
||||
writeStatusResponse(writer, http.StatusOK, "ok")
|
||||
}
|
||||
|
||||
func handleReadyz(probe ReadinessProbe, logger *slog.Logger) http.HandlerFunc {
|
||||
return func(writer http.ResponseWriter, request *http.Request) {
|
||||
if probe == nil {
|
||||
writeStatusResponse(writer, http.StatusOK, "ready")
|
||||
return
|
||||
}
|
||||
|
||||
if err := probe.Check(request.Context()); err != nil {
|
||||
logger.WarnContext(request.Context(), "rtmanager readiness probe failed",
|
||||
"err", err.Error(),
|
||||
)
|
||||
writeServiceUnavailable(writer, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
writeStatusResponse(writer, http.StatusOK, "ready")
|
||||
}
|
||||
}
|
||||
|
||||
func writeStatusResponse(writer http.ResponseWriter, statusCode int, status string) {
|
||||
writer.Header().Set("Content-Type", jsonContentType)
|
||||
writer.WriteHeader(statusCode)
|
||||
_ = json.NewEncoder(writer).Encode(statusResponse{Status: status})
|
||||
}
|
||||
|
||||
func writeServiceUnavailable(writer http.ResponseWriter, message string) {
|
||||
writer.Header().Set("Content-Type", jsonContentType)
|
||||
writer.WriteHeader(http.StatusServiceUnavailable)
|
||||
_ = json.NewEncoder(writer).Encode(errorResponse{
|
||||
Error: errorBody{
|
||||
Code: errorCodeServiceUnavailable,
|
||||
Message: message,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
type statusResponse struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
type errorBody struct {
|
||||
Code string `json:"code"`
|
||||
Message string `json:"message"`
|
||||
}
|
||||
|
||||
type errorResponse struct {
|
||||
Error errorBody `json:"error"`
|
||||
}
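// Serialised, the error envelope has the shape
//
//	{"error": {"code": "service_unavailable", "message": "postgres ping: connection refused"}}
//
// with the message carrying the wrapped probe error verbatim.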
|
||||
type statusRecorder struct {
|
||||
http.ResponseWriter
|
||||
statusCode int
|
||||
}
|
||||
|
||||
func (recorder *statusRecorder) WriteHeader(statusCode int) {
|
||||
recorder.statusCode = statusCode
|
||||
recorder.ResponseWriter.WriteHeader(statusCode)
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
package internalhttp
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newTestConfig() Config {
|
||||
return Config{
|
||||
Addr: ":0",
|
||||
ReadHeaderTimeout: time.Second,
|
||||
ReadTimeout: time.Second,
|
||||
WriteTimeout: time.Second,
|
||||
IdleTimeout: time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
type stubReadiness struct {
|
||||
err error
|
||||
}
|
||||
|
||||
func (probe stubReadiness) Check(_ context.Context) error {
|
||||
return probe.err
|
||||
}
|
||||
|
||||
func newTestServer(t *testing.T, deps Dependencies) http.Handler {
|
||||
t.Helper()
|
||||
server, err := NewServer(newTestConfig(), deps)
|
||||
require.NoError(t, err)
|
||||
return server.handler
|
||||
}
|
||||
|
||||
func TestHealthzReturnsOK(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
handler := newTestServer(t, Dependencies{})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, HealthzPath, nil)
|
||||
handler.ServeHTTP(rec, req)
|
||||
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
require.Equal(t, jsonContentType, rec.Header().Get("Content-Type"))
|
||||
|
||||
var body statusResponse
|
||||
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
|
||||
require.Equal(t, "ok", body.Status)
|
||||
}
|
||||
|
||||
func TestReadyzReturnsReadyWhenProbeIsNil(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
handler := newTestServer(t, Dependencies{})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil)
|
||||
handler.ServeHTTP(rec, req)
|
||||
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
|
||||
var body statusResponse
|
||||
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
|
||||
require.Equal(t, "ready", body.Status)
|
||||
}
|
||||
|
||||
func TestReadyzReturnsReadyWhenProbeSucceeds(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
handler := newTestServer(t, Dependencies{Readiness: stubReadiness{}})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil)
|
||||
handler.ServeHTTP(rec, req)
|
||||
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
|
||||
var body statusResponse
|
||||
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
|
||||
require.Equal(t, "ready", body.Status)
|
||||
}
|
||||
|
||||
func TestReadyzReturnsServiceUnavailableWhenProbeFails(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
handler := newTestServer(t, Dependencies{
|
||||
Readiness: stubReadiness{err: errors.New("postgres ping: connection refused")},
|
||||
})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, ReadyzPath, nil)
|
||||
handler.ServeHTTP(rec, req)
|
||||
|
||||
require.Equal(t, http.StatusServiceUnavailable, rec.Code)
|
||||
require.Equal(t, jsonContentType, rec.Header().Get("Content-Type"))
|
||||
|
||||
var body errorResponse
|
||||
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &body))
|
||||
require.Equal(t, errorCodeServiceUnavailable, body.Error.Code)
|
||||
require.True(t, strings.Contains(body.Error.Message, "postgres"))
|
||||
}
|
||||
|
||||
func TestNewServerRejectsInvalidConfig(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, err := NewServer(Config{}, Dependencies{})
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,170 @@
|
||||
// Package app wires the Runtime Manager process lifecycle and
|
||||
// coordinates component startup and graceful shutdown.
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
)
|
||||
|
||||
// Component is a long-lived Runtime Manager subsystem that participates
|
||||
// in coordinated startup and graceful shutdown.
|
||||
type Component interface {
|
||||
// Run starts the component and blocks until it stops.
|
||||
Run(context.Context) error
|
||||
|
||||
// Shutdown stops the component within the provided timeout-bounded
|
||||
// context.
|
||||
Shutdown(context.Context) error
|
||||
}
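// `*internalhttp.Server` already satisfies this interface with its matching
// Run and Shutdown methods. A minimal component that only waits for
// cancellation looks like the sketch below (illustrative only):
//
//	type idleComponent struct{}
//
//	func (idleComponent) Run(ctx context.Context) error {
//		<-ctx.Done()
//		return ctx.Err()
//	}
//
//	func (idleComponent) Shutdown(context.Context) error { return nil }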
|
||||
// App owns the process-level lifecycle of Runtime Manager and its
|
||||
// registered components.
|
||||
type App struct {
|
||||
cfg config.Config
|
||||
components []Component
|
||||
}
|
||||
|
||||
// New constructs App with a defensive copy of the supplied components.
|
||||
func New(cfg config.Config, components ...Component) *App {
|
||||
clonedComponents := append([]Component(nil), components...)
|
||||
|
||||
return &App{
|
||||
cfg: cfg,
|
||||
components: clonedComponents,
|
||||
}
|
||||
}
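// Typical use from the process entrypoint (sketch; the signal wiring is an
// assumption, not part of this package):
//
//	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
//	defer stop()
//
//	application := app.New(cfg, internalServer)
//	if err := application.Run(ctx); err != nil {
//		// log and exit non-zero
//	}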
|
||||
// Run starts all configured components, waits for cancellation or the
|
||||
// first component failure, and then executes best-effort graceful
|
||||
// shutdown.
|
||||
func (app *App) Run(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("run rtmanager app: nil context")
|
||||
}
|
||||
if err := app.validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if len(app.components) == 0 {
|
||||
<-ctx.Done()
|
||||
return nil
|
||||
}
|
||||
|
||||
runCtx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
results := make(chan componentResult, len(app.components))
|
||||
var runWaitGroup sync.WaitGroup
|
||||
|
||||
for index, component := range app.components {
|
||||
runWaitGroup.Add(1)
|
||||
|
||||
go func(componentIndex int, component Component) {
|
||||
defer runWaitGroup.Done()
|
||||
results <- componentResult{
|
||||
index: componentIndex,
|
||||
err: component.Run(runCtx),
|
||||
}
|
||||
}(index, component)
|
||||
}
|
||||
|
||||
var runErr error
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case result := <-results:
|
||||
runErr = classifyComponentResult(ctx, result)
|
||||
}
|
||||
|
||||
cancel()
|
||||
|
||||
shutdownErr := app.shutdownComponents()
|
||||
waitErr := app.waitForComponents(&runWaitGroup)
|
||||
|
||||
return errors.Join(runErr, shutdownErr, waitErr)
|
||||
}
|
||||
|
||||
type componentResult struct {
|
||||
index int
|
||||
err error
|
||||
}
|
||||
|
||||
func (app *App) validate() error {
|
||||
if app.cfg.ShutdownTimeout <= 0 {
|
||||
return fmt.Errorf("run rtmanager app: shutdown timeout must be positive, got %s", app.cfg.ShutdownTimeout)
|
||||
}
|
||||
|
||||
for index, component := range app.components {
|
||||
if component == nil {
|
||||
return fmt.Errorf("run rtmanager app: component %d is nil", index)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
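// classifyComponentResult decides whether a component Run result counts as a
// failure: a nil error is acceptable only after the parent context was
// cancelled, and context.Canceled is swallowed only when that cancellation
// caused it; everything else is wrapped with the component index.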
func classifyComponentResult(parentCtx context.Context, result componentResult) error {
|
||||
switch {
|
||||
case result.err == nil:
|
||||
if parentCtx.Err() != nil {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("run rtmanager app: component %d exited without error before shutdown", result.index)
|
||||
case errors.Is(result.err, context.Canceled) && parentCtx.Err() != nil:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("run rtmanager app: component %d: %w", result.index, result.err)
|
||||
}
|
||||
}
|
||||
|
||||
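// shutdownComponents calls Shutdown on every component concurrently, bounding
// each call with the configured shutdown timeout, and joins any errors.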
func (app *App) shutdownComponents() error {
|
||||
var shutdownWaitGroup sync.WaitGroup
|
||||
errs := make(chan error, len(app.components))
|
||||
|
||||
for index, component := range app.components {
|
||||
shutdownWaitGroup.Add(1)
|
||||
|
||||
go func(componentIndex int, component Component) {
|
||||
defer shutdownWaitGroup.Done()
|
||||
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
|
||||
defer cancel()
|
||||
|
||||
if err := component.Shutdown(shutdownCtx); err != nil {
|
||||
errs <- fmt.Errorf("shutdown rtmanager component %d: %w", componentIndex, err)
|
||||
}
|
||||
}(index, component)
|
||||
}
|
||||
|
||||
shutdownWaitGroup.Wait()
|
||||
close(errs)
|
||||
|
||||
var joined error
|
||||
for err := range errs {
|
||||
joined = errors.Join(joined, err)
|
||||
}
|
||||
|
||||
return joined
|
||||
}
|
||||
|
||||
func (app *App) waitForComponents(runWaitGroup *sync.WaitGroup) error {
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
runWaitGroup.Wait()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
waitCtx, cancel := context.WithTimeout(context.Background(), app.cfg.ShutdownTimeout)
|
||||
defer cancel()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
return nil
|
||||
case <-waitCtx.Done():
|
||||
return fmt.Errorf("wait for rtmanager components: %w", waitCtx.Err())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type fakeComponent struct {
|
||||
runErr error
|
||||
shutdownErr error
|
||||
runHook func(context.Context) error
|
||||
shutdownHook func(context.Context) error
|
||||
runCount atomic.Int32
|
||||
downCount atomic.Int32
|
||||
blockForCtx bool
|
||||
}
|
||||
|
||||
func (component *fakeComponent) Run(ctx context.Context) error {
|
||||
component.runCount.Add(1)
|
||||
if component.runHook != nil {
|
||||
return component.runHook(ctx)
|
||||
}
|
||||
if component.blockForCtx {
|
||||
<-ctx.Done()
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
return component.runErr
|
||||
}
|
||||
|
||||
func (component *fakeComponent) Shutdown(ctx context.Context) error {
|
||||
component.downCount.Add(1)
|
||||
if component.shutdownHook != nil {
|
||||
return component.shutdownHook(ctx)
|
||||
}
|
||||
|
||||
return component.shutdownErr
|
||||
}
|
||||
|
||||
func newCfg() config.Config {
|
||||
return config.Config{ShutdownTimeout: time.Second}
|
||||
}
|
||||
|
||||
func TestAppRunWithoutComponentsBlocksUntilContextDone(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
app := New(newCfg())
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
require.NoError(t, app.Run(ctx))
|
||||
}
|
||||
|
||||
func TestAppRunReturnsOnContextCancel(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
component := &fakeComponent{blockForCtx: true}
|
||||
app := New(newCfg(), component)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
go func() {
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
require.NoError(t, app.Run(ctx))
|
||||
assert.EqualValues(t, 1, component.runCount.Load())
|
||||
assert.EqualValues(t, 1, component.downCount.Load())
|
||||
}
|
||||
|
||||
func TestAppRunPropagatesComponentFailure(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
failure := errors.New("boom")
|
||||
component := &fakeComponent{runErr: failure}
|
||||
app := New(newCfg(), component)
|
||||
|
||||
err := app.Run(context.Background())
|
||||
require.Error(t, err)
|
||||
require.ErrorIs(t, err, failure)
|
||||
assert.EqualValues(t, 1, component.downCount.Load())
|
||||
}
|
||||
|
||||
func TestAppRunFailsOnNilContext(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
app := New(newCfg())
|
||||
var ctx context.Context
|
||||
require.Error(t, app.Run(ctx))
|
||||
}
|
||||
|
||||
func TestAppRunFailsOnNonPositiveShutdownTimeout(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
app := New(config.Config{}, &fakeComponent{})
|
||||
require.Error(t, app.Run(context.Background()))
|
||||
}
|
||||
|
||||
func TestAppRunFailsOnNilComponent(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
app := New(newCfg(), nil)
|
||||
require.Error(t, app.Run(context.Background()))
|
||||
}
|
||||
|
||||
func TestAppRunFlagsCleanExitBeforeShutdown(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
component := &fakeComponent{}
|
||||
app := New(newCfg(), component)
|
||||
|
||||
err := app.Run(context.Background())
|
||||
require.Error(t, err)
|
||||
require.Contains(t, err.Error(), "exited without error")
}
|
||||
@@ -0,0 +1,85 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"galaxy/redisconn"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/docker/docker/client"
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// newRedisClient builds the master Redis client from cfg via the shared
|
||||
// `pkg/redisconn` helper. Replica clients are not opened in this iteration
|
||||
// per ARCHITECTURE.md §Persistence Backends; they will be wired when read
|
||||
// routing is introduced.
|
||||
func newRedisClient(cfg config.RedisConfig) *redis.Client {
|
||||
return redisconn.NewMasterClient(cfg.Conn)
|
||||
}
|
||||
|
||||
// instrumentRedisClient attaches the OpenTelemetry tracing and metrics
|
||||
// instrumentation to redisClient when telemetryRuntime is available. The
|
||||
// actual instrumentation lives in `pkg/redisconn` so every Galaxy service
|
||||
// shares one surface.
|
||||
func instrumentRedisClient(redisClient *redis.Client, telemetryRuntime *telemetry.Runtime) error {
|
||||
if redisClient == nil {
|
||||
return errors.New("instrument redis client: nil client")
|
||||
}
|
||||
if telemetryRuntime == nil {
|
||||
return nil
|
||||
}
|
||||
return redisconn.Instrument(redisClient,
|
||||
redisconn.WithTracerProvider(telemetryRuntime.TracerProvider()),
|
||||
redisconn.WithMeterProvider(telemetryRuntime.MeterProvider()),
|
||||
)
|
||||
}
|
||||
|
||||
// pingRedis performs a single Redis PING bounded by
|
||||
// cfg.Conn.OperationTimeout to confirm that the configured Redis endpoint
|
||||
// is reachable at startup.
|
||||
func pingRedis(ctx context.Context, cfg config.RedisConfig, redisClient *redis.Client) error {
|
||||
return redisconn.Ping(ctx, redisClient, cfg.Conn.OperationTimeout)
|
||||
}
|
||||
|
||||
// newDockerClient constructs a Docker SDK client for cfg.Host with an
|
||||
// optional API version override. The bootstrap layer opens and pings
|
||||
// the client; the production Docker adapter wraps it for the service
|
||||
// layer.
|
||||
func newDockerClient(cfg config.DockerConfig) (*client.Client, error) {
|
||||
options := []client.Opt{client.WithHost(cfg.Host)}
|
||||
if cfg.APIVersion == "" {
|
||||
options = append(options, client.WithAPIVersionNegotiation())
|
||||
} else {
|
||||
options = append(options, client.WithVersion(cfg.APIVersion))
|
||||
}
|
||||
|
||||
docker, err := client.NewClientWithOpts(options...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new docker client: %w", err)
|
||||
}
|
||||
return docker, nil
|
||||
}
|
||||
|
||||
// pingDocker bounds one Docker daemon ping under timeout and returns a
|
||||
// wrapped error so startup failures are easy to spot in service logs.
|
||||
func pingDocker(ctx context.Context, dockerClient *client.Client, timeout time.Duration) error {
|
||||
if dockerClient == nil {
|
||||
return errors.New("ping docker: nil client")
|
||||
}
|
||||
if timeout <= 0 {
|
||||
return errors.New("ping docker: timeout must be positive")
|
||||
}
|
||||
|
||||
pingCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
if _, err := dockerClient.Ping(pingCtx); err != nil {
|
||||
return fmt.Errorf("ping docker: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/redisconn"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newTestRedisCfg(addr string) config.RedisConfig {
|
||||
return config.RedisConfig{
|
||||
Conn: redisconn.Config{
|
||||
MasterAddr: addr,
|
||||
Password: "test",
|
||||
OperationTimeout: time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestPingRedisSucceedsAgainstMiniredis(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := miniredis.RunT(t)
|
||||
|
||||
redisCfg := newTestRedisCfg(server.Addr())
|
||||
client := newRedisClient(redisCfg)
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
require.NoError(t, pingRedis(context.Background(), redisCfg, client))
|
||||
}
|
||||
|
||||
func TestPingRedisReturnsErrorWhenClosed(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := miniredis.RunT(t)
|
||||
|
||||
redisCfg := newTestRedisCfg(server.Addr())
|
||||
client := newRedisClient(redisCfg)
|
||||
require.NoError(t, client.Close())
|
||||
|
||||
require.Error(t, pingRedis(context.Background(), redisCfg, client))
|
||||
}
|
||||
|
||||
func TestNewDockerClientHonoursHostOverride(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
docker, err := newDockerClient(config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
APIVersion: "1.43",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, docker)
|
||||
require.NoError(t, docker.Close())
|
||||
}
|
||||
|
||||
func TestPingDockerRejectsNilClient(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
require.Error(t, pingDocker(context.Background(), nil, time.Second))
|
||||
}
|
||||
|
||||
func TestPingDockerRejectsNonPositiveTimeout(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
docker, err := newDockerClient(config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(func() { _ = docker.Close() })
|
||||
|
||||
require.Error(t, pingDocker(context.Background(), docker, 0))
|
||||
}
|
||||
@@ -0,0 +1,262 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"galaxy/postgres"
|
||||
"galaxy/redisconn"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/migrations"
|
||||
"galaxy/rtmanager/internal/api/internalhttp"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
dockerclient "github.com/docker/docker/client"
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// Runtime owns the runnable Runtime Manager process plus the cleanup
|
||||
// functions that release runtime resources after shutdown.
|
||||
type Runtime struct {
|
||||
cfg config.Config
|
||||
|
||||
app *App
|
||||
|
||||
wiring *wiring
|
||||
|
||||
internalServer *internalhttp.Server
|
||||
|
||||
cleanupFns []func() error
|
||||
}
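// Close (not shown in this hunk) releases the registered cleanup functions;
// per the NewRuntime contract they run in reverse construction order. A
// minimal sketch of that pattern:
//
//	func (runtime *Runtime) Close() error {
//		var joined error
//		for i := len(runtime.cleanupFns) - 1; i >= 0; i-- {
//			joined = errors.Join(joined, runtime.cleanupFns[i]())
//		}
//		return joined
//	}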
|
||||
// NewRuntime constructs the runnable Runtime Manager process from cfg.
|
||||
//
|
||||
// PostgreSQL migrations apply strictly before the internal HTTP listener
|
||||
// becomes ready. The runtime opens one shared `*redis.Client`, one
|
||||
// `*sql.DB`, one Docker SDK client, and one OpenTelemetry runtime; all
|
||||
// are released in reverse construction order on shutdown.
|
||||
func NewRuntime(ctx context.Context, cfg config.Config, logger *slog.Logger) (*Runtime, error) {
|
||||
if ctx == nil {
|
||||
return nil, errors.New("new rtmanager runtime: nil context")
|
||||
}
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager runtime: %w", err)
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
runtime := &Runtime{
|
||||
cfg: cfg,
|
||||
}
|
||||
|
||||
cleanupOnError := func(err error) (*Runtime, error) {
|
||||
if cleanupErr := runtime.Close(); cleanupErr != nil {
|
||||
return nil, fmt.Errorf("%w; cleanup: %w", err, cleanupErr)
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
telemetryRuntime, err := telemetry.NewProcess(ctx, telemetry.ProcessConfig{
|
||||
ServiceName: cfg.Telemetry.ServiceName,
|
||||
TracesExporter: cfg.Telemetry.TracesExporter,
|
||||
MetricsExporter: cfg.Telemetry.MetricsExporter,
|
||||
TracesProtocol: cfg.Telemetry.TracesProtocol,
|
||||
MetricsProtocol: cfg.Telemetry.MetricsProtocol,
|
||||
StdoutTracesEnabled: cfg.Telemetry.StdoutTracesEnabled,
|
||||
StdoutMetricsEnabled: cfg.Telemetry.StdoutMetricsEnabled,
|
||||
}, logger)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: telemetry: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), cfg.ShutdownTimeout)
|
||||
defer cancel()
|
||||
return telemetryRuntime.Shutdown(shutdownCtx)
|
||||
})
|
||||
|
||||
redisClient := newRedisClient(cfg.Redis)
|
||||
if err := instrumentRedisClient(redisClient, telemetryRuntime); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
|
||||
err := redisClient.Close()
|
||||
if errors.Is(err, redis.ErrClosed) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
})
|
||||
if err := pingRedis(ctx, cfg.Redis, redisClient); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
|
||||
pgPool, err := postgres.OpenPrimary(ctx, cfg.Postgres.Conn,
|
||||
postgres.WithTracerProvider(telemetryRuntime.TracerProvider()),
|
||||
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
|
||||
)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: open postgres: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, pgPool.Close)
|
||||
unregisterPGStats, err := postgres.InstrumentDBStats(pgPool,
|
||||
postgres.WithMeterProvider(telemetryRuntime.MeterProvider()),
|
||||
)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: instrument postgres: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, func() error {
|
||||
return unregisterPGStats()
|
||||
})
|
||||
if err := postgres.Ping(ctx, pgPool, cfg.Postgres.Conn.OperationTimeout); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: ping postgres: %w", err))
|
||||
}
|
||||
if err := postgres.RunMigrations(ctx, pgPool, migrations.FS(), "."); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: run postgres migrations: %w", err))
|
||||
}
|
||||
|
||||
dockerClient, err := newDockerClient(cfg.Docker)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, dockerClient.Close)
|
||||
// The Docker config carries no dedicated ping timeout; reuse the
|
||||
// Postgres per-call timeout to bound the daemon ping.
|
||||
if err := pingDocker(ctx, dockerClient, cfg.Postgres.Conn.OperationTimeout); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: %w", err))
|
||||
}
|
||||
|
||||
wiring, err := newWiring(cfg, redisClient, pgPool, dockerClient, time.Now, logger, telemetryRuntime)
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: wiring: %w", err))
|
||||
}
|
||||
runtime.wiring = wiring
|
||||
runtime.cleanupFns = append(runtime.cleanupFns, wiring.close)
|
||||
if err := wiring.registerTelemetryGauges(); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: register telemetry gauges: %w", err))
|
||||
}
|
||||
|
||||
if err := wiring.reconciler.ReconcileNow(ctx); err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: initial reconcile: %w", err))
|
||||
}
|
||||
|
||||
probe := newReadinessProbe(pgPool, redisClient, dockerClient, cfg)
|
||||
|
||||
internalServer, err := internalhttp.NewServer(internalhttp.Config{
|
||||
Addr: cfg.InternalHTTP.Addr,
|
||||
ReadHeaderTimeout: cfg.InternalHTTP.ReadHeaderTimeout,
|
||||
ReadTimeout: cfg.InternalHTTP.ReadTimeout,
|
||||
WriteTimeout: cfg.InternalHTTP.WriteTimeout,
|
||||
IdleTimeout: cfg.InternalHTTP.IdleTimeout,
|
||||
}, internalhttp.Dependencies{
|
||||
Logger: logger,
|
||||
Telemetry: telemetryRuntime,
|
||||
Readiness: probe,
|
||||
RuntimeRecords: wiring.runtimeRecordStore,
|
||||
StartRuntime: wiring.startRuntimeService,
|
||||
StopRuntime: wiring.stopRuntimeService,
|
||||
RestartRuntime: wiring.restartRuntimeService,
|
||||
PatchRuntime: wiring.patchRuntimeService,
|
||||
CleanupContainer: wiring.cleanupContainerService,
|
||||
})
|
||||
if err != nil {
|
||||
return cleanupOnError(fmt.Errorf("new rtmanager runtime: internal HTTP server: %w", err))
|
||||
}
|
||||
runtime.internalServer = internalServer
|
||||
|
||||
runtime.app = New(cfg,
|
||||
internalServer,
|
||||
wiring.startJobsConsumer,
|
||||
wiring.stopJobsConsumer,
|
||||
wiring.dockerEventsListener,
|
||||
wiring.healthProbeWorker,
|
||||
wiring.dockerInspectWorker,
|
||||
wiring.reconciler,
|
||||
wiring.containerCleanupWorker,
|
||||
)
|
||||
|
||||
return runtime, nil
|
||||
}
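// Illustrative call sequence from a hypothetical main package. Only the
// surrounding run helper is an assumption; NewRuntime, Run, Close, and
// config.LoadFromEnv are the entry points defined in this commit:
//
//	func run(ctx context.Context, logger *slog.Logger) error {
//		cfg, err := config.LoadFromEnv()
//		if err != nil {
//			return err
//		}
//		rt, err := app.NewRuntime(ctx, cfg, logger)
//		if err != nil {
//			return err
//		}
//		defer func() { _ = rt.Close() }()
//		return rt.Run(ctx)
//	}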
|
||||
|
||||
// InternalServer returns the internal HTTP server owned by runtime. It is
|
||||
// primarily exposed for tests; production code should not depend on it.
|
||||
func (runtime *Runtime) InternalServer() *internalhttp.Server {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return runtime.internalServer
|
||||
}
|
||||
|
||||
// Run drives every registered component (the internal HTTP listener,
|
||||
// stream consumers, and background workers) until ctx is canceled or
|
||||
// one component fails.
|
||||
func (runtime *Runtime) Run(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("run rtmanager runtime: nil context")
|
||||
}
|
||||
if runtime == nil {
|
||||
return errors.New("run rtmanager runtime: nil runtime")
|
||||
}
|
||||
if runtime.app == nil {
|
||||
return errors.New("run rtmanager runtime: nil app")
|
||||
}
|
||||
|
||||
return runtime.app.Run(ctx)
|
||||
}
|
||||
|
||||
// Close releases every runtime dependency in reverse construction order.
|
||||
// Close is safe to call multiple times.
|
||||
func (runtime *Runtime) Close() error {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var joined error
|
||||
for index := len(runtime.cleanupFns) - 1; index >= 0; index-- {
|
||||
if err := runtime.cleanupFns[index](); err != nil {
|
||||
joined = errors.Join(joined, err)
|
||||
}
|
||||
}
|
||||
runtime.cleanupFns = nil
|
||||
|
||||
return joined
|
||||
}
|
||||
|
||||
// readinessProbe pings every steady-state dependency the listener
|
||||
// guards: PostgreSQL primary, Redis master, the Docker daemon, plus
|
||||
// the configured Docker network's existence.
|
||||
type readinessProbe struct {
|
||||
pgPool *sql.DB
|
||||
redisClient *redis.Client
|
||||
dockerClient *dockerclient.Client
|
||||
|
||||
postgresTimeout time.Duration
|
||||
redisTimeout time.Duration
|
||||
dockerTimeout time.Duration
|
||||
}
|
||||
|
||||
func newReadinessProbe(pgPool *sql.DB, redisClient *redis.Client, dockerClient *dockerclient.Client, cfg config.Config) *readinessProbe {
|
||||
return &readinessProbe{
|
||||
pgPool: pgPool,
|
||||
redisClient: redisClient,
|
||||
dockerClient: dockerClient,
|
||||
postgresTimeout: cfg.Postgres.Conn.OperationTimeout,
|
||||
redisTimeout: cfg.Redis.Conn.OperationTimeout,
|
||||
dockerTimeout: cfg.Postgres.Conn.OperationTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
// Check pings PostgreSQL, Redis, and Docker. The first failing
|
||||
// dependency aborts the check so callers see a single, actionable
|
||||
// error.
|
||||
func (probe *readinessProbe) Check(ctx context.Context) error {
|
||||
if err := postgres.Ping(ctx, probe.pgPool, probe.postgresTimeout); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := redisconn.Ping(ctx, probe.redisClient, probe.redisTimeout); err != nil {
|
||||
return err
|
||||
}
|
||||
return pingDocker(ctx, probe.dockerClient, probe.dockerTimeout)
|
||||
}
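// The probe is handed to the internal HTTP server as its Readiness
// dependency. A handler consuming it might translate a Check failure
// into a 503 roughly as follows; the handler shape is assumed, the real
// one lives in the internalhttp package:
//
//	func readyHandler(probe interface{ Check(context.Context) error }) http.HandlerFunc {
//		return func(w http.ResponseWriter, r *http.Request) {
//			if err := probe.Check(r.Context()); err != nil {
//				http.Error(w, err.Error(), http.StatusServiceUnavailable)
//				return
//			}
//			w.WriteHeader(http.StatusOK)
//		}
//	}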
|
||||
@@ -0,0 +1,541 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/docker"
|
||||
"galaxy/rtmanager/internal/adapters/healtheventspublisher"
|
||||
"galaxy/rtmanager/internal/adapters/jobresultspublisher"
|
||||
"galaxy/rtmanager/internal/adapters/lobbyclient"
|
||||
"galaxy/rtmanager/internal/adapters/notificationpublisher"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/healthsnapshotstore"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/operationlogstore"
|
||||
"galaxy/rtmanager/internal/adapters/postgres/runtimerecordstore"
|
||||
"galaxy/rtmanager/internal/adapters/redisstate/gamelease"
|
||||
"galaxy/rtmanager/internal/adapters/redisstate/streamoffsets"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/cleanupcontainer"
|
||||
"galaxy/rtmanager/internal/service/patchruntime"
|
||||
"galaxy/rtmanager/internal/service/restartruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
"galaxy/rtmanager/internal/worker/containercleanup"
|
||||
"galaxy/rtmanager/internal/worker/dockerevents"
|
||||
"galaxy/rtmanager/internal/worker/dockerinspect"
|
||||
"galaxy/rtmanager/internal/worker/healthprobe"
|
||||
"galaxy/rtmanager/internal/worker/reconcile"
|
||||
"galaxy/rtmanager/internal/worker/startjobsconsumer"
|
||||
"galaxy/rtmanager/internal/worker/stopjobsconsumer"
|
||||
|
||||
dockerclient "github.com/docker/docker/client"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
)
|
||||
|
||||
// wiring owns the process-level singletons constructed once during
|
||||
// `NewRuntime` and consumed by every worker and HTTP handler.
|
||||
//
|
||||
// The struct keeps the store / adapter / service singletons as typed
|
||||
// fields so code in this package can hand them straight to workers and
|
||||
// HTTP handlers.
|
||||
type wiring struct {
|
||||
cfg config.Config
|
||||
|
||||
redisClient *redis.Client
|
||||
pgPool *sql.DB
|
||||
dockerClient *dockerclient.Client
|
||||
|
||||
clock func() time.Time
|
||||
|
||||
logger *slog.Logger
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
// Persistence stores.
|
||||
runtimeRecordStore *runtimerecordstore.Store
|
||||
operationLogStore *operationlogstore.Store
|
||||
healthSnapshotStore *healthsnapshotstore.Store
|
||||
streamOffsetStore *streamoffsets.Store
|
||||
gameLeaseStore *gamelease.Store
|
||||
|
||||
// External adapters.
|
||||
dockerAdapter *docker.Client
|
||||
lobbyClient *lobbyclient.Client
|
||||
notificationPublisher *notificationpublisher.Publisher
|
||||
healthEventsPublisher *healtheventspublisher.Publisher
|
||||
jobResultsPublisher *jobresultspublisher.Publisher
|
||||
|
||||
// Service layer.
|
||||
startRuntimeService *startruntime.Service
|
||||
stopRuntimeService *stopruntime.Service
|
||||
restartRuntimeService *restartruntime.Service
|
||||
patchRuntimeService *patchruntime.Service
|
||||
cleanupContainerService *cleanupcontainer.Service
|
||||
|
||||
// Worker layer.
|
||||
startJobsConsumer *startjobsconsumer.Consumer
|
||||
stopJobsConsumer *stopjobsconsumer.Consumer
|
||||
dockerEventsListener *dockerevents.Listener
|
||||
healthProbeWorker *healthprobe.Worker
|
||||
dockerInspectWorker *dockerinspect.Worker
|
||||
reconciler *reconcile.Reconciler
|
||||
containerCleanupWorker *containercleanup.Worker
|
||||
|
||||
// closers releases adapter-level resources at runtime shutdown.
|
||||
closers []func() error
|
||||
}
|
||||
|
||||
// newWiring constructs the process-level dependency set, the persistence
|
||||
// stores, the external adapters, and the service layer. It validates
|
||||
// every required collaborator so callers can rely on them being non-nil.
|
||||
func newWiring(
|
||||
cfg config.Config,
|
||||
redisClient *redis.Client,
|
||||
pgPool *sql.DB,
|
||||
dockerClient *dockerclient.Client,
|
||||
clock func() time.Time,
|
||||
logger *slog.Logger,
|
||||
telemetryRuntime *telemetry.Runtime,
|
||||
) (*wiring, error) {
|
||||
if redisClient == nil {
|
||||
return nil, errors.New("new rtmanager wiring: nil redis client")
|
||||
}
|
||||
if pgPool == nil {
|
||||
return nil, errors.New("new rtmanager wiring: nil postgres pool")
|
||||
}
|
||||
if dockerClient == nil {
|
||||
return nil, errors.New("new rtmanager wiring: nil docker client")
|
||||
}
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
if telemetryRuntime == nil {
|
||||
return nil, fmt.Errorf("new rtmanager wiring: nil telemetry runtime")
|
||||
}
|
||||
|
||||
w := &wiring{
|
||||
cfg: cfg,
|
||||
redisClient: redisClient,
|
||||
pgPool: pgPool,
|
||||
dockerClient: dockerClient,
|
||||
clock: clock,
|
||||
logger: logger,
|
||||
telemetry: telemetryRuntime,
|
||||
}
|
||||
|
||||
if err := w.buildPersistence(); err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
|
||||
}
|
||||
if err := w.buildAdapters(); err != nil {
|
||||
_ = w.close()
|
||||
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
|
||||
}
|
||||
if err := w.buildServices(); err != nil {
|
||||
_ = w.close()
|
||||
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
|
||||
}
|
||||
if err := w.buildWorkers(); err != nil {
|
||||
_ = w.close()
|
||||
return nil, fmt.Errorf("new rtmanager wiring: %w", err)
|
||||
}
|
||||
return w, nil
|
||||
}
|
||||
|
||||
func (w *wiring) buildPersistence() error {
|
||||
runtimeStore, err := runtimerecordstore.New(runtimerecordstore.Config{
|
||||
DB: w.pgPool,
|
||||
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("runtime record store: %w", err)
|
||||
}
|
||||
w.runtimeRecordStore = runtimeStore
|
||||
|
||||
operationStore, err := operationlogstore.New(operationlogstore.Config{
|
||||
DB: w.pgPool,
|
||||
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("operation log store: %w", err)
|
||||
}
|
||||
w.operationLogStore = operationStore
|
||||
|
||||
snapshotStore, err := healthsnapshotstore.New(healthsnapshotstore.Config{
|
||||
DB: w.pgPool,
|
||||
OperationTimeout: w.cfg.Postgres.Conn.OperationTimeout,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("health snapshot store: %w", err)
|
||||
}
|
||||
w.healthSnapshotStore = snapshotStore
|
||||
|
||||
offsetStore, err := streamoffsets.New(streamoffsets.Config{Client: w.redisClient})
|
||||
if err != nil {
|
||||
return fmt.Errorf("stream offset store: %w", err)
|
||||
}
|
||||
w.streamOffsetStore = offsetStore
|
||||
|
||||
leaseStore, err := gamelease.New(gamelease.Config{Client: w.redisClient})
|
||||
if err != nil {
|
||||
return fmt.Errorf("game lease store: %w", err)
|
||||
}
|
||||
w.gameLeaseStore = leaseStore
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *wiring) buildAdapters() error {
|
||||
dockerAdapter, err := docker.NewClient(docker.Config{
|
||||
Docker: w.dockerClient,
|
||||
LogDriver: w.cfg.Docker.LogDriver,
|
||||
LogOpts: w.cfg.Docker.LogOpts,
|
||||
Clock: w.clock,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("docker adapter: %w", err)
|
||||
}
|
||||
w.dockerAdapter = dockerAdapter
|
||||
|
||||
lobby, err := lobbyclient.NewClient(lobbyclient.Config{
|
||||
BaseURL: w.cfg.Lobby.BaseURL,
|
||||
RequestTimeout: w.cfg.Lobby.Timeout,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("lobby client: %w", err)
|
||||
}
|
||||
w.lobbyClient = lobby
|
||||
w.closers = append(w.closers, lobby.Close)
|
||||
|
||||
notificationPub, err := notificationpublisher.NewPublisher(notificationpublisher.Config{
|
||||
Client: w.redisClient,
|
||||
Stream: w.cfg.Streams.NotificationIntents,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("notification publisher: %w", err)
|
||||
}
|
||||
w.notificationPublisher = notificationPub
|
||||
|
||||
healthPub, err := healtheventspublisher.NewPublisher(healtheventspublisher.Config{
|
||||
Client: w.redisClient,
|
||||
Snapshots: w.healthSnapshotStore,
|
||||
Stream: w.cfg.Streams.HealthEvents,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("health events publisher: %w", err)
|
||||
}
|
||||
w.healthEventsPublisher = healthPub
|
||||
|
||||
jobResultsPub, err := jobresultspublisher.NewPublisher(jobresultspublisher.Config{
|
||||
Client: w.redisClient,
|
||||
Stream: w.cfg.Streams.JobResults,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("job results publisher: %w", err)
|
||||
}
|
||||
w.jobResultsPublisher = jobResultsPub
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *wiring) buildServices() error {
|
||||
startService, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
OperationLogs: w.operationLogStore,
|
||||
Docker: w.dockerAdapter,
|
||||
Leases: w.gameLeaseStore,
|
||||
HealthEvents: w.healthEventsPublisher,
|
||||
Notifications: w.notificationPublisher,
|
||||
Lobby: w.lobbyClient,
|
||||
Container: w.cfg.Container,
|
||||
DockerCfg: w.cfg.Docker,
|
||||
Coordination: w.cfg.Coordination,
|
||||
Telemetry: w.telemetry,
|
||||
Logger: w.logger,
|
||||
Clock: w.clock,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("start runtime service: %w", err)
|
||||
}
|
||||
w.startRuntimeService = startService
|
||||
|
||||
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
OperationLogs: w.operationLogStore,
|
||||
Docker: w.dockerAdapter,
|
||||
Leases: w.gameLeaseStore,
|
||||
HealthEvents: w.healthEventsPublisher,
|
||||
Container: w.cfg.Container,
|
||||
Coordination: w.cfg.Coordination,
|
||||
Telemetry: w.telemetry,
|
||||
Logger: w.logger,
|
||||
Clock: w.clock,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("stop runtime service: %w", err)
|
||||
}
|
||||
w.stopRuntimeService = stopService
|
||||
|
||||
restartService, err := restartruntime.NewService(restartruntime.Dependencies{
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
OperationLogs: w.operationLogStore,
|
||||
Docker: w.dockerAdapter,
|
||||
Leases: w.gameLeaseStore,
|
||||
StopService: stopService,
|
||||
StartService: startService,
|
||||
Coordination: w.cfg.Coordination,
|
||||
Telemetry: w.telemetry,
|
||||
Logger: w.logger,
|
||||
Clock: w.clock,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("restart runtime service: %w", err)
|
||||
}
|
||||
w.restartRuntimeService = restartService
|
||||
|
||||
patchService, err := patchruntime.NewService(patchruntime.Dependencies{
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
OperationLogs: w.operationLogStore,
|
||||
Docker: w.dockerAdapter,
|
||||
Leases: w.gameLeaseStore,
|
||||
StopService: stopService,
|
||||
StartService: startService,
|
||||
Coordination: w.cfg.Coordination,
|
||||
Telemetry: w.telemetry,
|
||||
Logger: w.logger,
|
||||
Clock: w.clock,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("patch runtime service: %w", err)
|
||||
}
|
||||
w.patchRuntimeService = patchService
|
||||
|
||||
cleanupService, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
OperationLogs: w.operationLogStore,
|
||||
Docker: w.dockerAdapter,
|
||||
Leases: w.gameLeaseStore,
|
||||
Coordination: w.cfg.Coordination,
|
||||
Telemetry: w.telemetry,
|
||||
Logger: w.logger,
|
||||
Clock: w.clock,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("cleanup container service: %w", err)
|
||||
}
|
||||
w.cleanupContainerService = cleanupService
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// buildWorkers constructs the asynchronous workers: the Lobby ↔ RTM
|
||||
// start/stop job stream consumers, the Docker events listener, the
|
||||
// health probe and inspect workers, the reconciler, and the
|
||||
// container-cleanup worker. Each participates in the process lifecycle
|
||||
// as an `app.Component`; `internal/app/runtime.go` passes them into
|
||||
// `app.New` alongside the internal HTTP server.
|
||||
func (w *wiring) buildWorkers() error {
|
||||
startConsumer, err := startjobsconsumer.NewConsumer(startjobsconsumer.Config{
|
||||
Client: w.redisClient,
|
||||
Stream: w.cfg.Streams.StartJobs,
|
||||
BlockTimeout: w.cfg.Streams.BlockTimeout,
|
||||
StartService: w.startRuntimeService,
|
||||
JobResults: w.jobResultsPublisher,
|
||||
OffsetStore: w.streamOffsetStore,
|
||||
Logger: w.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("start jobs consumer: %w", err)
|
||||
}
|
||||
w.startJobsConsumer = startConsumer
|
||||
|
||||
stopConsumer, err := stopjobsconsumer.NewConsumer(stopjobsconsumer.Config{
|
||||
Client: w.redisClient,
|
||||
Stream: w.cfg.Streams.StopJobs,
|
||||
BlockTimeout: w.cfg.Streams.BlockTimeout,
|
||||
StopService: w.stopRuntimeService,
|
||||
JobResults: w.jobResultsPublisher,
|
||||
OffsetStore: w.streamOffsetStore,
|
||||
Logger: w.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("stop jobs consumer: %w", err)
|
||||
}
|
||||
w.stopJobsConsumer = stopConsumer
|
||||
|
||||
eventsListener, err := dockerevents.NewListener(dockerevents.Dependencies{
|
||||
Docker: w.dockerAdapter,
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
HealthEvents: w.healthEventsPublisher,
|
||||
Telemetry: w.telemetry,
|
||||
Clock: w.clock,
|
||||
Logger: w.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("docker events listener: %w", err)
|
||||
}
|
||||
w.dockerEventsListener = eventsListener
|
||||
|
||||
probeHTTPClient, err := newProbeHTTPClient(w.telemetry)
|
||||
if err != nil {
|
||||
return fmt.Errorf("health probe http client: %w", err)
|
||||
}
|
||||
probeWorker, err := healthprobe.NewWorker(healthprobe.Dependencies{
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
HealthEvents: w.healthEventsPublisher,
|
||||
HTTPClient: probeHTTPClient,
|
||||
Telemetry: w.telemetry,
|
||||
Interval: w.cfg.Health.ProbeInterval,
|
||||
ProbeTimeout: w.cfg.Health.ProbeTimeout,
|
||||
FailuresThreshold: w.cfg.Health.ProbeFailuresThreshold,
|
||||
Clock: w.clock,
|
||||
Logger: w.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("health probe worker: %w", err)
|
||||
}
|
||||
w.healthProbeWorker = probeWorker
|
||||
|
||||
inspectWorker, err := dockerinspect.NewWorker(dockerinspect.Dependencies{
|
||||
Docker: w.dockerAdapter,
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
HealthEvents: w.healthEventsPublisher,
|
||||
Telemetry: w.telemetry,
|
||||
Interval: w.cfg.Health.InspectInterval,
|
||||
Clock: w.clock,
|
||||
Logger: w.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("docker inspect worker: %w", err)
|
||||
}
|
||||
w.dockerInspectWorker = inspectWorker
|
||||
|
||||
reconciler, err := reconcile.NewReconciler(reconcile.Dependencies{
|
||||
Docker: w.dockerAdapter,
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
OperationLogs: w.operationLogStore,
|
||||
HealthEvents: w.healthEventsPublisher,
|
||||
Leases: w.gameLeaseStore,
|
||||
Telemetry: w.telemetry,
|
||||
DockerCfg: w.cfg.Docker,
|
||||
ContainerCfg: w.cfg.Container,
|
||||
Coordination: w.cfg.Coordination,
|
||||
Interval: w.cfg.Cleanup.ReconcileInterval,
|
||||
Clock: w.clock,
|
||||
Logger: w.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("reconciler: %w", err)
|
||||
}
|
||||
w.reconciler = reconciler
|
||||
|
||||
cleanupWorker, err := containercleanup.NewWorker(containercleanup.Dependencies{
|
||||
RuntimeRecords: w.runtimeRecordStore,
|
||||
Cleanup: w.cleanupContainerService,
|
||||
Retention: w.cfg.Container.Retention,
|
||||
Interval: w.cfg.Cleanup.CleanupInterval,
|
||||
Clock: w.clock,
|
||||
Logger: w.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("container cleanup worker: %w", err)
|
||||
}
|
||||
w.containerCleanupWorker = cleanupWorker
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// newProbeHTTPClient constructs the otelhttp-instrumented HTTP client
|
||||
// the active health probe uses to call engine `/healthz`. It clones
|
||||
// `http.DefaultTransport` so the instrumentation never mutates the
|
||||
// shared default transport (mirroring the lobby internal client).
|
||||
func newProbeHTTPClient(telemetryRuntime *telemetry.Runtime) (*http.Client, error) {
|
||||
transport, ok := http.DefaultTransport.(*http.Transport)
|
||||
if !ok {
|
||||
return nil, errors.New("default http transport is not *http.Transport")
|
||||
}
|
||||
cloned := transport.Clone()
|
||||
instrumented := otelhttp.NewTransport(cloned,
|
||||
otelhttp.WithTracerProvider(telemetryRuntime.TracerProvider()),
|
||||
otelhttp.WithMeterProvider(telemetryRuntime.MeterProvider()),
|
||||
)
|
||||
return &http.Client{Transport: instrumented}, nil
|
||||
}
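// Sketch of how the health probe worker might use this client to call
// the engine health endpoint; the helper name and the use of
// docker.EnginePort to shape the URL are assumptions for illustration:
//
//	func probeOnce(ctx context.Context, client *http.Client, containerIP string, timeout time.Duration) error {
//		reqCtx, cancel := context.WithTimeout(ctx, timeout)
//		defer cancel()
//		url := fmt.Sprintf("http://%s:%d/healthz", containerIP, docker.EnginePort)
//		req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, url, nil)
//		if err != nil {
//			return err
//		}
//		resp, err := client.Do(req)
//		if err != nil {
//			return err
//		}
//		defer resp.Body.Close()
//		if resp.StatusCode != http.StatusOK {
//			return fmt.Errorf("engine healthz returned status %d", resp.StatusCode)
//		}
//		return nil
//	}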
|
||||
|
||||
// registerTelemetryGauges installs the runtime-records-by-status gauge
|
||||
// callback so the telemetry runtime can observe the persistent store
|
||||
// without holding a strong reference to the wiring.
|
||||
func (w *wiring) registerTelemetryGauges() error {
|
||||
probe := newRuntimeRecordsProbe(w.runtimeRecordStore)
|
||||
return w.telemetry.RegisterGauges(telemetry.GaugeDependencies{
|
||||
RuntimeRecordsByStatus: probe,
|
||||
Logger: w.logger,
|
||||
})
|
||||
}
|
||||
|
||||
// close releases adapter-level resources owned by the wiring layer.
|
||||
// Returns the joined error of every closer; the caller is expected to
|
||||
// invoke this once during process shutdown.
|
||||
func (w *wiring) close() error {
|
||||
var joined error
|
||||
for index := len(w.closers) - 1; index >= 0; index-- {
|
||||
if err := w.closers[index](); err != nil {
|
||||
joined = errors.Join(joined, err)
|
||||
}
|
||||
}
|
||||
w.closers = nil
|
||||
return joined
|
||||
}
|
||||
|
||||
// runtimeRecordsProbe adapts runtimerecordstore.Store to
|
||||
// telemetry.RuntimeRecordsByStatusProbe by translating the typed status
|
||||
// keys into the string keys the gauge expects.
|
||||
type runtimeRecordsProbe struct {
|
||||
store *runtimerecordstore.Store
|
||||
}
|
||||
|
||||
func newRuntimeRecordsProbe(store *runtimerecordstore.Store) *runtimeRecordsProbe {
|
||||
return &runtimeRecordsProbe{store: store}
|
||||
}
|
||||
|
||||
func (p *runtimeRecordsProbe) CountByStatus(ctx context.Context) (map[string]int, error) {
|
||||
if p == nil || p.store == nil {
|
||||
return nil, errors.New("runtime records probe: nil store")
|
||||
}
|
||||
counts, err := p.store.CountByStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := make(map[string]int, len(counts))
|
||||
for status, count := range counts {
|
||||
out[string(status)] = count
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// Compile-time assertions that the constructed adapters satisfy the
|
||||
// expected port surfaces; these prevent silent regressions when a
|
||||
// port shape changes.
|
||||
var (
|
||||
_ ports.RuntimeRecordStore = (*runtimerecordstore.Store)(nil)
|
||||
_ ports.OperationLogStore = (*operationlogstore.Store)(nil)
|
||||
_ ports.HealthSnapshotStore = (*healthsnapshotstore.Store)(nil)
|
||||
_ ports.StreamOffsetStore = (*streamoffsets.Store)(nil)
|
||||
_ ports.GameLeaseStore = (*gamelease.Store)(nil)
|
||||
_ ports.DockerClient = (*docker.Client)(nil)
|
||||
_ ports.LobbyInternalClient = (*lobbyclient.Client)(nil)
|
||||
_ ports.NotificationIntentPublisher = (*notificationpublisher.Publisher)(nil)
|
||||
_ ports.HealthEventPublisher = (*healtheventspublisher.Publisher)(nil)
|
||||
_ ports.JobResultPublisher = (*jobresultspublisher.Publisher)(nil)
|
||||
|
||||
_ Component = (*reconcile.Reconciler)(nil)
|
||||
_ Component = (*containercleanup.Worker)(nil)
|
||||
_ containercleanup.Cleaner = (*cleanupcontainer.Service)(nil)
|
||||
)
|
||||
|
||||
@@ -0,0 +1,632 @@
|
||||
// Package config loads the Runtime Manager process configuration from
|
||||
// environment variables.
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/postgres"
|
||||
"galaxy/redisconn"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
const (
|
||||
envPrefix = "RTMANAGER"
|
||||
|
||||
shutdownTimeoutEnvVar = "RTMANAGER_SHUTDOWN_TIMEOUT"
|
||||
logLevelEnvVar = "RTMANAGER_LOG_LEVEL"
|
||||
|
||||
internalHTTPAddrEnvVar = "RTMANAGER_INTERNAL_HTTP_ADDR"
|
||||
internalHTTPReadHeaderTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_HEADER_TIMEOUT"
|
||||
internalHTTPReadTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_READ_TIMEOUT"
|
||||
internalHTTPWriteTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_WRITE_TIMEOUT"
|
||||
internalHTTPIdleTimeoutEnvVar = "RTMANAGER_INTERNAL_HTTP_IDLE_TIMEOUT"
|
||||
|
||||
dockerHostEnvVar = "RTMANAGER_DOCKER_HOST"
|
||||
dockerAPIVersionEnvVar = "RTMANAGER_DOCKER_API_VERSION"
|
||||
dockerNetworkEnvVar = "RTMANAGER_DOCKER_NETWORK"
|
||||
dockerLogDriverEnvVar = "RTMANAGER_DOCKER_LOG_DRIVER"
|
||||
dockerLogOptsEnvVar = "RTMANAGER_DOCKER_LOG_OPTS"
|
||||
imagePullPolicyEnvVar = "RTMANAGER_IMAGE_PULL_POLICY"
|
||||
|
||||
defaultCPUQuotaEnvVar = "RTMANAGER_DEFAULT_CPU_QUOTA"
|
||||
defaultMemoryEnvVar = "RTMANAGER_DEFAULT_MEMORY"
|
||||
defaultPIDsLimitEnvVar = "RTMANAGER_DEFAULT_PIDS_LIMIT"
|
||||
containerStopTimeoutSecondsEnvVar = "RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS"
|
||||
containerRetentionDaysEnvVar = "RTMANAGER_CONTAINER_RETENTION_DAYS"
|
||||
engineStateMountPathEnvVar = "RTMANAGER_ENGINE_STATE_MOUNT_PATH"
|
||||
engineStateEnvNameEnvVar = "RTMANAGER_ENGINE_STATE_ENV_NAME"
|
||||
gameStateDirModeEnvVar = "RTMANAGER_GAME_STATE_DIR_MODE"
|
||||
gameStateOwnerUIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_UID"
|
||||
gameStateOwnerGIDEnvVar = "RTMANAGER_GAME_STATE_OWNER_GID"
|
||||
gameStateRootEnvVar = "RTMANAGER_GAME_STATE_ROOT"
|
||||
|
||||
startJobsStreamEnvVar = "RTMANAGER_REDIS_START_JOBS_STREAM"
|
||||
stopJobsStreamEnvVar = "RTMANAGER_REDIS_STOP_JOBS_STREAM"
|
||||
jobResultsStreamEnvVar = "RTMANAGER_REDIS_JOB_RESULTS_STREAM"
|
||||
healthEventsStreamEnvVar = "RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"
|
||||
notificationIntentsStreamEnv = "RTMANAGER_NOTIFICATION_INTENTS_STREAM"
|
||||
streamBlockTimeoutEnvVar = "RTMANAGER_STREAM_BLOCK_TIMEOUT"
|
||||
|
||||
inspectIntervalEnvVar = "RTMANAGER_INSPECT_INTERVAL"
|
||||
probeIntervalEnvVar = "RTMANAGER_PROBE_INTERVAL"
|
||||
probeTimeoutEnvVar = "RTMANAGER_PROBE_TIMEOUT"
|
||||
probeFailuresThresholdEnvVar = "RTMANAGER_PROBE_FAILURES_THRESHOLD"
|
||||
|
||||
reconcileIntervalEnvVar = "RTMANAGER_RECONCILE_INTERVAL"
|
||||
cleanupIntervalEnvVar = "RTMANAGER_CLEANUP_INTERVAL"
|
||||
|
||||
gameLeaseTTLSecondsEnvVar = "RTMANAGER_GAME_LEASE_TTL_SECONDS"
|
||||
|
||||
lobbyInternalBaseURLEnvVar = "RTMANAGER_LOBBY_INTERNAL_BASE_URL"
|
||||
lobbyInternalTimeoutEnvVar = "RTMANAGER_LOBBY_INTERNAL_TIMEOUT"
|
||||
|
||||
otelServiceNameEnvVar = "OTEL_SERVICE_NAME"
|
||||
otelTracesExporterEnvVar = "OTEL_TRACES_EXPORTER"
|
||||
otelMetricsExporterEnvVar = "OTEL_METRICS_EXPORTER"
|
||||
otelExporterOTLPProtocolEnvVar = "OTEL_EXPORTER_OTLP_PROTOCOL"
|
||||
otelExporterOTLPTracesProtocolEnvVar = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"
|
||||
otelExporterOTLPMetricsProtocolEnvVar = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"
|
||||
otelStdoutTracesEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_TRACES_ENABLED"
|
||||
otelStdoutMetricsEnabledEnvVar = "RTMANAGER_OTEL_STDOUT_METRICS_ENABLED"
|
||||
|
||||
defaultShutdownTimeout = 30 * time.Second
|
||||
defaultLogLevel = "info"
|
||||
defaultInternalHTTPAddr = ":8096"
|
||||
defaultReadHeaderTimeout = 2 * time.Second
|
||||
defaultReadTimeout = 5 * time.Second
|
||||
defaultWriteTimeout = 15 * time.Second
|
||||
defaultIdleTimeout = 60 * time.Second
|
||||
|
||||
defaultDockerHost = "unix:///var/run/docker.sock"
|
||||
defaultDockerNetwork = "galaxy-net"
|
||||
defaultDockerLogDriver = "json-file"
|
||||
defaultImagePullPolicy = ImagePullPolicyIfMissing
|
||||
|
||||
defaultCPUQuota = 1.0
|
||||
defaultMemory = "512m"
|
||||
defaultPIDsLimit = 512
|
||||
defaultContainerStopTimeout = 30 * time.Second
|
||||
defaultContainerRetention = 30 * 24 * time.Hour
|
||||
defaultEngineStateMountPath = "/var/lib/galaxy-game"
|
||||
defaultEngineStateEnvName = "GAME_STATE_PATH"
|
||||
defaultGameStateDirMode = 0o750
|
||||
|
||||
defaultStartJobsStream = "runtime:start_jobs"
|
||||
defaultStopJobsStream = "runtime:stop_jobs"
|
||||
defaultJobResultsStream = "runtime:job_results"
|
||||
defaultHealthEventsStream = "runtime:health_events"
|
||||
defaultNotificationIntentsKey = "notification:intents"
|
||||
defaultStreamBlockTimeout = 5 * time.Second
|
||||
|
||||
defaultInspectInterval = 30 * time.Second
|
||||
defaultProbeInterval = 15 * time.Second
|
||||
defaultProbeTimeout = 2 * time.Second
|
||||
defaultProbeFailuresThreshold = 3
|
||||
|
||||
defaultReconcileInterval = 5 * time.Minute
|
||||
defaultCleanupInterval = time.Hour
|
||||
|
||||
defaultGameLeaseTTL = 60 * time.Second
|
||||
|
||||
defaultLobbyInternalTimeout = 2 * time.Second
|
||||
|
||||
defaultOTelServiceName = "galaxy-rtmanager"
|
||||
)
|
||||
|
||||
// ImagePullPolicy enumerates the supported image pull policies. The start
|
||||
// service validates a producer-supplied `image_ref` against this policy at
|
||||
// start time.
|
||||
type ImagePullPolicy string
|
||||
|
||||
// Supported pull policies, frozen by `rtmanager/README.md` §Configuration.
|
||||
const (
|
||||
ImagePullPolicyIfMissing ImagePullPolicy = "if_missing"
|
||||
ImagePullPolicyAlways ImagePullPolicy = "always"
|
||||
ImagePullPolicyNever ImagePullPolicy = "never"
|
||||
)
|
||||
|
||||
// Validate reports whether p is one of the frozen pull policies.
|
||||
func (p ImagePullPolicy) Validate() error {
|
||||
switch p {
|
||||
case ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("image pull policy %q must be one of %q, %q, %q",
|
||||
p, ImagePullPolicyIfMissing, ImagePullPolicyAlways, ImagePullPolicyNever)
|
||||
}
|
||||
}
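// A minimal sketch of how a start path might act on the configured
// policy; pullImage and imagePresent are assumed helpers, not part of
// this package:
//
//	switch policy {
//	case ImagePullPolicyAlways:
//		err = pullImage(ctx, imageRef)
//	case ImagePullPolicyIfMissing:
//		if !imagePresent(ctx, imageRef) {
//			err = pullImage(ctx, imageRef)
//		}
//	case ImagePullPolicyNever:
//		if !imagePresent(ctx, imageRef) {
//			err = fmt.Errorf("image %q not present locally and pull policy is %q", imageRef, ImagePullPolicyNever)
//		}
//	}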
|
||||
|
||||
// Config stores the full Runtime Manager process configuration.
|
||||
type Config struct {
|
||||
// ShutdownTimeout bounds graceful shutdown of every long-lived
|
||||
// component.
|
||||
ShutdownTimeout time.Duration
|
||||
|
||||
// Logging configures the process-wide structured logger.
|
||||
Logging LoggingConfig
|
||||
|
||||
// InternalHTTP configures the trusted internal HTTP listener that
|
||||
// serves probes and the GM/Admin REST surface.
|
||||
InternalHTTP InternalHTTPConfig
|
||||
|
||||
// Docker configures the Docker SDK client RTM uses to drive the local
|
||||
// Docker daemon.
|
||||
Docker DockerConfig
|
||||
|
||||
// Postgres configures the PostgreSQL-backed durable store consumed via
|
||||
// `pkg/postgres`.
|
||||
Postgres PostgresConfig
|
||||
|
||||
// Redis configures the shared Redis connection topology consumed via
|
||||
// `pkg/redisconn`.
|
||||
Redis RedisConfig
|
||||
|
||||
// Streams stores the stable Redis Stream names RTM reads from and
|
||||
// writes to.
|
||||
Streams StreamsConfig
|
||||
|
||||
// Container stores the per-container defaults applied at start time
|
||||
// when the resolved image does not declare its own labels.
|
||||
Container ContainerConfig
|
||||
|
||||
// Health configures the periodic health-monitoring workers (events
|
||||
// listener, inspect, active probe).
|
||||
Health HealthConfig
|
||||
|
||||
// Cleanup configures the reconciler and container-cleanup workers.
|
||||
Cleanup CleanupConfig
|
||||
|
||||
// Coordination configures the per-game Redis lease used to serialise
|
||||
// operations across all entry points.
|
||||
Coordination CoordinationConfig
|
||||
|
||||
// Lobby configures the synchronous Lobby internal REST client used by
|
||||
// the start service for ancillary lookups.
|
||||
Lobby LobbyConfig
|
||||
|
||||
// Telemetry configures the process-wide OpenTelemetry runtime.
|
||||
Telemetry TelemetryConfig
|
||||
}
|
||||
|
||||
// LoggingConfig configures the process-wide structured logger.
|
||||
type LoggingConfig struct {
|
||||
// Level stores the process log level accepted by log/slog.
|
||||
Level string
|
||||
}
|
||||
|
||||
// InternalHTTPConfig configures the trusted internal HTTP listener.
|
||||
type InternalHTTPConfig struct {
|
||||
// Addr stores the TCP listen address.
|
||||
Addr string
|
||||
|
||||
// ReadHeaderTimeout bounds request-header reading.
|
||||
ReadHeaderTimeout time.Duration
|
||||
|
||||
// ReadTimeout bounds reading one request.
|
||||
ReadTimeout time.Duration
|
||||
|
||||
// WriteTimeout bounds writing one response.
|
||||
WriteTimeout time.Duration
|
||||
|
||||
// IdleTimeout bounds how long keep-alive connections stay open.
|
||||
IdleTimeout time.Duration
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores a usable internal HTTP listener
|
||||
// configuration.
|
||||
func (cfg InternalHTTPConfig) Validate() error {
|
||||
switch {
|
||||
case strings.TrimSpace(cfg.Addr) == "":
|
||||
return fmt.Errorf("internal HTTP addr must not be empty")
|
||||
case !isTCPAddr(cfg.Addr):
|
||||
return fmt.Errorf("internal HTTP addr %q must use host:port form", cfg.Addr)
|
||||
case cfg.ReadHeaderTimeout <= 0:
|
||||
return fmt.Errorf("internal HTTP read header timeout must be positive")
|
||||
case cfg.ReadTimeout <= 0:
|
||||
return fmt.Errorf("internal HTTP read timeout must be positive")
|
||||
case cfg.WriteTimeout <= 0:
|
||||
return fmt.Errorf("internal HTTP write timeout must be positive")
|
||||
case cfg.IdleTimeout <= 0:
|
||||
return fmt.Errorf("internal HTTP idle timeout must be positive")
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
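// isTCPAddr is defined elsewhere in this package; the check the
// validator relies on presumably reduces to a host:port split, roughly:
//
//	func isTCPAddr(addr string) bool {
//		_, _, err := net.SplitHostPort(strings.TrimSpace(addr))
//		return err == nil
//	}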
|
||||
|
||||
// DockerConfig configures the Docker SDK client.
|
||||
type DockerConfig struct {
|
||||
// Host stores the Docker daemon endpoint (e.g.
|
||||
// `unix:///var/run/docker.sock`).
|
||||
Host string
|
||||
|
||||
// APIVersion overrides the Docker API version. Empty lets the SDK
|
||||
// negotiate.
|
||||
APIVersion string
|
||||
|
||||
// Network stores the user-defined Docker bridge network containers
|
||||
// attach to. The network is provisioned outside RTM; a missing network
|
||||
// is a fail-fast condition at startup.
|
||||
Network string
|
||||
|
||||
// LogDriver stores the Docker logging driver applied to engine
|
||||
// containers.
|
||||
LogDriver string
|
||||
|
||||
// LogOpts stores the comma-separated `key=value` driver options.
|
||||
LogOpts string
|
||||
|
||||
// PullPolicy stores the configured image pull policy.
|
||||
PullPolicy ImagePullPolicy
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores a usable Docker configuration.
|
||||
func (cfg DockerConfig) Validate() error {
|
||||
switch {
|
||||
case strings.TrimSpace(cfg.Host) == "":
|
||||
return fmt.Errorf("docker host must not be empty")
|
||||
case strings.TrimSpace(cfg.Network) == "":
|
||||
return fmt.Errorf("docker network must not be empty")
|
||||
case strings.TrimSpace(cfg.LogDriver) == "":
|
||||
return fmt.Errorf("docker log driver must not be empty")
|
||||
}
|
||||
return cfg.PullPolicy.Validate()
|
||||
}
|
||||
|
||||
// PostgresConfig configures the PostgreSQL-backed durable store consumed
|
||||
// via `pkg/postgres`.
|
||||
type PostgresConfig struct {
|
||||
// Conn carries the primary plus replica DSN topology and pool tuning.
|
||||
Conn postgres.Config
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores a usable PostgreSQL configuration.
|
||||
func (cfg PostgresConfig) Validate() error {
|
||||
return cfg.Conn.Validate()
|
||||
}
|
||||
|
||||
// RedisConfig configures the Runtime Manager Redis connection topology.
|
||||
type RedisConfig struct {
|
||||
// Conn carries the connection topology (master, replicas, password,
|
||||
// db, per-call timeout).
|
||||
Conn redisconn.Config
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores a usable Redis configuration.
|
||||
func (cfg RedisConfig) Validate() error {
|
||||
return cfg.Conn.Validate()
|
||||
}
|
||||
|
||||
// StreamsConfig stores the stable Redis Stream names used by Runtime
|
||||
// Manager.
|
||||
type StreamsConfig struct {
|
||||
// StartJobs stores the Redis Streams key Lobby writes start jobs to.
|
||||
StartJobs string
|
||||
|
||||
// StopJobs stores the Redis Streams key Lobby writes stop jobs to.
|
||||
StopJobs string
|
||||
|
||||
// JobResults stores the Redis Streams key RTM writes job outcomes
|
||||
// to.
|
||||
JobResults string
|
||||
|
||||
// HealthEvents stores the Redis Streams key RTM publishes
|
||||
// technical health events to.
|
||||
HealthEvents string
|
||||
|
||||
// NotificationIntents stores the Redis Streams key RTM publishes
|
||||
// admin-only notification intents to.
|
||||
NotificationIntents string
|
||||
|
||||
// BlockTimeout bounds the maximum blocking read window for stream
|
||||
// consumers.
|
||||
BlockTimeout time.Duration
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores usable stream names.
|
||||
func (cfg StreamsConfig) Validate() error {
|
||||
switch {
|
||||
case strings.TrimSpace(cfg.StartJobs) == "":
|
||||
return fmt.Errorf("redis start jobs stream must not be empty")
|
||||
case strings.TrimSpace(cfg.StopJobs) == "":
|
||||
return fmt.Errorf("redis stop jobs stream must not be empty")
|
||||
case strings.TrimSpace(cfg.JobResults) == "":
|
||||
return fmt.Errorf("redis job results stream must not be empty")
|
||||
case strings.TrimSpace(cfg.HealthEvents) == "":
|
||||
return fmt.Errorf("redis health events stream must not be empty")
|
||||
case strings.TrimSpace(cfg.NotificationIntents) == "":
|
||||
return fmt.Errorf("redis notification intents stream must not be empty")
|
||||
case cfg.BlockTimeout <= 0:
|
||||
return fmt.Errorf("redis stream block timeout must be positive")
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// ContainerConfig stores the per-container defaults applied at start
|
||||
// time. Resource defaults apply when the resolved engine image does not
|
||||
// expose `com.galaxy.cpu_quota` / `com.galaxy.memory` /
|
||||
// `com.galaxy.pids_limit` labels.
|
||||
type ContainerConfig struct {
|
||||
// DefaultCPUQuota is the fallback `--cpus` value applied when the
|
||||
// image does not declare `com.galaxy.cpu_quota`.
|
||||
DefaultCPUQuota float64
|
||||
|
||||
// DefaultMemory is the fallback `--memory` value applied when the
|
||||
// image does not declare `com.galaxy.memory`.
|
||||
DefaultMemory string
|
||||
|
||||
// DefaultPIDsLimit is the fallback `--pids-limit` value applied
|
||||
// when the image does not declare `com.galaxy.pids_limit`.
|
||||
DefaultPIDsLimit int
|
||||
|
||||
// StopTimeout bounds graceful container stop before Docker fires
|
||||
// SIGKILL.
|
||||
StopTimeout time.Duration
|
||||
|
||||
// Retention stores the TTL after which `status=stopped` containers
|
||||
// are removed by the cleanup worker.
|
||||
Retention time.Duration
|
||||
|
||||
// EngineStateMountPath is the in-container path the per-game state
|
||||
// directory is bind-mounted to.
|
||||
EngineStateMountPath string
|
||||
|
||||
// EngineStateEnvName is the env-var name forwarded to the engine
|
||||
// pointing at EngineStateMountPath.
|
||||
EngineStateEnvName string
|
||||
|
||||
// GameStateDirMode stores the unix permissions applied to the
|
||||
// per-game state directory on creation.
|
||||
GameStateDirMode uint32
|
||||
|
||||
// GameStateOwnerUID stores the unix uid applied to the per-game
|
||||
// state directory on creation.
|
||||
GameStateOwnerUID int
|
||||
|
||||
// GameStateOwnerGID stores the unix gid applied to the per-game
|
||||
// state directory on creation.
|
||||
GameStateOwnerGID int
|
||||
|
||||
// GameStateRoot is the host path under which per-game state
|
||||
// directories are created.
|
||||
GameStateRoot string
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores usable container defaults.
|
||||
func (cfg ContainerConfig) Validate() error {
|
||||
switch {
|
||||
case cfg.DefaultCPUQuota <= 0:
|
||||
return fmt.Errorf("default cpu quota must be positive")
|
||||
case strings.TrimSpace(cfg.DefaultMemory) == "":
|
||||
return fmt.Errorf("default memory must not be empty")
|
||||
case cfg.DefaultPIDsLimit <= 0:
|
||||
return fmt.Errorf("default pids limit must be positive")
|
||||
case cfg.StopTimeout <= 0:
|
||||
return fmt.Errorf("container stop timeout must be positive")
|
||||
case cfg.Retention <= 0:
|
||||
return fmt.Errorf("container retention must be positive")
|
||||
case strings.TrimSpace(cfg.EngineStateMountPath) == "":
|
||||
return fmt.Errorf("engine state mount path must not be empty")
|
||||
case strings.TrimSpace(cfg.EngineStateEnvName) == "":
|
||||
return fmt.Errorf("engine state env name must not be empty")
|
||||
case cfg.GameStateDirMode == 0:
|
||||
return fmt.Errorf("game state dir mode must be non-zero")
|
||||
case strings.TrimSpace(cfg.GameStateRoot) == "":
|
||||
return fmt.Errorf("game state root must not be empty")
|
||||
case !strings.HasPrefix(strings.TrimSpace(cfg.GameStateRoot), "/"):
|
||||
return fmt.Errorf("game state root %q must be an absolute path", cfg.GameStateRoot)
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
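// Sketch of the label-fallback rule described above: a resource label
// on the resolved image wins, otherwise the configured default applies.
// The labels map and the parsing shown here are assumptions for
// illustration:
//
//	func resolveCPUQuota(labels map[string]string, cfg ContainerConfig) float64 {
//		if raw, ok := labels["com.galaxy.cpu_quota"]; ok {
//			if quota, err := strconv.ParseFloat(raw, 64); err == nil && quota > 0 {
//				return quota
//			}
//		}
//		return cfg.DefaultCPUQuota
//	}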
|
||||
|
||||
// HealthConfig configures the periodic health-monitoring workers
|
||||
// (Docker events listener, periodic inspect, active probe).
|
||||
type HealthConfig struct {
|
||||
// InspectInterval is the period between two periodic Docker inspect
|
||||
// passes.
|
||||
InspectInterval time.Duration
|
||||
|
||||
// ProbeInterval is the period between two engine `/healthz` probe
|
||||
// rounds.
|
||||
ProbeInterval time.Duration
|
||||
|
||||
// ProbeTimeout bounds one engine `/healthz` request.
|
||||
ProbeTimeout time.Duration
|
||||
|
||||
// ProbeFailuresThreshold is the consecutive-failure count that
|
||||
// triggers a `probe_failed` event.
|
||||
ProbeFailuresThreshold int
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores usable health-monitoring settings.
|
||||
func (cfg HealthConfig) Validate() error {
|
||||
switch {
|
||||
case cfg.InspectInterval <= 0:
|
||||
return fmt.Errorf("inspect interval must be positive")
|
||||
case cfg.ProbeInterval <= 0:
|
||||
return fmt.Errorf("probe interval must be positive")
|
||||
case cfg.ProbeTimeout <= 0:
|
||||
return fmt.Errorf("probe timeout must be positive")
|
||||
case cfg.ProbeFailuresThreshold <= 0:
|
||||
return fmt.Errorf("probe failures threshold must be positive")
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// CleanupConfig configures the reconciler and container-cleanup workers.
|
||||
type CleanupConfig struct {
|
||||
// ReconcileInterval is the period between two reconciler passes.
|
||||
ReconcileInterval time.Duration
|
||||
|
||||
// CleanupInterval is the period between two container-cleanup
|
||||
// passes.
|
||||
CleanupInterval time.Duration
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores usable cleanup settings.
|
||||
func (cfg CleanupConfig) Validate() error {
|
||||
switch {
|
||||
case cfg.ReconcileInterval <= 0:
|
||||
return fmt.Errorf("reconcile interval must be positive")
|
||||
case cfg.CleanupInterval <= 0:
|
||||
return fmt.Errorf("cleanup interval must be positive")
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// CoordinationConfig configures the per-game Redis lease.
|
||||
type CoordinationConfig struct {
|
||||
// GameLeaseTTL bounds the per-game lease lifetime renewed every
|
||||
// half-TTL while an operation runs.
|
||||
GameLeaseTTL time.Duration
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores a usable lease configuration.
|
||||
func (cfg CoordinationConfig) Validate() error {
|
||||
if cfg.GameLeaseTTL <= 0 {
|
||||
return fmt.Errorf("game lease ttl must be positive")
|
||||
}
|
||||
return nil
|
||||
}
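// Sketch of the half-TTL renewal the GameLeaseTTL comment refers to;
// the renewLease helper and the loop placement are assumptions, the
// real coordination logic lives in the service layer:
//
//	ticker := time.NewTicker(ttl / 2)
//	defer ticker.Stop()
//	for {
//		select {
//		case <-ctx.Done():
//			return ctx.Err()
//		case <-ticker.C:
//			if err := renewLease(ctx, gameID, ttl); err != nil {
//				return err
//			}
//		}
//	}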
|
||||
|
||||
// LobbyConfig configures the synchronous Lobby internal REST client.
|
||||
type LobbyConfig struct {
|
||||
// BaseURL stores the trusted Lobby internal listener base URL.
|
||||
BaseURL string
|
||||
|
||||
// Timeout bounds one Lobby internal request.
|
||||
Timeout time.Duration
|
||||
}
|
||||
|
||||
// Validate reports whether cfg stores a usable Lobby client
|
||||
// configuration.
|
||||
func (cfg LobbyConfig) Validate() error {
|
||||
switch {
|
||||
case strings.TrimSpace(cfg.BaseURL) == "":
|
||||
return fmt.Errorf("lobby internal base url must not be empty")
|
||||
case !isHTTPURL(cfg.BaseURL):
|
||||
return fmt.Errorf("lobby internal base url %q must be an absolute http(s) URL", cfg.BaseURL)
|
||||
case cfg.Timeout <= 0:
|
||||
return fmt.Errorf("lobby internal timeout must be positive")
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
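// isHTTPURL is defined elsewhere in this package; a plausible shape for
// the check (an assumption, not the committed helper):
//
//	func isHTTPURL(raw string) bool {
//		parsed, err := url.Parse(strings.TrimSpace(raw))
//		if err != nil {
//			return false
//		}
//		return (parsed.Scheme == "http" || parsed.Scheme == "https") && parsed.Host != ""
//	}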
|
||||
|
||||
// TelemetryConfig configures the Runtime Manager OpenTelemetry runtime.
|
||||
type TelemetryConfig struct {
|
||||
// ServiceName overrides the default OpenTelemetry service name.
|
||||
ServiceName string
|
||||
|
||||
// TracesExporter selects the external traces exporter. Supported
|
||||
// values are `none` and `otlp`.
|
||||
TracesExporter string
|
||||
|
||||
// MetricsExporter selects the external metrics exporter. Supported
|
||||
// values are `none` and `otlp`.
|
||||
MetricsExporter string
|
||||
|
||||
// TracesProtocol selects the OTLP traces protocol when
|
||||
// TracesExporter is `otlp`.
|
||||
TracesProtocol string
|
||||
|
||||
// MetricsProtocol selects the OTLP metrics protocol when
|
||||
// MetricsExporter is `otlp`.
|
||||
MetricsProtocol string
|
||||
|
||||
// StdoutTracesEnabled enables the additional stdout trace exporter
|
||||
// used for local development and debugging.
|
||||
StdoutTracesEnabled bool
|
||||
|
||||
// StdoutMetricsEnabled enables the additional stdout metric
|
||||
// exporter used for local development and debugging.
|
||||
StdoutMetricsEnabled bool
|
||||
}
|
||||
|
||||
// Validate reports whether cfg contains a supported OpenTelemetry
|
||||
// configuration.
|
||||
func (cfg TelemetryConfig) Validate() error {
|
||||
return telemetry.ProcessConfig{
|
||||
ServiceName: cfg.ServiceName,
|
||||
TracesExporter: cfg.TracesExporter,
|
||||
MetricsExporter: cfg.MetricsExporter,
|
||||
TracesProtocol: cfg.TracesProtocol,
|
||||
MetricsProtocol: cfg.MetricsProtocol,
|
||||
StdoutTracesEnabled: cfg.StdoutTracesEnabled,
|
||||
StdoutMetricsEnabled: cfg.StdoutMetricsEnabled,
|
||||
}.Validate()
|
||||
}
|
||||
|
||||
// DefaultConfig returns the default Runtime Manager process configuration.
|
||||
func DefaultConfig() Config {
|
||||
return Config{
|
||||
ShutdownTimeout: defaultShutdownTimeout,
|
||||
Logging: LoggingConfig{
|
||||
Level: defaultLogLevel,
|
||||
},
|
||||
InternalHTTP: InternalHTTPConfig{
|
||||
Addr: defaultInternalHTTPAddr,
|
||||
ReadHeaderTimeout: defaultReadHeaderTimeout,
|
||||
ReadTimeout: defaultReadTimeout,
|
||||
WriteTimeout: defaultWriteTimeout,
|
||||
IdleTimeout: defaultIdleTimeout,
|
||||
},
|
||||
Docker: DockerConfig{
|
||||
Host: defaultDockerHost,
|
||||
Network: defaultDockerNetwork,
|
||||
LogDriver: defaultDockerLogDriver,
|
||||
PullPolicy: defaultImagePullPolicy,
|
||||
},
|
||||
Postgres: PostgresConfig{
|
||||
Conn: postgres.DefaultConfig(),
|
||||
},
|
||||
Redis: RedisConfig{
|
||||
Conn: redisconn.DefaultConfig(),
|
||||
},
|
||||
Streams: StreamsConfig{
|
||||
StartJobs: defaultStartJobsStream,
|
||||
StopJobs: defaultStopJobsStream,
|
||||
JobResults: defaultJobResultsStream,
|
||||
HealthEvents: defaultHealthEventsStream,
|
||||
NotificationIntents: defaultNotificationIntentsKey,
|
||||
BlockTimeout: defaultStreamBlockTimeout,
|
||||
},
|
||||
Container: ContainerConfig{
|
||||
DefaultCPUQuota: defaultCPUQuota,
|
||||
DefaultMemory: defaultMemory,
|
||||
DefaultPIDsLimit: defaultPIDsLimit,
|
||||
StopTimeout: defaultContainerStopTimeout,
|
||||
Retention: defaultContainerRetention,
|
||||
EngineStateMountPath: defaultEngineStateMountPath,
|
||||
EngineStateEnvName: defaultEngineStateEnvName,
|
||||
GameStateDirMode: defaultGameStateDirMode,
|
||||
},
|
||||
Health: HealthConfig{
|
||||
InspectInterval: defaultInspectInterval,
|
||||
ProbeInterval: defaultProbeInterval,
|
||||
ProbeTimeout: defaultProbeTimeout,
|
||||
ProbeFailuresThreshold: defaultProbeFailuresThreshold,
|
||||
},
|
||||
Cleanup: CleanupConfig{
|
||||
ReconcileInterval: defaultReconcileInterval,
|
||||
CleanupInterval: defaultCleanupInterval,
|
||||
},
|
||||
Coordination: CoordinationConfig{
|
||||
GameLeaseTTL: defaultGameLeaseTTL,
|
||||
},
|
||||
Lobby: LobbyConfig{
|
||||
Timeout: defaultLobbyInternalTimeout,
|
||||
},
|
||||
Telemetry: TelemetryConfig{
|
||||
ServiceName: defaultOTelServiceName,
|
||||
TracesExporter: "none",
|
||||
MetricsExporter: "none",
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func validEnv(t *testing.T) {
|
||||
t.Helper()
|
||||
|
||||
t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy?search_path=rtmanager&sslmode=disable")
|
||||
t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379")
|
||||
t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret")
|
||||
t.Setenv("RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games")
|
||||
t.Setenv("RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095")
|
||||
}
|
||||
|
||||
func TestLoadFromEnvAcceptsDefaults(t *testing.T) {
|
||||
validEnv(t)
|
||||
|
||||
cfg, err := LoadFromEnv()
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, ":8096", cfg.InternalHTTP.Addr)
|
||||
require.Equal(t, "unix:///var/run/docker.sock", cfg.Docker.Host)
|
||||
require.Equal(t, "galaxy-net", cfg.Docker.Network)
|
||||
require.Equal(t, "json-file", cfg.Docker.LogDriver)
|
||||
require.Equal(t, ImagePullPolicyIfMissing, cfg.Docker.PullPolicy)
|
||||
require.Equal(t, "runtime:start_jobs", cfg.Streams.StartJobs)
|
||||
require.Equal(t, "runtime:stop_jobs", cfg.Streams.StopJobs)
|
||||
require.Equal(t, "runtime:job_results", cfg.Streams.JobResults)
|
||||
require.Equal(t, "runtime:health_events", cfg.Streams.HealthEvents)
|
||||
require.Equal(t, "notification:intents", cfg.Streams.NotificationIntents)
|
||||
require.Equal(t, 30*time.Second, cfg.Container.StopTimeout)
|
||||
require.Equal(t, 30*24*time.Hour, cfg.Container.Retention)
|
||||
require.Equal(t, "/var/lib/galaxy-game", cfg.Container.EngineStateMountPath)
|
||||
require.Equal(t, "GAME_STATE_PATH", cfg.Container.EngineStateEnvName)
|
||||
require.EqualValues(t, 0o750, cfg.Container.GameStateDirMode)
|
||||
require.Equal(t, 60*time.Second, cfg.Coordination.GameLeaseTTL)
|
||||
require.Equal(t, "http://lobby:8095", cfg.Lobby.BaseURL)
|
||||
require.Equal(t, 2*time.Second, cfg.Lobby.Timeout)
|
||||
require.Equal(t, "galaxy-rtmanager", cfg.Telemetry.ServiceName)
|
||||
}
|
||||
|
||||
func TestLoadFromEnvHonoursOverrides(t *testing.T) {
|
||||
validEnv(t)
|
||||
t.Setenv("RTMANAGER_INTERNAL_HTTP_ADDR", ":9000")
|
||||
t.Setenv("RTMANAGER_DOCKER_NETWORK", "custom-net")
|
||||
t.Setenv("RTMANAGER_IMAGE_PULL_POLICY", "always")
|
||||
t.Setenv("RTMANAGER_REDIS_START_JOBS_STREAM", "custom:start_jobs")
|
||||
t.Setenv("RTMANAGER_GAME_LEASE_TTL_SECONDS", "120")
|
||||
t.Setenv("RTMANAGER_CONTAINER_STOP_TIMEOUT_SECONDS", "45")
|
||||
t.Setenv("RTMANAGER_CONTAINER_RETENTION_DAYS", "7")
|
||||
t.Setenv("RTMANAGER_GAME_STATE_DIR_MODE", "0700")
|
||||
|
||||
cfg, err := LoadFromEnv()
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, ":9000", cfg.InternalHTTP.Addr)
|
||||
require.Equal(t, "custom-net", cfg.Docker.Network)
|
||||
require.Equal(t, ImagePullPolicyAlways, cfg.Docker.PullPolicy)
|
||||
require.Equal(t, "custom:start_jobs", cfg.Streams.StartJobs)
|
||||
require.Equal(t, 120*time.Second, cfg.Coordination.GameLeaseTTL)
|
||||
require.Equal(t, 45*time.Second, cfg.Container.StopTimeout)
|
||||
require.Equal(t, 7*24*time.Hour, cfg.Container.Retention)
|
||||
require.EqualValues(t, 0o700, cfg.Container.GameStateDirMode)
|
||||
}
|
||||
|
||||
func TestLoadFromEnvRejectsUnknownPullPolicy(t *testing.T) {
|
||||
validEnv(t)
|
||||
t.Setenv("RTMANAGER_IMAGE_PULL_POLICY", "weekly")
|
||||
|
||||
_, err := LoadFromEnv()
|
||||
require.Error(t, err)
|
||||
require.Contains(t, err.Error(), "image pull policy")
|
||||
}
|
||||
|
||||
func TestLoadFromEnvRequiresGameStateRoot(t *testing.T) {
|
||||
t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy")
|
||||
t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379")
|
||||
t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret")
|
||||
t.Setenv("RTMANAGER_LOBBY_INTERNAL_BASE_URL", "http://lobby:8095")
|
||||
|
||||
_, err := LoadFromEnv()
|
||||
require.Error(t, err)
|
||||
require.Contains(t, err.Error(), "RTMANAGER_GAME_STATE_ROOT")
|
||||
}
|
||||
|
||||
func TestLoadFromEnvRequiresLobbyBaseURL(t *testing.T) {
|
||||
t.Setenv("RTMANAGER_POSTGRES_PRIMARY_DSN", "postgres://rtm:secret@localhost:5432/galaxy")
|
||||
t.Setenv("RTMANAGER_REDIS_MASTER_ADDR", "localhost:6379")
|
||||
t.Setenv("RTMANAGER_REDIS_PASSWORD", "secret")
|
||||
t.Setenv("RTMANAGER_GAME_STATE_ROOT", "/var/lib/galaxy/games")
|
||||
|
||||
_, err := LoadFromEnv()
|
||||
require.Error(t, err)
|
||||
require.Contains(t, err.Error(), "RTMANAGER_LOBBY_INTERNAL_BASE_URL")
|
||||
}
|
||||
|
||||
func TestLoadFromEnvRejectsRelativeStateRoot(t *testing.T) {
|
||||
validEnv(t)
|
||||
t.Setenv("RTMANAGER_GAME_STATE_ROOT", "relative/path")
|
||||
|
||||
_, err := LoadFromEnv()
|
||||
require.Error(t, err)
|
||||
require.Contains(t, err.Error(), "absolute path")
|
||||
}
|
||||
|
||||
func TestLoadFromEnvRejectsBadLogLevel(t *testing.T) {
|
||||
validEnv(t)
|
||||
t.Setenv("RTMANAGER_LOG_LEVEL", "verbose")
|
||||
|
||||
_, err := LoadFromEnv()
|
||||
require.Error(t, err)
|
||||
require.Contains(t, err.Error(), "RTMANAGER_LOG_LEVEL")
|
||||
}
|
||||
|
||||
func TestImagePullPolicyValidate(t *testing.T) {
|
||||
require.NoError(t, ImagePullPolicyIfMissing.Validate())
|
||||
require.NoError(t, ImagePullPolicyAlways.Validate())
|
||||
require.NoError(t, ImagePullPolicyNever.Validate())
|
||||
require.Error(t, ImagePullPolicy("monthly").Validate())
|
||||
}
|
||||
|
||||
func TestInternalHTTPValidateRejectsBadAddr(t *testing.T) {
|
||||
cfg := DefaultConfig().InternalHTTP
|
||||
cfg.Addr = "not-an-addr"
|
||||
err := cfg.Validate()
|
||||
require.Error(t, err)
|
||||
require.Contains(t, err.Error(), "host:port")
|
||||
}
|
||||
|
||||
func TestStreamsValidateRequiresAllNames(t *testing.T) {
|
||||
cfg := DefaultConfig().Streams
|
||||
cfg.StartJobs = " "
|
||||
err := cfg.Validate()
|
||||
require.Error(t, err)
|
||||
require.True(t, strings.Contains(err.Error(), "start jobs"))
|
||||
}
|
||||
@@ -0,0 +1,319 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/postgres"
|
||||
"galaxy/redisconn"
|
||||
)
|
||||
|
||||
// LoadFromEnv builds Config from environment variables and validates the
|
||||
// resulting configuration.
|
||||
func LoadFromEnv() (Config, error) {
|
||||
cfg := DefaultConfig()
|
||||
|
||||
var err error
|
||||
|
||||
cfg.ShutdownTimeout, err = durationEnv(shutdownTimeoutEnvVar, cfg.ShutdownTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
cfg.Logging.Level = stringEnv(logLevelEnvVar, cfg.Logging.Level)
|
||||
|
||||
cfg.InternalHTTP.Addr = stringEnv(internalHTTPAddrEnvVar, cfg.InternalHTTP.Addr)
|
||||
cfg.InternalHTTP.ReadHeaderTimeout, err = durationEnv(internalHTTPReadHeaderTimeoutEnvVar, cfg.InternalHTTP.ReadHeaderTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.InternalHTTP.ReadTimeout, err = durationEnv(internalHTTPReadTimeoutEnvVar, cfg.InternalHTTP.ReadTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.InternalHTTP.WriteTimeout, err = durationEnv(internalHTTPWriteTimeoutEnvVar, cfg.InternalHTTP.WriteTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.InternalHTTP.IdleTimeout, err = durationEnv(internalHTTPIdleTimeoutEnvVar, cfg.InternalHTTP.IdleTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
cfg.Docker.Host = stringEnv(dockerHostEnvVar, cfg.Docker.Host)
|
||||
cfg.Docker.APIVersion = stringEnv(dockerAPIVersionEnvVar, cfg.Docker.APIVersion)
|
||||
cfg.Docker.Network = stringEnv(dockerNetworkEnvVar, cfg.Docker.Network)
|
||||
cfg.Docker.LogDriver = stringEnv(dockerLogDriverEnvVar, cfg.Docker.LogDriver)
|
||||
cfg.Docker.LogOpts = stringEnv(dockerLogOptsEnvVar, cfg.Docker.LogOpts)
|
||||
if raw, ok := os.LookupEnv(imagePullPolicyEnvVar); ok {
|
||||
cfg.Docker.PullPolicy = ImagePullPolicy(strings.TrimSpace(raw))
|
||||
}
|
||||
|
||||
pgConn, err := postgres.LoadFromEnv(envPrefix)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Postgres.Conn = pgConn
|
||||
|
||||
redisConn, err := redisconn.LoadFromEnv(envPrefix)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Redis.Conn = redisConn
|
||||
|
||||
cfg.Streams.StartJobs = stringEnv(startJobsStreamEnvVar, cfg.Streams.StartJobs)
|
||||
cfg.Streams.StopJobs = stringEnv(stopJobsStreamEnvVar, cfg.Streams.StopJobs)
|
||||
cfg.Streams.JobResults = stringEnv(jobResultsStreamEnvVar, cfg.Streams.JobResults)
|
||||
cfg.Streams.HealthEvents = stringEnv(healthEventsStreamEnvVar, cfg.Streams.HealthEvents)
|
||||
cfg.Streams.NotificationIntents = stringEnv(notificationIntentsStreamEnv, cfg.Streams.NotificationIntents)
|
||||
cfg.Streams.BlockTimeout, err = durationEnv(streamBlockTimeoutEnvVar, cfg.Streams.BlockTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
cfg.Container.DefaultCPUQuota, err = floatEnv(defaultCPUQuotaEnvVar, cfg.Container.DefaultCPUQuota)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Container.DefaultMemory = stringEnv(defaultMemoryEnvVar, cfg.Container.DefaultMemory)
|
||||
cfg.Container.DefaultPIDsLimit, err = intEnv(defaultPIDsLimitEnvVar, cfg.Container.DefaultPIDsLimit)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Container.StopTimeout, err = secondsEnv(containerStopTimeoutSecondsEnvVar, cfg.Container.StopTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Container.Retention, err = daysEnv(containerRetentionDaysEnvVar, cfg.Container.Retention)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Container.EngineStateMountPath = stringEnv(engineStateMountPathEnvVar, cfg.Container.EngineStateMountPath)
|
||||
cfg.Container.EngineStateEnvName = stringEnv(engineStateEnvNameEnvVar, cfg.Container.EngineStateEnvName)
|
||||
cfg.Container.GameStateDirMode, err = octalUint32Env(gameStateDirModeEnvVar, cfg.Container.GameStateDirMode)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Container.GameStateOwnerUID, err = intEnv(gameStateOwnerUIDEnvVar, cfg.Container.GameStateOwnerUID)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Container.GameStateOwnerGID, err = intEnv(gameStateOwnerGIDEnvVar, cfg.Container.GameStateOwnerGID)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
root, ok := os.LookupEnv(gameStateRootEnvVar)
|
||||
if !ok || strings.TrimSpace(root) == "" {
|
||||
return Config{}, fmt.Errorf("%s must be set", gameStateRootEnvVar)
|
||||
}
|
||||
cfg.Container.GameStateRoot = strings.TrimSpace(root)
|
||||
|
||||
cfg.Health.InspectInterval, err = durationEnv(inspectIntervalEnvVar, cfg.Health.InspectInterval)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Health.ProbeInterval, err = durationEnv(probeIntervalEnvVar, cfg.Health.ProbeInterval)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Health.ProbeTimeout, err = durationEnv(probeTimeoutEnvVar, cfg.Health.ProbeTimeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Health.ProbeFailuresThreshold, err = intEnv(probeFailuresThresholdEnvVar, cfg.Health.ProbeFailuresThreshold)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
cfg.Cleanup.ReconcileInterval, err = durationEnv(reconcileIntervalEnvVar, cfg.Cleanup.ReconcileInterval)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Cleanup.CleanupInterval, err = durationEnv(cleanupIntervalEnvVar, cfg.Cleanup.CleanupInterval)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
cfg.Coordination.GameLeaseTTL, err = secondsEnv(gameLeaseTTLSecondsEnvVar, cfg.Coordination.GameLeaseTTL)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
lobbyURL, ok := os.LookupEnv(lobbyInternalBaseURLEnvVar)
|
||||
if !ok || strings.TrimSpace(lobbyURL) == "" {
|
||||
return Config{}, fmt.Errorf("%s must be set", lobbyInternalBaseURLEnvVar)
|
||||
}
|
||||
cfg.Lobby.BaseURL = strings.TrimSpace(lobbyURL)
|
||||
cfg.Lobby.Timeout, err = durationEnv(lobbyInternalTimeoutEnvVar, cfg.Lobby.Timeout)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
cfg.Telemetry.ServiceName = stringEnv(otelServiceNameEnvVar, cfg.Telemetry.ServiceName)
|
||||
cfg.Telemetry.TracesExporter = normalizeExporterValue(stringEnv(otelTracesExporterEnvVar, cfg.Telemetry.TracesExporter))
|
||||
cfg.Telemetry.MetricsExporter = normalizeExporterValue(stringEnv(otelMetricsExporterEnvVar, cfg.Telemetry.MetricsExporter))
|
||||
cfg.Telemetry.TracesProtocol = normalizeProtocolValue(
|
||||
os.Getenv(otelExporterOTLPTracesProtocolEnvVar),
|
||||
os.Getenv(otelExporterOTLPProtocolEnvVar),
|
||||
cfg.Telemetry.TracesProtocol,
|
||||
)
|
||||
cfg.Telemetry.MetricsProtocol = normalizeProtocolValue(
|
||||
os.Getenv(otelExporterOTLPMetricsProtocolEnvVar),
|
||||
os.Getenv(otelExporterOTLPProtocolEnvVar),
|
||||
cfg.Telemetry.MetricsProtocol,
|
||||
)
|
||||
cfg.Telemetry.StdoutTracesEnabled, err = boolEnv(otelStdoutTracesEnabledEnvVar, cfg.Telemetry.StdoutTracesEnabled)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.Telemetry.StdoutMetricsEnabled, err = boolEnv(otelStdoutMetricsEnabledEnvVar, cfg.Telemetry.StdoutMetricsEnabled)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func stringEnv(name string, fallback string) string {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback
|
||||
}
|
||||
|
||||
return strings.TrimSpace(value)
|
||||
}
|
||||
|
||||
func durationEnv(name string, fallback time.Duration) (time.Duration, error) {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
parsed, err := time.ParseDuration(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%s: parse duration: %w", name, err)
|
||||
}
|
||||
|
||||
return parsed, nil
|
||||
}
|
||||
|
||||
func secondsEnv(name string, fallback time.Duration) (time.Duration, error) {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
parsed, err := strconv.Atoi(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%s: parse seconds: %w", name, err)
|
||||
}
|
||||
if parsed <= 0 {
|
||||
return 0, fmt.Errorf("%s: must be positive", name)
|
||||
}
|
||||
|
||||
return time.Duration(parsed) * time.Second, nil
|
||||
}
|
||||
|
||||
func daysEnv(name string, fallback time.Duration) (time.Duration, error) {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
parsed, err := strconv.Atoi(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%s: parse days: %w", name, err)
|
||||
}
|
||||
if parsed <= 0 {
|
||||
return 0, fmt.Errorf("%s: must be positive", name)
|
||||
}
|
||||
|
||||
return time.Duration(parsed) * 24 * time.Hour, nil
|
||||
}
|
||||
|
||||
func intEnv(name string, fallback int) (int, error) {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
parsed, err := strconv.Atoi(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%s: parse int: %w", name, err)
|
||||
}
|
||||
|
||||
return parsed, nil
|
||||
}
|
||||
|
||||
func floatEnv(name string, fallback float64) (float64, error) {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
parsed, err := strconv.ParseFloat(strings.TrimSpace(value), 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%s: parse float: %w", name, err)
|
||||
}
|
||||
|
||||
return parsed, nil
|
||||
}
|
||||
|
||||
func boolEnv(name string, fallback bool) (bool, error) {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
parsed, err := strconv.ParseBool(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("%s: parse bool: %w", name, err)
|
||||
}
|
||||
|
||||
return parsed, nil
|
||||
}
|
||||
|
||||
func octalUint32Env(name string, fallback uint32) (uint32, error) {
|
||||
value, ok := os.LookupEnv(name)
|
||||
if !ok {
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
parsed, err := strconv.ParseUint(strings.TrimSpace(value), 8, 32)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("%s: parse octal: %w", name, err)
|
||||
}
|
||||
|
||||
return uint32(parsed), nil
|
||||
}
|
||||
|
||||
func normalizeExporterValue(value string) string {
|
||||
trimmed := strings.TrimSpace(value)
|
||||
switch trimmed {
|
||||
case "", "none":
|
||||
return "none"
|
||||
default:
|
||||
return trimmed
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeProtocolValue(primary string, fallback string, defaultValue string) string {
|
||||
primary = strings.TrimSpace(primary)
|
||||
if primary != "" {
|
||||
return primary
|
||||
}
|
||||
|
||||
fallback = strings.TrimSpace(fallback)
|
||||
if fallback != "" {
|
||||
return fallback
|
||||
}
|
||||
|
||||
return strings.TrimSpace(defaultValue)
|
||||
}
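
The loader above only reports errors; below is a wiring sketch of how a process entrypoint might consume it. The `loadOrExit` helper and the use of the standard `log` package are illustrative assumptions — only the env var names and the `LoadFromEnv` signature come from this commit.

package config

import "log"

// loadOrExit is a hypothetical entrypoint helper, not part of this commit.
// It assumes the deployment exports the required variables, for example:
//
//	RTMANAGER_POSTGRES_PRIMARY_DSN=postgres://rtm:secret@db:5432/galaxy
//	RTMANAGER_REDIS_MASTER_ADDR=redis:6379
//	RTMANAGER_REDIS_PASSWORD=secret
//	RTMANAGER_GAME_STATE_ROOT=/var/lib/galaxy/games
//	RTMANAGER_LOBBY_INTERNAL_BASE_URL=http://lobby:8095
func loadOrExit() Config {
    cfg, err := LoadFromEnv()
    if err != nil {
        // Configuration problems are fatal at startup; the error message
        // typically names the offending RTMANAGER_* variable.
        log.Fatalf("rtmanager config: %v", err)
    }
    return cfg
}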
@@ -0,0 +1,93 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net"
|
||||
"net/url"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Validate reports whether cfg stores a usable Runtime Manager process
|
||||
// configuration.
|
||||
func (cfg Config) Validate() error {
|
||||
if cfg.ShutdownTimeout <= 0 {
|
||||
return fmt.Errorf("%s must be positive", shutdownTimeoutEnvVar)
|
||||
}
|
||||
if err := validateSlogLevel(cfg.Logging.Level); err != nil {
|
||||
return fmt.Errorf("%s: %w", logLevelEnvVar, err)
|
||||
}
|
||||
if err := cfg.InternalHTTP.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Docker.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Postgres.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Redis.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Streams.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Container.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Health.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Cleanup.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Coordination.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Lobby.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cfg.Telemetry.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateSlogLevel(level string) error {
|
||||
var slogLevel slog.Level
|
||||
if err := slogLevel.UnmarshalText([]byte(strings.TrimSpace(level))); err != nil {
|
||||
return fmt.Errorf("invalid slog level %q: %w", level, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func isTCPAddr(value string) bool {
|
||||
host, port, err := net.SplitHostPort(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if port == "" {
|
||||
return false
|
||||
}
|
||||
if host == "" {
|
||||
return true
|
||||
}
|
||||
|
||||
return !strings.Contains(host, " ")
|
||||
}
|
||||
|
||||
func isHTTPURL(value string) bool {
|
||||
parsed, err := url.Parse(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if parsed.Scheme != "http" && parsed.Scheme != "https" {
|
||||
return false
|
||||
}
|
||||
|
||||
return parsed.Host != ""
|
||||
}
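
The two address/URL helpers are easiest to read through examples. The sketch below is an illustrative test — the test name and cases are assumptions, not part of the commit — showing which values each helper accepts.

package config

import "testing"

// TestAddressHelpersSketch is a hypothetical test illustrating the helper
// semantics: isTCPAddr accepts a listener-style host:port with an optional
// host, isHTTPURL requires an http or https scheme plus a host.
func TestAddressHelpersSketch(t *testing.T) {
    if !isTCPAddr(":9000") {
        t.Fatal("empty host with explicit port should be accepted")
    }
    if isTCPAddr("not-an-addr") {
        t.Fatal("value without a port separator should be rejected")
    }
    if !isHTTPURL("http://lobby:8095") {
        t.Fatal("http URL with a host should be accepted")
    }
    if isHTTPURL("lobby:8095") {
        t.Fatal("scheme-less value should be rejected")
    }
}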
@@ -0,0 +1,231 @@
|
||||
// Package health defines the technical-health domain types owned by
|
||||
// Runtime Manager.
|
||||
//
|
||||
// EventType matches the `event_type` enum frozen in
|
||||
// `galaxy/rtmanager/api/runtime-health-asyncapi.yaml`. SnapshotStatus
|
||||
// matches the SQL CHECK on `health_snapshots.status` and is intentionally
|
||||
// narrower than EventType (the snapshot table collapses
|
||||
// `container_started → healthy` and drops `probe_recovered` per
|
||||
// `galaxy/rtmanager/README.md §Health Monitoring`).
|
||||
package health
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// EventType identifies one entry on the `runtime:health_events` Redis
|
||||
// Stream. Used by the health-event publishers and consumers.
|
||||
type EventType string
|
||||
|
||||
const (
|
||||
// EventTypeContainerStarted reports a successful container start.
|
||||
EventTypeContainerStarted EventType = "container_started"
|
||||
|
||||
// EventTypeContainerExited reports a non-zero Docker `die` event.
|
||||
EventTypeContainerExited EventType = "container_exited"
|
||||
|
||||
// EventTypeContainerOOM reports a Docker `oom` event.
|
||||
EventTypeContainerOOM EventType = "container_oom"
|
||||
|
||||
// EventTypeContainerDisappeared reports that the listener observed
|
||||
// a `destroy` event for a record Runtime Manager did not initiate.
|
||||
EventTypeContainerDisappeared EventType = "container_disappeared"
|
||||
|
||||
// EventTypeInspectUnhealthy reports an unexpected outcome of the
|
||||
// periodic Docker inspect (RestartCount growth, unexpected status,
|
||||
// declared HEALTHCHECK reporting unhealthy).
|
||||
EventTypeInspectUnhealthy EventType = "inspect_unhealthy"
|
||||
|
||||
// EventTypeProbeFailed reports that the active HTTP probe crossed
|
||||
// the configured failure threshold.
|
||||
EventTypeProbeFailed EventType = "probe_failed"
|
||||
|
||||
// EventTypeProbeRecovered reports the first probe success after a
|
||||
// `probe_failed` event was published.
|
||||
EventTypeProbeRecovered EventType = "probe_recovered"
|
||||
)
|
||||
|
||||
// IsKnown reports whether eventType belongs to the frozen event-type
|
||||
// vocabulary.
|
||||
func (eventType EventType) IsKnown() bool {
|
||||
switch eventType {
|
||||
case EventTypeContainerStarted,
|
||||
EventTypeContainerExited,
|
||||
EventTypeContainerOOM,
|
||||
EventTypeContainerDisappeared,
|
||||
EventTypeInspectUnhealthy,
|
||||
EventTypeProbeFailed,
|
||||
EventTypeProbeRecovered:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AllEventTypes returns the frozen list of every event-type value.
|
||||
func AllEventTypes() []EventType {
|
||||
return []EventType{
|
||||
EventTypeContainerStarted,
|
||||
EventTypeContainerExited,
|
||||
EventTypeContainerOOM,
|
||||
EventTypeContainerDisappeared,
|
||||
EventTypeInspectUnhealthy,
|
||||
EventTypeProbeFailed,
|
||||
EventTypeProbeRecovered,
|
||||
}
|
||||
}
|
||||
|
||||
// SnapshotStatus identifies one latest-observation status value stored
|
||||
// in the `health_snapshots.status` column. Distinct from EventType: the
|
||||
// table collapses `container_started → healthy` and never persists
|
||||
// `probe_recovered` (it is conveyed only as a `runtime:health_events`
|
||||
// entry with status=healthy in the next observation).
|
||||
type SnapshotStatus string
|
||||
|
||||
const (
|
||||
// SnapshotStatusHealthy reports that the most recent observation
|
||||
// found the container live and the engine probe responsive.
|
||||
SnapshotStatusHealthy SnapshotStatus = "healthy"
|
||||
|
||||
// SnapshotStatusProbeFailed reports that the active probe crossed
|
||||
// the failure threshold.
|
||||
SnapshotStatusProbeFailed SnapshotStatus = "probe_failed"
|
||||
|
||||
// SnapshotStatusExited reports that the container exited.
|
||||
SnapshotStatusExited SnapshotStatus = "exited"
|
||||
|
||||
// SnapshotStatusOOM reports that the container was killed by the
|
||||
// OOM killer.
|
||||
SnapshotStatusOOM SnapshotStatus = "oom"
|
||||
|
||||
// SnapshotStatusInspectUnhealthy reports that the periodic inspect
|
||||
// observed an unexpected state.
|
||||
SnapshotStatusInspectUnhealthy SnapshotStatus = "inspect_unhealthy"
|
||||
|
||||
// SnapshotStatusContainerDisappeared reports that Docker no longer
|
||||
// reports the container.
|
||||
SnapshotStatusContainerDisappeared SnapshotStatus = "container_disappeared"
|
||||
)
|
||||
|
||||
// IsKnown reports whether status belongs to the frozen snapshot-status
|
||||
// vocabulary.
|
||||
func (status SnapshotStatus) IsKnown() bool {
|
||||
switch status {
|
||||
case SnapshotStatusHealthy,
|
||||
SnapshotStatusProbeFailed,
|
||||
SnapshotStatusExited,
|
||||
SnapshotStatusOOM,
|
||||
SnapshotStatusInspectUnhealthy,
|
||||
SnapshotStatusContainerDisappeared:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AllSnapshotStatuses returns the frozen list of every snapshot-status
|
||||
// value.
|
||||
func AllSnapshotStatuses() []SnapshotStatus {
|
||||
return []SnapshotStatus{
|
||||
SnapshotStatusHealthy,
|
||||
SnapshotStatusProbeFailed,
|
||||
SnapshotStatusExited,
|
||||
SnapshotStatusOOM,
|
||||
SnapshotStatusInspectUnhealthy,
|
||||
SnapshotStatusContainerDisappeared,
|
||||
}
|
||||
}
|
||||
|
||||
// SnapshotSource identifies the observation source that produced one
|
||||
// snapshot. Matches the SQL CHECK on `health_snapshots.source`.
|
||||
type SnapshotSource string
|
||||
|
||||
const (
|
||||
// SnapshotSourceDockerEvent reports that the latest observation
|
||||
// arrived through the Docker events listener.
|
||||
SnapshotSourceDockerEvent SnapshotSource = "docker_event"
|
||||
|
||||
// SnapshotSourceInspect reports that the latest observation arrived
|
||||
// through the periodic Docker inspect worker.
|
||||
SnapshotSourceInspect SnapshotSource = "inspect"
|
||||
|
||||
// SnapshotSourceProbe reports that the latest observation arrived
|
||||
// through the active HTTP probe.
|
||||
SnapshotSourceProbe SnapshotSource = "probe"
|
||||
)
|
||||
|
||||
// IsKnown reports whether source belongs to the frozen snapshot-source
|
||||
// vocabulary.
|
||||
func (source SnapshotSource) IsKnown() bool {
|
||||
switch source {
|
||||
case SnapshotSourceDockerEvent,
|
||||
SnapshotSourceInspect,
|
||||
SnapshotSourceProbe:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AllSnapshotSources returns the frozen list of every snapshot-source
|
||||
// value.
|
||||
func AllSnapshotSources() []SnapshotSource {
|
||||
return []SnapshotSource{
|
||||
SnapshotSourceDockerEvent,
|
||||
SnapshotSourceInspect,
|
||||
SnapshotSourceProbe,
|
||||
}
|
||||
}
|
||||
|
||||
// HealthSnapshot stores the latest technical-health observation for one
|
||||
// game. One row per game_id; later observations overwrite.
|
||||
type HealthSnapshot struct {
|
||||
// GameID identifies the platform game.
|
||||
GameID string
|
||||
|
||||
// ContainerID stores the Docker container id observed by the
|
||||
// snapshot source. Empty when the source could not associate a
|
||||
// container (e.g., reconciler dispose for a record whose container
|
||||
// is already gone).
|
||||
ContainerID string
|
||||
|
||||
// Status stores the latest observed snapshot status.
|
||||
Status SnapshotStatus
|
||||
|
||||
// Source stores the observation source that produced this entry.
|
||||
Source SnapshotSource
|
||||
|
||||
// Details stores the source-specific JSON detail payload. Adapters
|
||||
// store and retrieve it verbatim. Empty / nil values are persisted
|
||||
// as the SQL default `{}`.
|
||||
Details json.RawMessage
|
||||
|
||||
// ObservedAt stores the wall-clock at which the source captured the
|
||||
// observation.
|
||||
ObservedAt time.Time
|
||||
}
|
||||
|
||||
// Validate reports whether snapshot satisfies the snapshot invariants
|
||||
// implied by the SQL CHECK constraints.
|
||||
func (snapshot HealthSnapshot) Validate() error {
|
||||
if strings.TrimSpace(snapshot.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !snapshot.Status.IsKnown() {
|
||||
return fmt.Errorf("status %q is unsupported", snapshot.Status)
|
||||
}
|
||||
if !snapshot.Source.IsKnown() {
|
||||
return fmt.Errorf("source %q is unsupported", snapshot.Source)
|
||||
}
|
||||
if snapshot.ObservedAt.IsZero() {
|
||||
return fmt.Errorf("observed at must not be zero")
|
||||
}
|
||||
if len(snapshot.Details) > 0 && !json.Valid(snapshot.Details) {
|
||||
return fmt.Errorf("details must be valid JSON when non-empty")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
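
To make the event-to-snapshot collapse described in the package comment concrete, here is a sketch of the mapping a consumer might apply. The helper name and the `probe_recovered → healthy` choice are illustrative assumptions, not code from this commit.

package health

// snapshotStatusForEvent sketches the collapse described in the package
// comment; it is hypothetical and not part of this commit. The boolean is
// false for unknown event types.
func snapshotStatusForEvent(eventType EventType) (SnapshotStatus, bool) {
    switch eventType {
    case EventTypeContainerStarted, EventTypeProbeRecovered:
        // Both report a live container, so the snapshot row records the
        // collapsed "healthy" status.
        return SnapshotStatusHealthy, true
    case EventTypeContainerExited:
        return SnapshotStatusExited, true
    case EventTypeContainerOOM:
        return SnapshotStatusOOM, true
    case EventTypeContainerDisappeared:
        return SnapshotStatusContainerDisappeared, true
    case EventTypeInspectUnhealthy:
        return SnapshotStatusInspectUnhealthy, true
    case EventTypeProbeFailed:
        return SnapshotStatusProbeFailed, true
    default:
        return "", false
    }
}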
@@ -0,0 +1,133 @@
|
||||
package health
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestEventTypeIsKnown(t *testing.T) {
|
||||
for _, eventType := range AllEventTypes() {
|
||||
assert.Truef(t, eventType.IsKnown(), "expected %q known", eventType)
|
||||
}
|
||||
|
||||
assert.False(t, EventType("").IsKnown())
|
||||
assert.False(t, EventType("paused").IsKnown())
|
||||
}
|
||||
|
||||
func TestAllEventTypesCoverFrozenSet(t *testing.T) {
|
||||
assert.ElementsMatch(t,
|
||||
[]EventType{
|
||||
EventTypeContainerStarted,
|
||||
EventTypeContainerExited,
|
||||
EventTypeContainerOOM,
|
||||
EventTypeContainerDisappeared,
|
||||
EventTypeInspectUnhealthy,
|
||||
EventTypeProbeFailed,
|
||||
EventTypeProbeRecovered,
|
||||
},
|
||||
AllEventTypes(),
|
||||
)
|
||||
}
|
||||
|
||||
func TestSnapshotStatusIsKnown(t *testing.T) {
|
||||
for _, status := range AllSnapshotStatuses() {
|
||||
assert.Truef(t, status.IsKnown(), "expected %q known", status)
|
||||
}
|
||||
|
||||
assert.False(t, SnapshotStatus("").IsKnown())
|
||||
assert.False(t, SnapshotStatus("starting").IsKnown())
|
||||
assert.False(t, SnapshotStatus("probe_recovered").IsKnown(),
|
||||
"snapshot status must not include event-only values")
|
||||
assert.False(t, SnapshotStatus("container_started").IsKnown(),
|
||||
"snapshot status must not include event-only values")
|
||||
}
|
||||
|
||||
func TestAllSnapshotStatusesCoverFrozenSet(t *testing.T) {
|
||||
assert.ElementsMatch(t,
|
||||
[]SnapshotStatus{
|
||||
SnapshotStatusHealthy,
|
||||
SnapshotStatusProbeFailed,
|
||||
SnapshotStatusExited,
|
||||
SnapshotStatusOOM,
|
||||
SnapshotStatusInspectUnhealthy,
|
||||
SnapshotStatusContainerDisappeared,
|
||||
},
|
||||
AllSnapshotStatuses(),
|
||||
)
|
||||
}
|
||||
|
||||
func TestSnapshotSourceIsKnown(t *testing.T) {
|
||||
for _, source := range AllSnapshotSources() {
|
||||
assert.Truef(t, source.IsKnown(), "expected %q known", source)
|
||||
}
|
||||
|
||||
assert.False(t, SnapshotSource("").IsKnown())
|
||||
assert.False(t, SnapshotSource("manual").IsKnown())
|
||||
}
|
||||
|
||||
func TestAllSnapshotSourcesCoverFrozenSet(t *testing.T) {
|
||||
assert.ElementsMatch(t,
|
||||
[]SnapshotSource{
|
||||
SnapshotSourceDockerEvent,
|
||||
SnapshotSourceInspect,
|
||||
SnapshotSourceProbe,
|
||||
},
|
||||
AllSnapshotSources(),
|
||||
)
|
||||
}
|
||||
|
||||
func sampleSnapshot() HealthSnapshot {
|
||||
return HealthSnapshot{
|
||||
GameID: "game-test",
|
||||
ContainerID: "container-1",
|
||||
Status: SnapshotStatusHealthy,
|
||||
Source: SnapshotSourceProbe,
|
||||
Details: json.RawMessage(`{"prior_failure_count":0}`),
|
||||
ObservedAt: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthSnapshotValidateHappy(t *testing.T) {
|
||||
require.NoError(t, sampleSnapshot().Validate())
|
||||
}
|
||||
|
||||
func TestHealthSnapshotValidateAcceptsEmptyDetails(t *testing.T) {
|
||||
snapshot := sampleSnapshot()
|
||||
snapshot.Details = nil
|
||||
|
||||
assert.NoError(t, snapshot.Validate())
|
||||
}
|
||||
|
||||
func TestHealthSnapshotValidateAcceptsEmptyContainerID(t *testing.T) {
|
||||
snapshot := sampleSnapshot()
|
||||
snapshot.ContainerID = ""
|
||||
|
||||
assert.NoError(t, snapshot.Validate())
|
||||
}
|
||||
|
||||
func TestHealthSnapshotValidateRejects(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*HealthSnapshot)
|
||||
}{
|
||||
{"empty game id", func(s *HealthSnapshot) { s.GameID = "" }},
|
||||
{"unknown status", func(s *HealthSnapshot) { s.Status = "exotic" }},
|
||||
{"unknown source", func(s *HealthSnapshot) { s.Source = "exotic" }},
|
||||
{"zero observed at", func(s *HealthSnapshot) { s.ObservedAt = time.Time{} }},
|
||||
{"invalid details json", func(s *HealthSnapshot) {
|
||||
s.Details = json.RawMessage("not-json")
|
||||
}},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
snapshot := sampleSnapshot()
|
||||
tt.mutate(&snapshot)
|
||||
assert.Error(t, snapshot.Validate())
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,245 @@
|
||||
// Package operation defines the runtime-operation audit-log domain types
|
||||
// owned by Runtime Manager.
|
||||
//
|
||||
// One OperationEntry maps to one row of the `operation_log` PostgreSQL
|
||||
// table (see
|
||||
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`).
|
||||
// The OpKind / OpSource / Outcome enums match the SQL CHECK constraints
|
||||
// verbatim and feed the telemetry counters declared in
|
||||
// `galaxy/rtmanager/README.md §Observability`.
|
||||
package operation
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// OpKind identifies the kind of operation Runtime Manager performed.
|
||||
type OpKind string
|
||||
|
||||
const (
|
||||
// OpKindStart records a start lifecycle operation.
|
||||
OpKindStart OpKind = "start"
|
||||
|
||||
// OpKindStop records a stop lifecycle operation.
|
||||
OpKindStop OpKind = "stop"
|
||||
|
||||
// OpKindRestart records a restart lifecycle operation
|
||||
// (recreate with the same image_ref).
|
||||
OpKindRestart OpKind = "restart"
|
||||
|
||||
// OpKindPatch records a semver-patch lifecycle operation
|
||||
// (recreate with a new image_ref).
|
||||
OpKindPatch OpKind = "patch"
|
||||
|
||||
// OpKindCleanupContainer records a container removal performed by
|
||||
// the cleanup TTL worker or the admin DELETE endpoint.
|
||||
OpKindCleanupContainer OpKind = "cleanup_container"
|
||||
|
||||
// OpKindReconcileAdopt records that the reconciler discovered an
|
||||
// unrecorded container labelled `com.galaxy.owner=rtmanager` and
|
||||
// inserted a runtime record for it.
|
||||
OpKindReconcileAdopt OpKind = "reconcile_adopt"
|
||||
|
||||
// OpKindReconcileDispose records that the reconciler observed a
|
||||
// running record whose container is missing in Docker and marked it
|
||||
// as removed.
|
||||
OpKindReconcileDispose OpKind = "reconcile_dispose"
|
||||
)
|
||||
|
||||
// IsKnown reports whether kind belongs to the frozen op-kind vocabulary.
|
||||
func (kind OpKind) IsKnown() bool {
|
||||
switch kind {
|
||||
case OpKindStart,
|
||||
OpKindStop,
|
||||
OpKindRestart,
|
||||
OpKindPatch,
|
||||
OpKindCleanupContainer,
|
||||
OpKindReconcileAdopt,
|
||||
OpKindReconcileDispose:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AllOpKinds returns the frozen list of every op-kind value. The slice
|
||||
// order is stable across calls.
|
||||
func AllOpKinds() []OpKind {
|
||||
return []OpKind{
|
||||
OpKindStart,
|
||||
OpKindStop,
|
||||
OpKindRestart,
|
||||
OpKindPatch,
|
||||
OpKindCleanupContainer,
|
||||
OpKindReconcileAdopt,
|
||||
OpKindReconcileDispose,
|
||||
}
|
||||
}
|
||||
|
||||
// OpSource identifies where one operation entered Runtime Manager.
|
||||
type OpSource string
|
||||
|
||||
const (
|
||||
// OpSourceLobbyStream identifies entries triggered by the
|
||||
// `runtime:start_jobs` or `runtime:stop_jobs` Redis Stream consumer.
|
||||
OpSourceLobbyStream OpSource = "lobby_stream"
|
||||
|
||||
// OpSourceGMRest identifies entries triggered by Game Master through
|
||||
// the internal REST surface.
|
||||
OpSourceGMRest OpSource = "gm_rest"
|
||||
|
||||
// OpSourceAdminRest identifies entries triggered by Admin Service
|
||||
// through the internal REST surface.
|
||||
OpSourceAdminRest OpSource = "admin_rest"
|
||||
|
||||
// OpSourceAutoTTL identifies entries triggered by the periodic
|
||||
// container-cleanup worker.
|
||||
OpSourceAutoTTL OpSource = "auto_ttl"
|
||||
|
||||
// OpSourceAutoReconcile identifies entries triggered by the
|
||||
// reconciler at startup or on its periodic interval.
|
||||
OpSourceAutoReconcile OpSource = "auto_reconcile"
|
||||
)
|
||||
|
||||
// IsKnown reports whether source belongs to the frozen op-source
|
||||
// vocabulary.
|
||||
func (source OpSource) IsKnown() bool {
|
||||
switch source {
|
||||
case OpSourceLobbyStream,
|
||||
OpSourceGMRest,
|
||||
OpSourceAdminRest,
|
||||
OpSourceAutoTTL,
|
||||
OpSourceAutoReconcile:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AllOpSources returns the frozen list of every op-source value. The
|
||||
// slice order is stable across calls.
|
||||
func AllOpSources() []OpSource {
|
||||
return []OpSource{
|
||||
OpSourceLobbyStream,
|
||||
OpSourceGMRest,
|
||||
OpSourceAdminRest,
|
||||
OpSourceAutoTTL,
|
||||
OpSourceAutoReconcile,
|
||||
}
|
||||
}
|
||||
|
||||
// Outcome reports the high-level outcome of one operation.
|
||||
type Outcome string
|
||||
|
||||
const (
|
||||
// OutcomeSuccess reports that the operation completed without
|
||||
// surfacing an error.
|
||||
OutcomeSuccess Outcome = "success"
|
||||
|
||||
// OutcomeFailure reports that the operation surfaced a stable error
|
||||
// code recorded in OperationEntry.ErrorCode.
|
||||
OutcomeFailure Outcome = "failure"
|
||||
)
|
||||
|
||||
// IsKnown reports whether outcome belongs to the frozen outcome
|
||||
// vocabulary.
|
||||
func (outcome Outcome) IsKnown() bool {
|
||||
switch outcome {
|
||||
case OutcomeSuccess, OutcomeFailure:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AllOutcomes returns the frozen list of every outcome value.
|
||||
func AllOutcomes() []Outcome {
|
||||
return []Outcome{OutcomeSuccess, OutcomeFailure}
|
||||
}
|
||||
|
||||
// OperationEntry stores one append-only audit row of the `operation_log`
|
||||
// table. ID is zero on records that have not been persisted yet; the
|
||||
// store assigns it from the table's bigserial column. FinishedAt is a
|
||||
// pointer because the column is nullable for in-flight rows even though
|
||||
// the lifecycle services finalise the row in the same transaction.
|
||||
type OperationEntry struct {
|
||||
// ID identifies the persisted row. Zero before persistence.
|
||||
ID int64
|
||||
|
||||
// GameID identifies the platform game this operation acted on.
|
||||
GameID string
|
||||
|
||||
// OpKind classifies what the operation did.
|
||||
OpKind OpKind
|
||||
|
||||
// OpSource classifies how the operation entered Runtime Manager.
|
||||
OpSource OpSource
|
||||
|
||||
// SourceRef stores an opaque per-source reference such as a Redis
|
||||
// Stream entry id, a REST request id, or an admin user id. Empty
|
||||
// when the source does not provide one.
|
||||
SourceRef string
|
||||
|
||||
// ImageRef stores the engine image reference associated with the
|
||||
// operation, when applicable. Empty for operations that do not
|
||||
// touch an image (e.g., cleanup_container).
|
||||
ImageRef string
|
||||
|
||||
// ContainerID stores the Docker container id observed at the time
|
||||
// of the operation, when applicable.
|
||||
ContainerID string
|
||||
|
||||
// Outcome reports whether the operation succeeded or failed.
|
||||
Outcome Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure. Empty on
|
||||
// success.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty on success.
|
||||
ErrorMessage string
|
||||
|
||||
// StartedAt stores the wall-clock at which the operation began.
|
||||
StartedAt time.Time
|
||||
|
||||
// FinishedAt stores the wall-clock at which the operation
|
||||
// finalised. Nil for in-flight rows.
|
||||
FinishedAt *time.Time
|
||||
}
|
||||
|
||||
// Validate reports whether entry satisfies the operation-log invariants
|
||||
// implied by the SQL CHECK constraints and the README §Persistence
|
||||
// Layout.
|
||||
func (entry OperationEntry) Validate() error {
|
||||
if strings.TrimSpace(entry.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !entry.OpKind.IsKnown() {
|
||||
return fmt.Errorf("op kind %q is unsupported", entry.OpKind)
|
||||
}
|
||||
if !entry.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", entry.OpSource)
|
||||
}
|
||||
if !entry.Outcome.IsKnown() {
|
||||
return fmt.Errorf("outcome %q is unsupported", entry.Outcome)
|
||||
}
|
||||
if entry.StartedAt.IsZero() {
|
||||
return fmt.Errorf("started at must not be zero")
|
||||
}
|
||||
if entry.FinishedAt != nil {
|
||||
if entry.FinishedAt.IsZero() {
|
||||
return fmt.Errorf("finished at must not be zero when present")
|
||||
}
|
||||
if entry.FinishedAt.Before(entry.StartedAt) {
|
||||
return fmt.Errorf("finished at must not be before started at")
|
||||
}
|
||||
}
|
||||
if entry.Outcome == OutcomeFailure && strings.TrimSpace(entry.ErrorCode) == "" {
|
||||
return fmt.Errorf("error code must not be empty for failure entries")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
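
A sketch of a failure row that satisfies Validate is shown below; the helper, the error code, and the message are placeholders for illustration — real stable codes are assigned by the lifecycle services, not by this commit.

package operation

import "time"

// failedStopEntry is a hypothetical constructor, not part of this commit,
// illustrating the failure invariants: a failure outcome requires a
// non-empty ErrorCode, and FinishedAt must not precede StartedAt.
func failedStopEntry(gameID, containerID string, startedAt time.Time) OperationEntry {
    finished := startedAt.Add(time.Second)
    return OperationEntry{
        GameID:       gameID,
        OpKind:       OpKindStop,
        OpSource:     OpSourceGMRest,
        ContainerID:  containerID,
        Outcome:      OutcomeFailure,
        ErrorCode:    "placeholder_code", // illustrative only, not a real code
        ErrorMessage: "illustrative operator-facing detail",
        StartedAt:    startedAt,
        FinishedAt:   &finished,
    }
}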
@@ -0,0 +1,130 @@
|
||||
package operation
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestOpKindIsKnown(t *testing.T) {
|
||||
for _, kind := range AllOpKinds() {
|
||||
assert.Truef(t, kind.IsKnown(), "expected %q known", kind)
|
||||
}
|
||||
|
||||
assert.False(t, OpKind("").IsKnown())
|
||||
assert.False(t, OpKind("rollback").IsKnown())
|
||||
}
|
||||
|
||||
func TestAllOpKindsCoverFrozenSet(t *testing.T) {
|
||||
assert.ElementsMatch(t,
|
||||
[]OpKind{
|
||||
OpKindStart, OpKindStop, OpKindRestart, OpKindPatch,
|
||||
OpKindCleanupContainer, OpKindReconcileAdopt, OpKindReconcileDispose,
|
||||
},
|
||||
AllOpKinds(),
|
||||
)
|
||||
}
|
||||
|
||||
func TestOpSourceIsKnown(t *testing.T) {
|
||||
for _, source := range AllOpSources() {
|
||||
assert.Truef(t, source.IsKnown(), "expected %q known", source)
|
||||
}
|
||||
|
||||
assert.False(t, OpSource("").IsKnown())
|
||||
assert.False(t, OpSource("manual").IsKnown())
|
||||
}
|
||||
|
||||
func TestAllOpSourcesCoverFrozenSet(t *testing.T) {
|
||||
assert.ElementsMatch(t,
|
||||
[]OpSource{
|
||||
OpSourceLobbyStream, OpSourceGMRest, OpSourceAdminRest,
|
||||
OpSourceAutoTTL, OpSourceAutoReconcile,
|
||||
},
|
||||
AllOpSources(),
|
||||
)
|
||||
}
|
||||
|
||||
func TestOutcomeIsKnown(t *testing.T) {
|
||||
for _, outcome := range AllOutcomes() {
|
||||
assert.Truef(t, outcome.IsKnown(), "expected %q known", outcome)
|
||||
}
|
||||
|
||||
assert.False(t, Outcome("").IsKnown())
|
||||
assert.False(t, Outcome("partial").IsKnown())
|
||||
}
|
||||
|
||||
func TestAllOutcomesCoverFrozenSet(t *testing.T) {
|
||||
assert.ElementsMatch(t,
|
||||
[]Outcome{OutcomeSuccess, OutcomeFailure},
|
||||
AllOutcomes(),
|
||||
)
|
||||
}
|
||||
|
||||
func successEntry() OperationEntry {
|
||||
started := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
finished := started.Add(time.Second)
|
||||
return OperationEntry{
|
||||
GameID: "game-test",
|
||||
OpKind: OpKindStart,
|
||||
OpSource: OpSourceLobbyStream,
|
||||
SourceRef: "1700000000000-0",
|
||||
ImageRef: "galaxy/game:1.0.0",
|
||||
ContainerID: "container-1",
|
||||
Outcome: OutcomeSuccess,
|
||||
StartedAt: started,
|
||||
FinishedAt: &finished,
|
||||
}
|
||||
}
|
||||
|
||||
func TestOperationEntryValidateHappy(t *testing.T) {
|
||||
require.NoError(t, successEntry().Validate())
|
||||
}
|
||||
|
||||
func TestOperationEntryValidateAcceptsReplayNoOp(t *testing.T) {
|
||||
entry := successEntry()
|
||||
entry.ErrorCode = "replay_no_op"
|
||||
|
||||
assert.NoError(t, entry.Validate())
|
||||
}
|
||||
|
||||
func TestOperationEntryValidateAcceptsInFlight(t *testing.T) {
|
||||
entry := successEntry()
|
||||
entry.FinishedAt = nil
|
||||
|
||||
assert.NoError(t, entry.Validate())
|
||||
}
|
||||
|
||||
func TestOperationEntryValidateRejects(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*OperationEntry)
|
||||
}{
|
||||
{"empty game id", func(e *OperationEntry) { e.GameID = "" }},
|
||||
{"unknown op kind", func(e *OperationEntry) { e.OpKind = "exotic" }},
|
||||
{"unknown op source", func(e *OperationEntry) { e.OpSource = "exotic" }},
|
||||
{"unknown outcome", func(e *OperationEntry) { e.Outcome = "partial" }},
|
||||
{"zero started at", func(e *OperationEntry) { e.StartedAt = time.Time{} }},
|
||||
{"zero finished at", func(e *OperationEntry) {
|
||||
zero := time.Time{}
|
||||
e.FinishedAt = &zero
|
||||
}},
|
||||
{"finished before started", func(e *OperationEntry) {
|
||||
before := e.StartedAt.Add(-time.Second)
|
||||
e.FinishedAt = &before
|
||||
}},
|
||||
{"failure without error code", func(e *OperationEntry) {
|
||||
e.Outcome = OutcomeFailure
|
||||
e.ErrorCode = ""
|
||||
}},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
entry := successEntry()
|
||||
tt.mutate(&entry)
|
||||
assert.Error(t, entry.Validate())
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
package runtime
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// ErrNotFound reports that a runtime record was requested but does not
|
||||
// exist in the store.
|
||||
var ErrNotFound = errors.New("runtime record not found")
|
||||
|
||||
// ErrConflict reports that a runtime mutation could not be applied
|
||||
// because the record changed concurrently or failed a compare-and-swap
|
||||
// guard.
|
||||
var ErrConflict = errors.New("runtime record conflict")
|
||||
|
||||
// ErrInvalidTransition is the sentinel returned when Transition rejects
|
||||
// a `(from, to)` pair.
|
||||
var ErrInvalidTransition = errors.New("invalid runtime status transition")
|
||||
|
||||
// InvalidTransitionError stores the rejected `(from, to)` pair and wraps
|
||||
// ErrInvalidTransition so callers can match it with errors.Is.
|
||||
type InvalidTransitionError struct {
|
||||
// From stores the source status the rejected transition attempted to leave.
|
||||
From Status
|
||||
|
||||
// To stores the destination status the rejected transition attempted to enter.
|
||||
To Status
|
||||
}
|
||||
|
||||
// Error reports a human-readable summary of the rejected pair.
|
||||
func (err *InvalidTransitionError) Error() string {
|
||||
return fmt.Sprintf(
|
||||
"invalid runtime status transition from %q to %q",
|
||||
err.From, err.To,
|
||||
)
|
||||
}
|
||||
|
||||
// Unwrap returns ErrInvalidTransition so errors.Is recognizes the
|
||||
// sentinel.
|
||||
func (err *InvalidTransitionError) Unwrap() error {
|
||||
return ErrInvalidTransition
|
||||
}
|
||||
@@ -0,0 +1,197 @@
|
||||
// Package runtime defines the runtime-record domain model, status machine,
|
||||
// and sentinel errors owned by Runtime Manager.
|
||||
//
|
||||
// The package mirrors the durable shape of the `runtime_records`
|
||||
// PostgreSQL table (see
|
||||
// `galaxy/rtmanager/internal/adapters/postgres/migrations/00001_init.sql`).
|
||||
// Every status / transition / required-field rule already documented in
|
||||
// `galaxy/rtmanager/README.md` lives here as code so adapter and service
|
||||
// layers do not re-derive it.
|
||||
package runtime
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Status identifies one runtime-record lifecycle state.
|
||||
type Status string
|
||||
|
||||
const (
|
||||
// StatusRunning reports that an engine container is live and bound to
|
||||
// the record. The associated container id and image ref are non-empty
|
||||
// and StartedAt is set.
|
||||
StatusRunning Status = "running"
|
||||
|
||||
// StatusStopped reports that the engine container has exited (graceful
|
||||
// stop, observed Docker exit, or reconciled exit). The container is
|
||||
// still present in Docker until the cleanup worker removes it.
|
||||
StatusStopped Status = "stopped"
|
||||
|
||||
// StatusRemoved reports that the container has been removed from
|
||||
// Docker (admin cleanup or reconcile_dispose). The record stays in
|
||||
// PostgreSQL for audit; there is no transition out of this state.
|
||||
StatusRemoved Status = "removed"
|
||||
)
|
||||
|
||||
// IsKnown reports whether status belongs to the frozen runtime status
|
||||
// vocabulary.
|
||||
func (status Status) IsKnown() bool {
|
||||
switch status {
|
||||
case StatusRunning, StatusStopped, StatusRemoved:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// IsTerminal reports whether status can no longer accept lifecycle
|
||||
// transitions.
|
||||
func (status Status) IsTerminal() bool {
|
||||
return status == StatusRemoved
|
||||
}
|
||||
|
||||
// AllStatuses returns the frozen list of every runtime status value. The
|
||||
// slice order is stable across calls and matches the README §Persistence
|
||||
// Layout listing.
|
||||
func AllStatuses() []Status {
|
||||
return []Status{
|
||||
StatusRunning,
|
||||
StatusStopped,
|
||||
StatusRemoved,
|
||||
}
|
||||
}
|
||||
|
||||
// RuntimeRecord stores one durable runtime record owned by Runtime
|
||||
// Manager. It mirrors one row of the `runtime_records` table.
|
||||
//
|
||||
// CurrentContainerID and CurrentImageRef are stored as plain strings; an
|
||||
// empty value represents SQL NULL and is bridged at the adapter layer.
|
||||
// StartedAt, StoppedAt, and RemovedAt are *time.Time so a missing value
|
||||
// is unambiguous and aligns with the jet-generated model.
|
||||
type RuntimeRecord struct {
|
||||
// GameID identifies the platform game owning this runtime record.
|
||||
GameID string
|
||||
|
||||
// Status stores the current lifecycle state.
|
||||
Status Status
|
||||
|
||||
// CurrentContainerID identifies the bound Docker container. Empty
|
||||
// when status is removed and after a reconciler observes
|
||||
// disappearance.
|
||||
CurrentContainerID string
|
||||
|
||||
// CurrentImageRef stores the Docker reference of the currently-bound
|
||||
// engine image. Non-empty when status is running or stopped.
|
||||
CurrentImageRef string
|
||||
|
||||
// EngineEndpoint stores the stable URL Game Master uses to reach the
|
||||
// engine container, in `http://galaxy-game-{game_id}:8080` form.
|
||||
EngineEndpoint string
|
||||
|
||||
// StatePath stores the absolute host path of the bind-mounted engine
|
||||
// state directory.
|
||||
StatePath string
|
||||
|
||||
// DockerNetwork stores the Docker network the container was attached
|
||||
// to at create time.
|
||||
DockerNetwork string
|
||||
|
||||
// StartedAt stores the wall-clock at which the container became
|
||||
// running. Non-nil when status is running or stopped.
|
||||
StartedAt *time.Time
|
||||
|
||||
// StoppedAt stores the wall-clock at which the container exited.
|
||||
// Non-nil when status is stopped or removed (when the record passed
|
||||
// through stopped before removal).
|
||||
StoppedAt *time.Time
|
||||
|
||||
// RemovedAt stores the wall-clock at which the container was removed
|
||||
// from Docker. Non-nil when status is removed.
|
||||
RemovedAt *time.Time
|
||||
|
||||
// LastOpAt stores the wall-clock of the most recent operation
|
||||
// affecting this record. Drives the cleanup TTL.
|
||||
LastOpAt time.Time
|
||||
|
||||
// CreatedAt stores the wall-clock at which Runtime Manager first saw
|
||||
// this game.
|
||||
CreatedAt time.Time
|
||||
}
|
||||
|
||||
// Validate reports whether record satisfies the runtime-record invariants
|
||||
// implied by README §Lifecycles and the SQL CHECK on `runtime_records`.
|
||||
func (record RuntimeRecord) Validate() error {
|
||||
if strings.TrimSpace(record.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !record.Status.IsKnown() {
|
||||
return fmt.Errorf("status %q is unsupported", record.Status)
|
||||
}
|
||||
if strings.TrimSpace(record.EngineEndpoint) == "" {
|
||||
return fmt.Errorf("engine endpoint must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(record.StatePath) == "" {
|
||||
return fmt.Errorf("state path must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(record.DockerNetwork) == "" {
|
||||
return fmt.Errorf("docker network must not be empty")
|
||||
}
|
||||
if record.LastOpAt.IsZero() {
|
||||
return fmt.Errorf("last op at must not be zero")
|
||||
}
|
||||
if record.CreatedAt.IsZero() {
|
||||
return fmt.Errorf("created at must not be zero")
|
||||
}
|
||||
if record.LastOpAt.Before(record.CreatedAt) {
|
||||
return fmt.Errorf("last op at must not be before created at")
|
||||
}
|
||||
|
||||
switch record.Status {
|
||||
case StatusRunning:
|
||||
if strings.TrimSpace(record.CurrentContainerID) == "" {
|
||||
return fmt.Errorf("current container id must not be empty for running records")
|
||||
}
|
||||
if strings.TrimSpace(record.CurrentImageRef) == "" {
|
||||
return fmt.Errorf("current image ref must not be empty for running records")
|
||||
}
|
||||
if record.StartedAt == nil {
|
||||
return fmt.Errorf("started at must not be nil for running records")
|
||||
}
|
||||
if record.StartedAt.IsZero() {
|
||||
return fmt.Errorf("started at must not be zero when present")
|
||||
}
|
||||
|
||||
case StatusStopped:
|
||||
if strings.TrimSpace(record.CurrentImageRef) == "" {
|
||||
return fmt.Errorf("current image ref must not be empty for stopped records")
|
||||
}
|
||||
if record.StoppedAt == nil {
|
||||
return fmt.Errorf("stopped at must not be nil for stopped records")
|
||||
}
|
||||
if record.StoppedAt.IsZero() {
|
||||
return fmt.Errorf("stopped at must not be zero when present")
|
||||
}
|
||||
|
||||
case StatusRemoved:
|
||||
if record.RemovedAt == nil {
|
||||
return fmt.Errorf("removed at must not be nil for removed records")
|
||||
}
|
||||
if record.RemovedAt.IsZero() {
|
||||
return fmt.Errorf("removed at must not be zero when present")
|
||||
}
|
||||
}
|
||||
|
||||
if record.StartedAt != nil && record.StartedAt.Before(record.CreatedAt) {
|
||||
return fmt.Errorf("started at must not be before created at")
|
||||
}
|
||||
if record.StoppedAt != nil && record.StartedAt != nil && record.StoppedAt.Before(*record.StartedAt) {
|
||||
return fmt.Errorf("stopped at must not be before started at")
|
||||
}
|
||||
if record.RemovedAt != nil && record.RemovedAt.Before(record.CreatedAt) {
|
||||
return fmt.Errorf("removed at must not be before created at")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,156 @@
|
||||
package runtime
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestStatusIsKnown(t *testing.T) {
|
||||
for _, status := range AllStatuses() {
|
||||
assert.Truef(t, status.IsKnown(), "expected %q known", status)
|
||||
}
|
||||
|
||||
assert.False(t, Status("").IsKnown())
|
||||
assert.False(t, Status("unknown").IsKnown())
|
||||
}
|
||||
|
||||
func TestStatusIsTerminal(t *testing.T) {
|
||||
assert.True(t, StatusRemoved.IsTerminal())
|
||||
|
||||
for _, status := range []Status{StatusRunning, StatusStopped} {
|
||||
assert.Falsef(t, status.IsTerminal(), "expected %q non-terminal", status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAllStatuses(t *testing.T) {
|
||||
statuses := AllStatuses()
|
||||
|
||||
assert.ElementsMatch(t,
|
||||
[]Status{StatusRunning, StatusStopped, StatusRemoved},
|
||||
statuses,
|
||||
)
|
||||
|
||||
statuses[0] = "tampered"
|
||||
assert.Equal(t, StatusRunning, AllStatuses()[0],
|
||||
"AllStatuses must return an independent slice")
|
||||
}
|
||||
|
||||
func runningRecord() RuntimeRecord {
|
||||
created := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC)
|
||||
started := created.Add(time.Second)
|
||||
return RuntimeRecord{
|
||||
GameID: "game-test",
|
||||
Status: StatusRunning,
|
||||
CurrentContainerID: "container-1",
|
||||
CurrentImageRef: "galaxy/game:1.0.0",
|
||||
EngineEndpoint: "http://galaxy-game-game-test:8080",
|
||||
StatePath: "/var/lib/galaxy/games/game-test",
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &started,
|
||||
LastOpAt: started,
|
||||
CreatedAt: created,
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateRunningHappy(t *testing.T) {
|
||||
require.NoError(t, runningRecord().Validate())
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateStoppedHappy(t *testing.T) {
|
||||
record := runningRecord()
|
||||
stopped := record.StartedAt.Add(time.Minute)
|
||||
record.Status = StatusStopped
|
||||
record.StoppedAt = &stopped
|
||||
record.LastOpAt = stopped
|
||||
|
||||
require.NoError(t, record.Validate())
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateRemovedHappy(t *testing.T) {
|
||||
record := runningRecord()
|
||||
stopped := record.StartedAt.Add(time.Minute)
|
||||
removed := stopped.Add(time.Minute)
|
||||
record.Status = StatusRemoved
|
||||
record.StoppedAt = &stopped
|
||||
record.RemovedAt = &removed
|
||||
record.CurrentContainerID = ""
|
||||
record.LastOpAt = removed
|
||||
|
||||
require.NoError(t, record.Validate())
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateRejects(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*RuntimeRecord)
|
||||
}{
|
||||
{"empty game id", func(r *RuntimeRecord) { r.GameID = "" }},
|
||||
{"unknown status", func(r *RuntimeRecord) { r.Status = "exotic" }},
|
||||
{"empty engine endpoint", func(r *RuntimeRecord) { r.EngineEndpoint = "" }},
|
||||
{"empty state path", func(r *RuntimeRecord) { r.StatePath = "" }},
|
||||
{"empty docker network", func(r *RuntimeRecord) { r.DockerNetwork = "" }},
|
||||
{"zero last op at", func(r *RuntimeRecord) { r.LastOpAt = time.Time{} }},
|
||||
{"zero created at", func(r *RuntimeRecord) { r.CreatedAt = time.Time{} }},
|
||||
{"last op at before created at", func(r *RuntimeRecord) {
|
||||
r.LastOpAt = r.CreatedAt.Add(-time.Second)
|
||||
}},
|
||||
{"running without container id", func(r *RuntimeRecord) {
|
||||
r.CurrentContainerID = ""
|
||||
}},
|
||||
{"running without image ref", func(r *RuntimeRecord) {
|
||||
r.CurrentImageRef = ""
|
||||
}},
|
||||
{"running without started at", func(r *RuntimeRecord) {
|
||||
r.StartedAt = nil
|
||||
}},
|
||||
{"started at before created at", func(r *RuntimeRecord) {
|
||||
before := r.CreatedAt.Add(-time.Second)
|
||||
r.StartedAt = &before
|
||||
}},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
record := runningRecord()
|
||||
tt.mutate(&record)
|
||||
assert.Error(t, record.Validate())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateRejectsStoppedWithoutStoppedAt(t *testing.T) {
|
||||
record := runningRecord()
|
||||
record.Status = StatusStopped
|
||||
record.StoppedAt = nil
|
||||
|
||||
assert.Error(t, record.Validate())
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateRejectsStoppedBeforeStarted(t *testing.T) {
|
||||
record := runningRecord()
|
||||
stopped := record.StartedAt.Add(-time.Second)
|
||||
record.Status = StatusStopped
|
||||
record.StoppedAt = &stopped
|
||||
|
||||
assert.Error(t, record.Validate())
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateRejectsRemovedWithoutRemovedAt(t *testing.T) {
|
||||
record := runningRecord()
|
||||
record.Status = StatusRemoved
|
||||
record.RemovedAt = nil
|
||||
|
||||
assert.Error(t, record.Validate())
|
||||
}
|
||||
|
||||
func TestRuntimeRecordValidateRejectsRemovedBeforeCreated(t *testing.T) {
|
||||
record := runningRecord()
|
||||
before := record.CreatedAt.Add(-time.Second)
|
||||
record.Status = StatusRemoved
|
||||
record.RemovedAt = &before
|
||||
|
||||
assert.Error(t, record.Validate())
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
package runtime
|
||||
|
||||
// transitionKey stores one `(from, to)` pair in the allowed-transitions
|
||||
// table.
|
||||
type transitionKey struct {
|
||||
from Status
|
||||
to Status
|
||||
}
|
||||
|
||||
// allowedTransitions stores the set of permitted `(from, to)` status
|
||||
// pairs. The four pairs mirror the lifecycle flows frozen in
|
||||
// `galaxy/rtmanager/README.md §Lifecycles`:
|
||||
//
|
||||
// - running → stopped: graceful stop, observed Docker exit, or
|
||||
// reconcile observing an exited container.
|
||||
// - running → removed: reconcile_dispose when Docker no longer reports
|
||||
// the container at all.
|
||||
// - stopped → running: restart and patch inner start steps.
|
||||
// - stopped → removed: cleanup_container, both the periodic TTL worker
|
||||
// and the admin DELETE endpoint.
|
||||
var allowedTransitions = map[transitionKey]struct{}{
|
||||
{StatusRunning, StatusStopped}: {},
|
||||
{StatusRunning, StatusRemoved}: {},
|
||||
{StatusStopped, StatusRunning}: {},
|
||||
{StatusStopped, StatusRemoved}: {},
|
||||
}
|
||||
|
||||
// AllowedTransitions returns a copy of the `(from, to)` allowed
|
||||
// transitions table used by Transition. The returned map is safe to
|
||||
// mutate; callers should not rely on iteration order.
|
||||
func AllowedTransitions() map[Status][]Status {
|
||||
result := make(map[Status][]Status)
|
||||
for key := range allowedTransitions {
|
||||
result[key.from] = append(result[key.from], key.to)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// Transition reports whether from may transition to next. The function
|
||||
// returns nil when the pair is permitted, and an *InvalidTransitionError
|
||||
// wrapping ErrInvalidTransition otherwise. It does not touch any store
|
||||
// and is safe to call from any layer.
|
||||
func Transition(from Status, next Status) error {
|
||||
if !from.IsKnown() || !next.IsKnown() {
|
||||
return &InvalidTransitionError{From: from, To: next}
|
||||
}
|
||||
if _, ok := allowedTransitions[transitionKey{from: from, to: next}]; !ok {
|
||||
return &InvalidTransitionError{From: from, To: next}
|
||||
}
|
||||
return nil
|
||||
}
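
// Illustrative usage sketch (the record variable and the surrounding flow are
// assumptions for the example, not part of this package): a store adapter
// checks the pair with Transition before issuing its compare-and-swap update
// and can surface the typed error to report the offending pair.
//
//	if err := Transition(record.Status, StatusRemoved); err != nil {
//		var invalid *InvalidTransitionError
//		if errors.As(err, &invalid) {
//			// reject without touching the store; invalid.From and
//			// invalid.To carry the offending pair for the message
//		}
//		return err
//	}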
|
||||
@@ -0,0 +1,88 @@
|
||||
package runtime
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestTransitionAllowed(t *testing.T) {
|
||||
cases := []struct {
|
||||
from Status
|
||||
to Status
|
||||
}{
|
||||
{StatusRunning, StatusStopped},
|
||||
{StatusRunning, StatusRemoved},
|
||||
{StatusStopped, StatusRunning},
|
||||
{StatusStopped, StatusRemoved},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
assert.NoErrorf(t, Transition(tc.from, tc.to),
|
||||
"expected %q -> %q allowed", tc.from, tc.to)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTransitionRejected(t *testing.T) {
|
||||
cases := []struct {
|
||||
from Status
|
||||
to Status
|
||||
}{
|
||||
{StatusRemoved, StatusRunning},
|
||||
{StatusRemoved, StatusStopped},
|
||||
{StatusRemoved, StatusRemoved},
|
||||
{StatusRunning, StatusRunning},
|
||||
{StatusStopped, StatusStopped},
|
||||
{Status("unknown"), StatusRunning},
|
||||
{StatusRunning, Status("unknown")},
|
||||
{Status(""), Status("")},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
err := Transition(tc.from, tc.to)
|
||||
require.Errorf(t, err, "expected %q -> %q rejected", tc.from, tc.to)
|
||||
assert.ErrorIs(t, err, ErrInvalidTransition)
|
||||
|
||||
var transitionErr *InvalidTransitionError
|
||||
require.True(t, errors.As(err, &transitionErr),
|
||||
"expected *InvalidTransitionError for %q -> %q", tc.from, tc.to)
|
||||
assert.Equal(t, tc.from, transitionErr.From)
|
||||
assert.Equal(t, tc.to, transitionErr.To)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAllowedTransitionsReturnsCopy(t *testing.T) {
|
||||
first := AllowedTransitions()
|
||||
require.NotEmpty(t, first)
|
||||
|
||||
for from := range first {
|
||||
first[from] = nil
|
||||
}
|
||||
|
||||
second := AllowedTransitions()
|
||||
assert.NotEmpty(t, second[StatusRunning],
|
||||
"AllowedTransitions must return an independent map per call")
|
||||
}
|
||||
|
||||
func TestAllowedTransitionsCoversFourPairs(t *testing.T) {
|
||||
transitions := AllowedTransitions()
|
||||
|
||||
assert.ElementsMatch(t,
|
||||
[]Status{StatusStopped, StatusRemoved},
|
||||
transitions[StatusRunning],
|
||||
)
|
||||
assert.ElementsMatch(t,
|
||||
[]Status{StatusRunning, StatusRemoved},
|
||||
transitions[StatusStopped],
|
||||
)
|
||||
assert.Empty(t, transitions[StatusRemoved],
|
||||
"removed has no outgoing transitions")
|
||||
}
|
||||
|
||||
func TestInvalidTransitionErrorMessage(t *testing.T) {
|
||||
err := &InvalidTransitionError{From: StatusRunning, To: Status("bogus")}
|
||||
assert.Contains(t, err.Error(), "running")
|
||||
assert.Contains(t, err.Error(), "bogus")
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
package logging
|
||||
|
||||
import "context"
|
||||
|
||||
// requestIDKey is the unexported context key under which the HTTP layer
|
||||
// stores the request id propagated from the X-Request-Id header.
|
||||
type requestIDKey struct{}
|
||||
|
||||
// WithRequestID returns a child context that carries requestID. A nil ctx
|
||||
// or an empty requestID returns ctx unchanged so callers do not have to
// branch.
|
||||
func WithRequestID(ctx context.Context, requestID string) context.Context {
|
||||
if ctx == nil || requestID == "" {
|
||||
return ctx
|
||||
}
|
||||
return context.WithValue(ctx, requestIDKey{}, requestID)
|
||||
}
|
||||
|
||||
// RequestIDFromContext returns the request id stored on ctx by
|
||||
// WithRequestID, or an empty string when no value is present.
|
||||
func RequestIDFromContext(ctx context.Context) string {
|
||||
if ctx == nil {
|
||||
return ""
|
||||
}
|
||||
value, _ := ctx.Value(requestIDKey{}).(string)
|
||||
return value
|
||||
}
|
||||
|
||||
// ContextAttrs returns slog key-value pairs that materialise the frozen
|
||||
// `rtmanager/README.md` §Observability log fields `request_id`,
|
||||
// `trace_id`, and `span_id` from ctx. Pairs whose value is empty are
|
||||
// omitted so logs stay tight.
|
||||
func ContextAttrs(ctx context.Context) []any {
|
||||
if ctx == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var attrs []any
|
||||
if requestID := RequestIDFromContext(ctx); requestID != "" {
|
||||
attrs = append(attrs, "request_id", requestID)
|
||||
}
|
||||
attrs = append(attrs, TraceAttrsFromContext(ctx)...)
|
||||
return attrs
|
||||
}
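
// Illustrative usage sketch (the handler request r and the logger variable
// are assumptions for the example): the HTTP layer stores the inbound
// X-Request-Id once, and every downstream log call folds it back in through
// ContextAttrs.
//
//	ctx := WithRequestID(r.Context(), r.Header.Get("X-Request-Id"))
//	logger.InfoContext(ctx, "start requested", ContextAttrs(ctx)...)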
|
||||
@@ -0,0 +1,45 @@
|
||||
// Package logging configures the Runtime Manager process logger and
|
||||
// provides context-aware helpers for trace fields.
|
||||
package logging
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
)
|
||||
|
||||
// New constructs the process-wide JSON logger from level.
|
||||
func New(level string) (*slog.Logger, error) {
|
||||
var slogLevel slog.Level
|
||||
if err := slogLevel.UnmarshalText([]byte(strings.TrimSpace(level))); err != nil {
|
||||
return nil, fmt.Errorf("build logger: %w", err)
|
||||
}
|
||||
|
||||
return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: slogLevel,
|
||||
})), nil
|
||||
}
|
||||
|
||||
// TraceAttrsFromContext returns slog key-value pairs for the active
|
||||
// OpenTelemetry span when ctx carries a valid span context. The keys match
|
||||
// the frozen `rtmanager/README.md` §Observability log fields `trace_id`
|
||||
// and `span_id`.
|
||||
func TraceAttrsFromContext(ctx context.Context) []any {
|
||||
if ctx == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
spanContext := trace.SpanContextFromContext(ctx)
|
||||
if !spanContext.IsValid() {
|
||||
return nil
|
||||
}
|
||||
|
||||
return []any{
|
||||
"trace_id", spanContext.TraceID().String(),
|
||||
"span_id", spanContext.SpanID().String(),
|
||||
}
|
||||
}
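
// mustLoggerSketch is an illustrative wiring sketch (the helper and its
// fallback policy are assumptions for the example): build the JSON logger
// from the configured level string and fall back to "info" when the value
// does not parse. Production wiring may prefer to fail fast instead.
func mustLoggerSketch(level string) *slog.Logger {
	logger, err := New(level)
	if err != nil {
		// "info" always parses, so the second error can be ignored.
		logger, _ = New("info")
	}
	return logger
}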
|
||||
@@ -0,0 +1,336 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PullPolicy enumerates the supported image pull policies. The value
|
||||
// set mirrors `config.ImagePullPolicy`; the runtime/wiring layer
|
||||
// translates between the two so the docker adapter does not import
|
||||
// `internal/config` and the port package stays free of configuration
|
||||
// concerns.
|
||||
type PullPolicy string
|
||||
|
||||
// Supported pull policies, frozen by `rtmanager/README.md §Configuration`.
|
||||
const (
|
||||
// PullPolicyIfMissing pulls the image only when it is absent from
|
||||
// the local Docker daemon.
|
||||
PullPolicyIfMissing PullPolicy = "if_missing"
|
||||
|
||||
// PullPolicyAlways pulls the image on every start.
|
||||
PullPolicyAlways PullPolicy = "always"
|
||||
|
||||
// PullPolicyNever skips the pull and fails the start when the image
|
||||
// is absent.
|
||||
PullPolicyNever PullPolicy = "never"
|
||||
)
|
||||
|
||||
// IsKnown reports whether policy belongs to the frozen pull-policy
|
||||
// vocabulary.
|
||||
func (policy PullPolicy) IsKnown() bool {
|
||||
switch policy {
|
||||
case PullPolicyIfMissing, PullPolicyAlways, PullPolicyNever:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
//go:generate go run go.uber.org/mock/mockgen -destination=../adapters/docker/mocks/mock_dockerclient.go -package=mocks galaxy/rtmanager/internal/ports DockerClient
|
||||
|
||||
// DockerClient is the narrow Docker port Runtime Manager uses. The
|
||||
// production adapter wraps `github.com/docker/docker/client`; service
|
||||
// tests use a generated mock. The surface intentionally exposes only
|
||||
// the operations RTM needs; `docker logs` and stream attach are out
|
||||
// of scope for v1.
|
||||
type DockerClient interface {
|
||||
// EnsureNetwork verifies the configured Docker network is present
|
||||
// on the daemon. It returns ErrNetworkMissing when the network does
|
||||
// not exist; RTM never creates networks itself.
|
||||
EnsureNetwork(ctx context.Context, name string) error
|
||||
|
||||
// PullImage pulls ref according to policy. It returns nil on
|
||||
// success and a wrapped Docker error otherwise. Implementations
|
||||
// honour PullPolicyNever by skipping the pull and returning nil
|
||||
// when the image is already present, or returning ErrImageNotFound
|
||||
// otherwise.
|
||||
PullImage(ctx context.Context, ref string, policy PullPolicy) error
|
||||
|
||||
// InspectImage returns image metadata for ref. It returns
|
||||
// ErrImageNotFound when no such image exists locally.
|
||||
InspectImage(ctx context.Context, ref string) (ImageInspect, error)
|
||||
|
||||
// InspectContainer returns container metadata for containerID. It
|
||||
// returns ErrContainerNotFound when no such container exists.
|
||||
InspectContainer(ctx context.Context, containerID string) (ContainerInspect, error)
|
||||
|
||||
// Run creates and starts one container according to spec. The
|
||||
// returned RunResult carries the assigned container id, the stable
|
||||
// engine endpoint, and the wall-clock observed by the daemon.
|
||||
Run(ctx context.Context, spec RunSpec) (RunResult, error)
|
||||
|
||||
// Stop sends SIGTERM to the container followed by SIGKILL after
|
||||
// timeout. It returns nil when the container exited cleanly and
|
||||
// ErrContainerNotFound when it is already gone.
|
||||
Stop(ctx context.Context, containerID string, timeout time.Duration) error
|
||||
|
||||
// Remove removes the container. It returns nil when the container
|
||||
// no longer exists (idempotent removal).
|
||||
Remove(ctx context.Context, containerID string) error
|
||||
|
||||
// List returns container summaries that match filter. Implementations
|
||||
// translate ListFilter into the appropriate Docker filters argument.
|
||||
List(ctx context.Context, filter ListFilter) ([]ContainerSummary, error)
|
||||
|
||||
// EventsListen subscribes to the Docker events stream and returns
|
||||
// the decoded event channel together with an asynchronous error
|
||||
// channel. The caller cancels ctx to terminate the subscription.
|
||||
// Implementations close events when the subscription terminates.
|
||||
EventsListen(ctx context.Context) (events <-chan DockerEvent, errs <-chan error, err error)
|
||||
}
|
||||
|
||||
// RunSpec stores the request shape used by DockerClient.Run.
|
||||
type RunSpec struct {
|
||||
// Name stores the container name (typically `galaxy-game-{game_id}`).
|
||||
Name string
|
||||
|
||||
// Image stores the image reference resolved by the producer.
|
||||
Image string
|
||||
|
||||
// Hostname stores the container hostname assigned for the embedded
|
||||
// Docker DNS to resolve from other containers on the network.
|
||||
Hostname string
|
||||
|
||||
// Network stores the user-defined Docker network the container
|
||||
// attaches to.
|
||||
Network string
|
||||
|
||||
// Env stores the environment variables forwarded to the container
|
||||
// (e.g. GAME_STATE_PATH, STORAGE_PATH).
|
||||
Env map[string]string
|
||||
|
||||
// Cmd overrides the entrypoint arguments for the container. Production
|
||||
// callers leave it nil so the engine image's own CMD runs; tests use
|
||||
// it to drive a tiny container that does not embed RTM-specific
|
||||
// behaviour. Empty Cmd means "use image default", which mirrors the
|
||||
// Docker SDK contract.
|
||||
Cmd []string
|
||||
|
||||
// Labels stores the labels applied to the container so the
|
||||
// reconciler and the events listener can identify it.
|
||||
Labels map[string]string
|
||||
|
||||
// BindMounts stores the host-to-container bind mounts. RTM uses
|
||||
// exactly one mount in v1 (the per-game state directory).
|
||||
BindMounts []BindMount
|
||||
|
||||
// LogDriver stores the Docker logging driver name.
|
||||
LogDriver string
|
||||
|
||||
// LogOpts stores the logging-driver options as key=value pairs.
|
||||
LogOpts map[string]string
|
||||
|
||||
// CPUQuota stores the `--cpus` value applied as a resource limit.
|
||||
CPUQuota float64
|
||||
|
||||
// Memory stores the `--memory` value (e.g. `512m`) applied as a
|
||||
// resource limit.
|
||||
Memory string
|
||||
|
||||
// PIDsLimit stores the `--pids-limit` value.
|
||||
PIDsLimit int
|
||||
}
|
||||
|
||||
// BindMount stores one host-to-container bind mount.
|
||||
type BindMount struct {
|
||||
// HostPath stores the absolute host path bound into the container.
|
||||
HostPath string
|
||||
|
||||
// MountPath stores the absolute in-container path the host
|
||||
// directory is mounted at.
|
||||
MountPath string
|
||||
|
||||
// ReadOnly mounts the host path read-only when true.
|
||||
ReadOnly bool
|
||||
}
|
||||
|
||||
// RunResult stores the response shape returned by DockerClient.Run.
|
||||
type RunResult struct {
|
||||
// ContainerID identifies the created container.
|
||||
ContainerID string
|
||||
|
||||
// EngineEndpoint stores the stable URL Game Master uses to reach
|
||||
// the engine container.
|
||||
EngineEndpoint string
|
||||
|
||||
// StartedAt stores the wall-clock the daemon observed for the
|
||||
// start event.
|
||||
StartedAt time.Time
|
||||
}
|
||||
|
||||
// ImageInspect stores the subset of `docker image inspect` fields RTM
|
||||
// reads. Only Labels are required at start time (resource limits live
|
||||
// there); other fields may be populated when convenient for diagnostics.
|
||||
type ImageInspect struct {
|
||||
// Ref stores the image reference the inspection was scoped to.
|
||||
Ref string
|
||||
|
||||
// Labels stores the image-level labels (e.g.
|
||||
// `com.galaxy.cpu_quota`).
|
||||
Labels map[string]string
|
||||
}
|
||||
|
||||
// ContainerInspect stores the subset of `docker inspect` fields RTM
|
||||
// reads from a running or exited container.
|
||||
type ContainerInspect struct {
|
||||
// ID identifies the container.
|
||||
ID string
|
||||
|
||||
// ImageRef stores the image reference the container was started
|
||||
// from.
|
||||
ImageRef string
|
||||
|
||||
// Hostname stores the container hostname.
|
||||
Hostname string
|
||||
|
||||
// Labels stores the container labels assigned at create time.
|
||||
Labels map[string]string
|
||||
|
||||
// Status stores the verbatim Docker `State.Status` value (e.g.
|
||||
// `running`, `exited`).
|
||||
Status string
|
||||
|
||||
// Health stores the verbatim Docker `State.Health.Status` value
|
||||
// (e.g. `healthy`, `unhealthy`). Empty when the image declares no
|
||||
// HEALTHCHECK.
|
||||
Health string
|
||||
|
||||
// RestartCount stores the Docker `RestartCount` observed at
|
||||
// inspection time.
|
||||
RestartCount int
|
||||
|
||||
// StartedAt stores the daemon-observed start wall-clock.
|
||||
StartedAt time.Time
|
||||
|
||||
// FinishedAt stores the daemon-observed exit wall-clock. Zero when
|
||||
// the container is still running.
|
||||
FinishedAt time.Time
|
||||
|
||||
// ExitCode stores the exit code reported by the daemon. Zero when
|
||||
// the container is still running.
|
||||
ExitCode int
|
||||
|
||||
// OOMKilled reports whether the container was killed by the OOM
|
||||
// killer.
|
||||
OOMKilled bool
|
||||
}
|
||||
|
||||
// ContainerSummary stores the subset of `docker ps` fields RTM reads.
|
||||
type ContainerSummary struct {
|
||||
// ID identifies the container.
|
||||
ID string
|
||||
|
||||
// ImageRef stores the image reference.
|
||||
ImageRef string
|
||||
|
||||
// Hostname stores the container hostname.
|
||||
Hostname string
|
||||
|
||||
// Labels stores the container labels assigned at create time.
|
||||
Labels map[string]string
|
||||
|
||||
// Status stores the verbatim Docker `State.Status` value.
|
||||
Status string
|
||||
|
||||
// StartedAt stores the daemon-observed start wall-clock.
|
||||
StartedAt time.Time
|
||||
}
|
||||
|
||||
// ListFilter stores the criteria used by DockerClient.List.
|
||||
type ListFilter struct {
|
||||
// Labels stores label key=value pairs that must all be present on
|
||||
// the container. Empty matches every container.
|
||||
Labels map[string]string
|
||||
}
|
||||
|
||||
// DockerEvent stores one decoded entry from the Docker events stream.
|
||||
// RTM only consumes container-scoped events.
|
||||
type DockerEvent struct {
|
||||
// Action stores the Docker event action verbatim (e.g. `start`,
|
||||
// `die`, `oom`, `destroy`).
|
||||
Action string
|
||||
|
||||
// ContainerID identifies the container the event refers to.
|
||||
ContainerID string
|
||||
|
||||
// Labels stores the container labels carried by the event
|
||||
// attributes when present.
|
||||
Labels map[string]string
|
||||
|
||||
// ExitCode stores the exit code attribute when applicable (e.g.
|
||||
// `die` events). Zero when the action does not carry one.
|
||||
ExitCode int
|
||||
|
||||
// OccurredAt stores the daemon-observed event wall-clock.
|
||||
OccurredAt time.Time
|
||||
}
|
||||
|
||||
// String returns policy as its stored enum value. Convenient for use in
|
||||
// log fields and error messages.
|
||||
func (policy PullPolicy) String() string {
|
||||
return string(policy)
|
||||
}
|
||||
|
||||
// ErrNetworkMissing reports that the configured Docker network is not
|
||||
// present on the daemon.
|
||||
var ErrNetworkMissing = errors.New("docker network missing")
|
||||
|
||||
// ErrImageNotFound reports that an image reference does not resolve to
|
||||
// a local Docker image.
|
||||
var ErrImageNotFound = errors.New("docker image not found")
|
||||
|
||||
// ErrContainerNotFound reports that a container id does not resolve to
|
||||
// a Docker container.
|
||||
var ErrContainerNotFound = errors.New("docker container not found")
|
||||
|
||||
// Validate reports whether spec carries the structural invariants
|
||||
// required by DockerClient.Run. Adapters use it as the first defence
|
||||
// against malformed specs originating in service code.
|
||||
func (spec RunSpec) Validate() error {
|
||||
if spec.Name == "" {
|
||||
return fmt.Errorf("run spec: name must not be empty")
|
||||
}
|
||||
if spec.Image == "" {
|
||||
return fmt.Errorf("run spec: image must not be empty")
|
||||
}
|
||||
if spec.Hostname == "" {
|
||||
return fmt.Errorf("run spec: hostname must not be empty")
|
||||
}
|
||||
if spec.Network == "" {
|
||||
return fmt.Errorf("run spec: network must not be empty")
|
||||
}
|
||||
if spec.LogDriver == "" {
|
||||
return fmt.Errorf("run spec: log driver must not be empty")
|
||||
}
|
||||
if spec.CPUQuota <= 0 {
|
||||
return fmt.Errorf("run spec: cpu quota must be positive")
|
||||
}
|
||||
if spec.Memory == "" {
|
||||
return fmt.Errorf("run spec: memory must not be empty")
|
||||
}
|
||||
if spec.PIDsLimit <= 0 {
|
||||
return fmt.Errorf("run spec: pids limit must be positive")
|
||||
}
|
||||
for index, mount := range spec.BindMounts {
|
||||
if mount.HostPath == "" {
|
||||
return fmt.Errorf("run spec: bind mounts[%d]: host path must not be empty", index)
|
||||
}
|
||||
if mount.MountPath == "" {
|
||||
return fmt.Errorf("run spec: bind mounts[%d]: mount path must not be empty", index)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
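
// startContainerSketch is an illustrative sketch (the helper itself is an
// assumption for the example; only the port types come from this file) of the
// order a start path drives the DockerClient port in: validate the spec,
// ensure the network exists, pull per policy, then create and start.
func startContainerSketch(ctx context.Context, docker DockerClient, spec RunSpec, policy PullPolicy) (RunResult, error) {
	if err := spec.Validate(); err != nil {
		return RunResult{}, fmt.Errorf("validate run spec: %w", err)
	}
	if err := docker.EnsureNetwork(ctx, spec.Network); err != nil {
		return RunResult{}, fmt.Errorf("ensure network %q: %w", spec.Network, err)
	}
	if err := docker.PullImage(ctx, spec.Image, policy); err != nil {
		return RunResult{}, fmt.Errorf("pull image %q: %w", spec.Image, err)
	}
	return docker.Run(ctx, spec)
}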
|
||||
@@ -0,0 +1,38 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GameLeaseStore guards every lifecycle operation Runtime Manager runs
|
||||
// against one game. The lease serialises starts, stops, restarts, patches,
|
||||
// and cleanup operations on the same `game_id` across all entry points
|
||||
// (Lobby stream consumer, GM REST handler, Admin REST handler, periodic
|
||||
// workers) so concurrent operations cannot corrupt each other's
|
||||
// intermediate Docker / PostgreSQL state.
|
||||
//
|
||||
// The lease is a per-game key with a random token. Adapters use SETNX with
|
||||
// PX TTL on TryAcquire and a compare-and-delete on Release so a publisher
|
||||
// that lost the lease (TTL expiry, replica swap) cannot clear another
|
||||
// caller's claim.
|
||||
//
|
||||
// In v1 the lease is not renewed mid-operation; callers must keep the
|
||||
// total operation duration below the configured TTL
|
||||
// (`RTMANAGER_GAME_LEASE_TTL_SECONDS`, default 60s). Multi-GB image pulls
|
||||
// can exceed this in production and remain a known limitation; later
|
||||
// stages may introduce a renewal helper if it bites.
|
||||
type GameLeaseStore interface {
|
||||
// TryAcquire attempts to acquire the per-game lease for gameID owned
|
||||
// by token for ttl. It returns true when the lease was acquired and
|
||||
// false when another holder still owns it. A non-nil error reports
|
||||
// transport-level failures (Redis unreachable, network timeout) and
|
||||
// must not be confused with a missed lease.
|
||||
TryAcquire(ctx context.Context, gameID, token string, ttl time.Duration) (acquired bool, err error)
|
||||
|
||||
// Release removes the per-game lease for gameID only when token still
|
||||
// matches the stored owner value. Releasing a lease the caller no
|
||||
// longer owns is a silent no-op so a TTL-driven release race never
|
||||
// clears another caller's claim.
|
||||
Release(ctx context.Context, gameID, token string) error
|
||||
}
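
// withGameLeaseSketch is an illustrative sketch of wrapping one lifecycle
// operation in the per-game lease: try to acquire, bail out on a miss, and
// release with the same token when done. The helper and the onHeld callback
// are assumptions for the example.
func withGameLeaseSketch(ctx context.Context, leases GameLeaseStore, gameID, token string, ttl time.Duration, onHeld func(context.Context) error) (bool, error) {
	acquired, err := leases.TryAcquire(ctx, gameID, token, ttl)
	if err != nil || !acquired {
		// Transport failure, or another holder still owns the lease.
		return false, err
	}
	defer func() {
		// Best-effort release; a TTL expiry makes a missed release harmless.
		_ = leases.Release(ctx, gameID, token)
	}()
	return true, onHeld(ctx)
}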
|
||||
@@ -0,0 +1,81 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
)
|
||||
|
||||
// HealthEventPublisher emits one entry on the `runtime:health_events`
|
||||
// Redis Stream and updates `health_snapshots` with the latest observation
|
||||
// for the affected game. Adapters publish and snapshot in one call so
|
||||
// every emission durably advances both surfaces; partial publishes (event
|
||||
// without snapshot, or vice versa) are not allowed.
|
||||
//
|
||||
// The start service emits `container_started` through this port; the
|
||||
// periodic Docker inspect, the active probe, and the Docker events
|
||||
// listener publish the rest of the event types through the same port
|
||||
// without changing its surface.
|
||||
type HealthEventPublisher interface {
|
||||
// Publish records envelope on the configured `runtime:health_events`
|
||||
// stream and upserts the matching `health_snapshots` row. A non-nil
|
||||
// error reports a transport or storage failure; the caller treats it
|
||||
// as a degraded emission per `rtmanager/README.md §Notification
|
||||
// Contracts` (the underlying business state is the source of truth,
|
||||
// not the event stream).
|
||||
Publish(ctx context.Context, envelope HealthEventEnvelope) error
|
||||
}
|
||||
|
||||
// HealthEventEnvelope carries the payload published on
|
||||
// `runtime:health_events`. The fields mirror the AsyncAPI schema frozen
|
||||
// in `rtmanager/api/runtime-health-asyncapi.yaml`; adapters serialise
|
||||
// every field verbatim so consumers see the contracted shape.
|
||||
type HealthEventEnvelope struct {
|
||||
// GameID identifies the platform game the event refers to.
|
||||
GameID string
|
||||
|
||||
// ContainerID identifies the Docker container observed by the event
|
||||
// source. May differ from the record's current container id after a
|
||||
// restart race; consumers are expected to treat the value as the
|
||||
// observation's container, not the record's.
|
||||
ContainerID string
|
||||
|
||||
// EventType classifies the event per the frozen vocabulary in
|
||||
// `galaxy/rtmanager/internal/domain/health.EventType`.
|
||||
EventType health.EventType
|
||||
|
||||
// OccurredAt stores the wall-clock at which Runtime Manager observed
|
||||
// the event. Adapters convert it to UTC milliseconds for the wire
|
||||
// payload (`occurred_at_ms`).
|
||||
OccurredAt time.Time
|
||||
|
||||
// Details stores the event-type-specific JSON payload. Adapters
|
||||
// persist and stream it verbatim; nil and empty values are treated as
|
||||
// the canonical empty-object payload.
|
||||
Details json.RawMessage
|
||||
}
|
||||
|
||||
// Validate reports whether envelope satisfies the structural invariants
|
||||
// implied by the AsyncAPI schema.
|
||||
func (envelope HealthEventEnvelope) Validate() error {
|
||||
if strings.TrimSpace(envelope.GameID) == "" {
|
||||
return fmt.Errorf("health event envelope: game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(envelope.ContainerID) == "" {
|
||||
return fmt.Errorf("health event envelope: container id must not be empty")
|
||||
}
|
||||
if !envelope.EventType.IsKnown() {
|
||||
return fmt.Errorf("health event envelope: event type %q is unsupported", envelope.EventType)
|
||||
}
|
||||
if envelope.OccurredAt.IsZero() {
|
||||
return fmt.Errorf("health event envelope: occurred at must not be zero")
|
||||
}
|
||||
if len(envelope.Details) > 0 && !json.Valid(envelope.Details) {
|
||||
return fmt.Errorf("health event envelope: details must be valid JSON when non-empty")
|
||||
}
|
||||
return nil
|
||||
}
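
// publishObservationSketch is an illustrative sketch of the emit path (the
// helper is an assumption for the example; the event-type constant comes
// from the health package and is supplied by the caller): build the envelope
// from one observation, validate it locally, then hand it to the publisher.
func publishObservationSketch(ctx context.Context, publisher HealthEventPublisher, gameID, containerID string, eventType health.EventType, now time.Time) error {
	envelope := HealthEventEnvelope{
		GameID:      gameID,
		ContainerID: containerID,
		EventType:   eventType,
		OccurredAt:  now,
		// Canonical empty-object payload; event-specific detail goes here.
		Details: json.RawMessage(`{}`),
	}
	if err := envelope.Validate(); err != nil {
		return fmt.Errorf("health event envelope: %w", err)
	}
	return publisher.Publish(ctx, envelope)
}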
|
||||
@@ -0,0 +1,22 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
)
|
||||
|
||||
// HealthSnapshotStore stores the latest technical-health observation per
|
||||
// game. Adapters keep one row per game_id; later observations overwrite.
|
||||
type HealthSnapshotStore interface {
|
||||
// Upsert installs snapshot as the latest observation for
|
||||
// snapshot.GameID. Adapters validate snapshot through
|
||||
// health.HealthSnapshot.Validate before touching the store.
|
||||
Upsert(ctx context.Context, snapshot health.HealthSnapshot) error
|
||||
|
||||
// Get returns the latest snapshot for gameID. It returns
|
||||
// runtime.ErrNotFound (declared in
|
||||
// `galaxy/rtmanager/internal/domain/runtime`) when no snapshot has
|
||||
// been recorded yet.
|
||||
Get(ctx context.Context, gameID string) (health.HealthSnapshot, error)
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// JobResultPublisher emits one entry on the `runtime:job_results` Redis
|
||||
// Stream per finalised start or stop runtime job. Adapters serialise
|
||||
// every JobResult field verbatim so consumers (Game Lobby's
|
||||
// runtime-job-result worker today, future services tomorrow) see the
|
||||
// AsyncAPI shape frozen in `rtmanager/api/runtime-jobs-asyncapi.yaml`.
|
||||
//
|
||||
// The start-jobs and stop-jobs consumers publish through this port.
|
||||
// The synchronous REST handlers do not — REST callers receive the same
|
||||
// `Result` shape directly from the service layer.
|
||||
type JobResultPublisher interface {
|
||||
// Publish records result on the configured `runtime:job_results`
|
||||
// stream. A non-nil error reports a transport or serialisation
|
||||
// failure; the caller treats the failure as a degraded emission
|
||||
// (the operation_log already records the durable outcome).
|
||||
Publish(ctx context.Context, result JobResult) error
|
||||
}
|
||||
|
||||
// JobResult outcome values frozen by the
|
||||
// `RuntimeJobResultPayload.outcome` enum.
|
||||
const (
|
||||
// JobOutcomeSuccess marks a successful start or stop, including the
|
||||
// idempotent replay variant (`error_code=replay_no_op`).
|
||||
JobOutcomeSuccess = "success"
|
||||
|
||||
// JobOutcomeFailure marks a stable failure for which the payload
|
||||
// carries a non-empty `error_code`.
|
||||
JobOutcomeFailure = "failure"
|
||||
)
|
||||
|
||||
// JobResult carries the wire payload published on
|
||||
// `runtime:job_results`. The fields mirror the AsyncAPI schema frozen
|
||||
// in `rtmanager/api/runtime-jobs-asyncapi.yaml`; adapters serialise
|
||||
// every field verbatim so consumers see the contracted shape. Fields
|
||||
// that are required by the contract (every field on this struct) are
|
||||
// always present in the wire entry — even when their string value is
|
||||
// empty (allowed for `container_id` / `engine_endpoint` / `error_code`
|
||||
// / `error_message` on appropriate variants).
|
||||
type JobResult struct {
|
||||
// GameID identifies the platform game the job acted on. Required.
|
||||
GameID string
|
||||
|
||||
// Outcome reports the high-level outcome. Must be `success` or
|
||||
// `failure` (use the JobOutcome* constants).
|
||||
Outcome string
|
||||
|
||||
// ContainerID stores the Docker container id. Populated on
|
||||
// `success` for fresh starts and replays; empty on `failure` and
|
||||
// on `success/replay_no_op` for stop jobs that observed a removed
|
||||
// record.
|
||||
ContainerID string
|
||||
|
||||
// EngineEndpoint stores the stable engine URL
|
||||
// `http://galaxy-game-{game_id}:8080`. Populated alongside
|
||||
// ContainerID, empty in the same cases.
|
||||
EngineEndpoint string
|
||||
|
||||
// ErrorCode stores the stable error code from
|
||||
// `rtmanager/README.md §Error Model`. Empty for fresh successes,
|
||||
// `replay_no_op` for idempotent replays, one of the failure
|
||||
// codes otherwise.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail. Empty for
|
||||
// successes; populated alongside ErrorCode on failure.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Validate reports whether result satisfies the structural invariants
|
||||
// implied by the AsyncAPI schema: a non-empty game id and one of the
|
||||
// two known outcome values. The remaining fields are required to be
|
||||
// present on the wire but may be empty strings, so Validate does not
|
||||
// constrain them.
|
||||
func (result JobResult) Validate() error {
|
||||
if strings.TrimSpace(result.GameID) == "" {
|
||||
return fmt.Errorf("job result: game id must not be empty")
|
||||
}
|
||||
switch result.Outcome {
|
||||
case JobOutcomeSuccess, JobOutcomeFailure:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("job result: outcome %q is unsupported", result.Outcome)
|
||||
}
|
||||
}
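
// successJobResultSketch is an illustrative sketch (the helper is an
// assumption for the example) of the payload a start-jobs consumer would
// publish for a fresh successful start. A replay would carry the same
// Outcome with ErrorCode "replay_no_op"; a failure would leave ContainerID
// and EngineEndpoint empty and populate ErrorCode / ErrorMessage instead.
func successJobResultSketch(gameID, containerID, engineEndpoint string) JobResult {
	return JobResult{
		GameID:         gameID,
		Outcome:        JobOutcomeSuccess,
		ContainerID:    containerID,
		EngineEndpoint: engineEndpoint,
	}
}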
|
||||
@@ -0,0 +1,47 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
)
|
||||
|
||||
// LobbyInternalClient is the synchronous trusted-REST port Runtime
|
||||
// Manager uses to read ancillary game metadata from Game Lobby. Stage
|
||||
// 13 calls GetGame purely for diagnostic context; the start envelope
|
||||
// already carries the only required field (`image_ref`) so a
|
||||
// LobbyInternalClient failure must not abort the start operation.
|
||||
type LobbyInternalClient interface {
|
||||
// GetGame returns the Lobby game record for gameID. It returns
|
||||
// ErrLobbyGameNotFound when no record exists and ErrLobbyUnavailable
|
||||
// for transport / timeout / non-2xx responses.
|
||||
GetGame(ctx context.Context, gameID string) (LobbyGameRecord, error)
|
||||
}
|
||||
|
||||
// LobbyGameRecord stores the subset of the Lobby `GameRecord` schema
|
||||
// Runtime Manager uses. The shape is intentionally minimal: this fetch
|
||||
// is ancillary diagnostics and v1 has no required field. The struct
|
||||
// may be extended additively without breaking existing callers.
|
||||
type LobbyGameRecord struct {
|
||||
// GameID identifies the platform game.
|
||||
GameID string
|
||||
|
||||
// Status stores the verbatim Lobby status string (e.g. `starting`,
|
||||
// `running`, `paused`). Runtime Manager does not interpret it; it
|
||||
// is exposed for log enrichment and diagnostics only.
|
||||
Status string
|
||||
|
||||
// TargetEngineVersion stores the semver of the engine version Lobby
|
||||
// resolved into the start envelope's image_ref. Empty when Lobby
|
||||
// did not return one.
|
||||
TargetEngineVersion string
|
||||
}
|
||||
|
||||
// ErrLobbyGameNotFound reports that the Lobby internal API returned 404
|
||||
// for the requested game id.
|
||||
var ErrLobbyGameNotFound = errors.New("lobby game not found")
|
||||
|
||||
// ErrLobbyUnavailable reports that the Lobby internal API could not be
|
||||
// reached (transport error, timeout, non-2xx response). Callers must
|
||||
// treat the failure as recoverable: Runtime Manager continues the
|
||||
// operation when the call is purely diagnostic.
|
||||
var ErrLobbyUnavailable = errors.New("lobby internal api unavailable")
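
// lobbyDiagnosticsSketch is an illustrative sketch (the helper is an
// assumption for the example) of the best-effort fetch: both sentinel errors
// are swallowed because the Lobby lookup is diagnostics only and must never
// abort the start operation.
func lobbyDiagnosticsSketch(ctx context.Context, lobby LobbyInternalClient, gameID string) LobbyGameRecord {
	record, err := lobby.GetGame(ctx, gameID)
	switch {
	case err == nil:
		return record
	case errors.Is(err, ErrLobbyGameNotFound), errors.Is(err, ErrLobbyUnavailable):
		// Expected degradations: log-and-continue at the call site.
		return LobbyGameRecord{GameID: gameID}
	default:
		// Unexpected error: still diagnostics only, so never abort the start.
		return LobbyGameRecord{GameID: gameID}
	}
}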
|
||||
@@ -0,0 +1,25 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
)
|
||||
|
||||
// NotificationIntentPublisher is the producer port Runtime Manager uses
|
||||
// to publish admin-only notification intents to Notification Service.
|
||||
// The production adapter is a thin wrapper around
|
||||
// `notificationintent.Publisher`; the wrapper drops the entry id
|
||||
// returned by the underlying publisher because Runtime Manager does
|
||||
// not track per-intent ids in v1.
|
||||
//
|
||||
// A failed Publish call is a notification degradation per
|
||||
// `galaxy/rtmanager/README.md §Notification Contracts` and must not roll
|
||||
// back already committed business state. Callers log the error and
|
||||
// proceed.
|
||||
type NotificationIntentPublisher interface {
|
||||
// Publish normalises intent and appends it to the configured Redis
|
||||
// Stream. Validation failures and transport errors are returned
|
||||
// verbatim.
|
||||
Publish(ctx context.Context, intent notificationintent.Intent) error
|
||||
}
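
// Illustrative caller sketch (the notifier and logger variables are
// assumptions for the example): a failed Publish is logged and deliberately
// not returned, so already-committed business state is never rolled back
// over a notification degradation.
//
//	if err := notifier.Publish(ctx, intent); err != nil {
//		logger.WarnContext(ctx, "publish notification intent", "err", err.Error())
//	}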
|
||||
@@ -0,0 +1,23 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
)
|
||||
|
||||
// OperationLogStore stores append-only audit entries for every
|
||||
// lifecycle operation Runtime Manager performed against a game's
|
||||
// runtime. Adapters must persist entry verbatim and return the
|
||||
// generated bigserial id from Append.
|
||||
type OperationLogStore interface {
|
||||
// Append inserts entry into the operation log and returns the
|
||||
// generated bigserial id. Adapters validate entry through
|
||||
// operation.OperationEntry.Validate before touching the store.
|
||||
Append(ctx context.Context, entry operation.OperationEntry) (id int64, err error)
|
||||
|
||||
// ListByGame returns the most recent entries for gameID, ordered by
|
||||
// started_at descending and capped by limit. A non-positive limit
|
||||
// is rejected as invalid input by adapters.
|
||||
ListByGame(ctx context.Context, gameID string, limit int) ([]operation.OperationEntry, error)
|
||||
}
|
||||
@@ -0,0 +1,112 @@
|
||||
// Package ports defines the stable interfaces that connect Runtime
|
||||
// Manager use cases to external state and external services.
|
||||
package ports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
)
|
||||
|
||||
// RuntimeRecordStore stores runtime records and exposes the operations
|
||||
// used by the service layer (Stages 13+) and the workers (Stages 15-18).
|
||||
// Adapters must preserve domain semantics:
|
||||
//
|
||||
// - Get returns runtime.ErrNotFound when no record exists for gameID.
|
||||
// - Upsert installs a record verbatim; the caller is responsible for
|
||||
// domain validation through runtime.RuntimeRecord.Validate.
|
||||
// - UpdateStatus applies one transition through a compare-and-swap
|
||||
// guard on (status, current_container_id) and returns
|
||||
// runtime.ErrConflict on a stale CAS.
|
||||
// - List returns every record currently stored, regardless of status.
|
||||
// - ListByStatus returns every record currently indexed under status.
|
||||
type RuntimeRecordStore interface {
|
||||
// Get returns the record identified by gameID. It returns
|
||||
// runtime.ErrNotFound when no record exists.
|
||||
Get(ctx context.Context, gameID string) (runtime.RuntimeRecord, error)
|
||||
|
||||
// Upsert inserts record when no row exists for record.GameID and
|
||||
// otherwise overwrites every column verbatim. The start service uses
|
||||
// Upsert to install fresh records on start, the inner start of
|
||||
// restart and patch, and the reconcile_adopt path.
|
||||
Upsert(ctx context.Context, record runtime.RuntimeRecord) error
|
||||
|
||||
// UpdateStatus applies one status transition in a compare-and-swap
|
||||
// fashion. The adapter must first call runtime.Transition to reject
|
||||
// invalid pairs without touching the store, then verify that the
|
||||
// stored status equals input.ExpectedFrom, and (when
|
||||
// input.ExpectedContainerID is non-empty) that the stored
|
||||
// current_container_id equals it. The adapter derives stopped_at /
|
||||
// removed_at and updates last_op_at from input.Now per the
|
||||
// destination status.
|
||||
UpdateStatus(ctx context.Context, input UpdateStatusInput) error
|
||||
|
||||
// List returns every runtime record currently stored. Used by the
|
||||
// internal REST list endpoint; the v1 working set is bounded by the
|
||||
// games tracked by Lobby and is small enough to return in one
|
||||
// response (pagination is not supported). The order is
|
||||
// adapter-defined; callers may reorder as needed.
|
||||
List(ctx context.Context) ([]runtime.RuntimeRecord, error)
|
||||
|
||||
// ListByStatus returns every record currently indexed under status.
|
||||
// The order is adapter-defined; callers may reorder as needed.
|
||||
ListByStatus(ctx context.Context, status runtime.Status) ([]runtime.RuntimeRecord, error)
|
||||
}
|
||||
|
||||
// UpdateStatusInput stores the arguments required to apply one status
|
||||
// transition through a RuntimeRecordStore. The adapter is responsible
|
||||
// for translating the destination status into the matching column
|
||||
// updates (stopped_at / removed_at / current_container_id NULLing) and
|
||||
// for the CAS guard.
|
||||
type UpdateStatusInput struct {
|
||||
// GameID identifies the record to mutate.
|
||||
GameID string
|
||||
|
||||
// ExpectedFrom stores the status the caller believes the record
|
||||
// currently has. A mismatch results in runtime.ErrConflict.
|
||||
ExpectedFrom runtime.Status
|
||||
|
||||
// ExpectedContainerID is an optional CAS guard. When non-empty, the
|
||||
// adapter rejects the update with runtime.ErrConflict if the stored
|
||||
// current_container_id does not equal it. Used by stop / cleanup /
|
||||
// reconcile to protect against concurrent restart races. Empty
|
||||
// disables the container-id CAS while keeping the status CAS.
|
||||
ExpectedContainerID string
|
||||
|
||||
// To stores the destination status.
|
||||
To runtime.Status
|
||||
|
||||
// Now stores the wall-clock used to derive stopped_at / removed_at
|
||||
// and last_op_at depending on To.
|
||||
Now time.Time
|
||||
}
|
||||
|
||||
// Validate reports whether input contains a structurally valid status
|
||||
// transition request. Adapters call Validate before touching the store.
|
||||
func (input UpdateStatusInput) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("update runtime status: game id must not be empty")
|
||||
}
|
||||
if !input.ExpectedFrom.IsKnown() {
|
||||
return fmt.Errorf(
|
||||
"update runtime status: expected from status %q is unsupported",
|
||||
input.ExpectedFrom,
|
||||
)
|
||||
}
|
||||
if !input.To.IsKnown() {
|
||||
return fmt.Errorf(
|
||||
"update runtime status: to status %q is unsupported",
|
||||
input.To,
|
||||
)
|
||||
}
|
||||
if err := runtime.Transition(input.ExpectedFrom, input.To); err != nil {
|
||||
return fmt.Errorf("update runtime status: %w", err)
|
||||
}
|
||||
if input.Now.IsZero() {
|
||||
return fmt.Errorf("update runtime status: now must not be zero")
|
||||
}
|
||||
return nil
|
||||
}
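
// markStoppedSketch is an illustrative sketch (the helper is an assumption
// for the example) of the CAS transition a stop path issues: guard on the
// previously loaded status and container id so a concurrent restart cannot
// be overwritten, and let the adapter derive stopped_at / last_op_at from Now.
func markStoppedSketch(ctx context.Context, store RuntimeRecordStore, record runtime.RuntimeRecord, now time.Time) error {
	input := UpdateStatusInput{
		GameID:              record.GameID,
		ExpectedFrom:        record.Status,
		ExpectedContainerID: record.CurrentContainerID,
		To:                  runtime.StatusStopped,
		Now:                 now,
	}
	if err := input.Validate(); err != nil {
		return fmt.Errorf("mark stopped: %w", err)
	}
	return store.UpdateStatus(ctx, input)
}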
|
||||
@@ -0,0 +1,70 @@
|
||||
package ports
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func validUpdateStatusInput() UpdateStatusInput {
|
||||
return UpdateStatusInput{
|
||||
GameID: "game-test",
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
ExpectedContainerID: "container-1",
|
||||
To: runtime.StatusStopped,
|
||||
Now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateHappy(t *testing.T) {
|
||||
require.NoError(t, validUpdateStatusInput().Validate())
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateAcceptsEmptyContainerCAS(t *testing.T) {
|
||||
input := validUpdateStatusInput()
|
||||
input.ExpectedContainerID = ""
|
||||
|
||||
assert.NoError(t, input.Validate())
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateRejects(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*UpdateStatusInput)
|
||||
}{
|
||||
{"empty game id", func(i *UpdateStatusInput) { i.GameID = "" }},
|
||||
{"unknown expected from", func(i *UpdateStatusInput) {
|
||||
i.ExpectedFrom = "exotic"
|
||||
}},
|
||||
{"unknown to", func(i *UpdateStatusInput) {
|
||||
i.To = "exotic"
|
||||
}},
|
||||
{"zero now", func(i *UpdateStatusInput) {
|
||||
i.Now = time.Time{}
|
||||
}},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
input := validUpdateStatusInput()
|
||||
tt.mutate(&input)
|
||||
assert.Error(t, input.Validate())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStatusInputValidateRejectsForbiddenTransition(t *testing.T) {
|
||||
input := validUpdateStatusInput()
|
||||
input.ExpectedFrom = runtime.StatusRemoved
|
||||
input.To = runtime.StatusRunning
|
||||
|
||||
err := input.Validate()
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, runtime.ErrInvalidTransition),
|
||||
"want runtime.ErrInvalidTransition, got %v", err)
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package ports
|
||||
|
||||
import "context"
|
||||
|
||||
// StreamOffsetStore persists the last successfully processed Redis
|
||||
// Stream entry id per consumer label. Workers call Load on startup to
|
||||
// resume from the persisted offset and Save after every successful
|
||||
// message handling so the next iteration advances past the
|
||||
// just-processed entry. The label is the short logical identifier of
|
||||
// the consumer (e.g. `start_jobs`, `stop_jobs`), not the full stream
|
||||
// name; it stays stable when the underlying stream key is renamed.
|
||||
type StreamOffsetStore interface {
|
||||
// Load returns the last processed entry id for the consumer
|
||||
// labelled stream when one is stored. The boolean return reports
|
||||
// whether a value was present; implementations must not return an
|
||||
// error for a missing key.
|
||||
Load(ctx context.Context, stream string) (entryID string, found bool, err error)
|
||||
|
||||
// Save stores entryID as the new last processed offset for the
|
||||
// consumer labelled stream. Implementations overwrite any previous
|
||||
// value unconditionally.
|
||||
Save(ctx context.Context, stream, entryID string) error
|
||||
}
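
// resumeOffsetSketch is an illustrative sketch (the helper and the fallback
// parameter are assumptions for the example) of the worker startup path: load
// the persisted offset for the consumer label and fall back to the supplied
// default when nothing has been stored yet.
func resumeOffsetSketch(ctx context.Context, offsets StreamOffsetStore, label, fallback string) (string, error) {
	entryID, found, err := offsets.Load(ctx, label)
	if err != nil {
		return "", err
	}
	if !found {
		// First run for this consumer label.
		return fallback, nil
	}
	return entryID, nil
}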
|
||||
@@ -0,0 +1,442 @@
|
||||
// Package cleanupcontainer implements the `cleanup_container` lifecycle
|
||||
// operation owned by Runtime Manager. The service removes the Docker
|
||||
// container of an already-stopped runtime and transitions the record
|
||||
// to `removed`. It refuses to operate on a still-running runtime —
|
||||
// callers must stop first.
|
||||
//
|
||||
// Two callers exercise this surface: the administrative
|
||||
// `DELETE /api/v1/internal/runtimes/{game_id}/container` endpoint, and
|
||||
// the periodic container-cleanup worker that walks
|
||||
// `runtime_records.status='stopped'` rows older than
|
||||
// `RTMANAGER_CONTAINER_RETENTION_DAYS`. Both paths flow through Handle.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Cleanup`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`.
|
||||
package cleanupcontainer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
const leaseReleaseTimeout = 5 * time.Second
|
||||
|
||||
// Input stores the per-call arguments for one cleanup operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game whose container is removed.
|
||||
GameID string
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
// Required: every operation_log entry carries an op_source.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference (REST
|
||||
// request id, admin user id). Empty for the periodic auto-TTL
|
||||
// caller.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the updated runtime record on success and on
|
||||
// idempotent replay; zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure, or
|
||||
// `replay_no_op` on idempotent replay. Empty for fresh successes.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
OperationLogs ports.OperationLogStore
|
||||
Docker ports.DockerClient
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
Telemetry *telemetry.Runtime
|
||||
Logger *slog.Logger
|
||||
Clock func() time.Time
|
||||
NewToken func() string
|
||||
}
|
||||
|
||||
// Service executes the cleanup_container lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new cleanup container service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new cleanup container service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new cleanup container service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new cleanup container service: nil lease store")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new cleanup container service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new cleanup container service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.cleanupcontainer")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one cleanup operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — success, idempotent replay, or
|
||||
// any of the stable failure modes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("cleanup container: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("cleanup container: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInvalidRequest,
|
||||
errorMessage: err.Error(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
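
// Illustrative caller sketch (the surrounding identifiers are assumptions for
// the example): both the admin DELETE handler and the periodic TTL worker
// build the same Input and let Handle decide between success, the
// replay_no_op idempotent variant, and the stable failure codes carried in
// Result.
//
//	result, err := service.Handle(ctx, Input{
//		GameID:    gameID,
//		OpSource:  opSource,  // admin-API or periodic-worker constant from the operation package
//		SourceRef: requestID, // empty for the auto-TTL caller
//	})
//	if err != nil {
//		// non-business failure only (nil service or nil context)
//	}
//	// result.Outcome and result.ErrorCode carry the business outcome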
|
||||
|
||||
// runUnderLease executes the lease-protected cleanup steps.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
|
||||
switch existing.Status {
|
||||
case runtime.StatusRemoved:
|
||||
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
|
||||
case runtime.StatusRunning:
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: fmt.Sprintf("runtime for game %q is running; stop the runtime first", input.GameID),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
case runtime.StatusStopped:
|
||||
// proceed
|
||||
default:
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
|
||||
}), nil
|
||||
}
|
||||
|
||||
if existing.CurrentContainerID != "" {
|
||||
if err := service.docker.Remove(ctx, existing.CurrentContainerID); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
}
|
||||
|
||||
updateNow := service.clock().UTC()
|
||||
err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusStopped,
|
||||
ExpectedContainerID: existing.CurrentContainerID,
|
||||
To: runtime.StatusRemoved,
|
||||
Now: updateNow,
|
||||
})
|
||||
if errors.Is(err, runtime.ErrConflict) {
|
||||
// CAS race: another caller (reconciler dispose, concurrent admin)
|
||||
// already moved the record. The desired terminal state was
|
||||
// reached by another path.
|
||||
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
|
||||
}
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-cleanup", input.GameID),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindCleanupContainer,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: existing.CurrentImageRef,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource))
|
||||
|
||||
record := existing
|
||||
record.Status = runtime.StatusRemoved
|
||||
record.CurrentContainerID = ""
|
||||
removedAt := updateNow
|
||||
record.RemovedAt = &removedAt
|
||||
record.LastOpAt = updateNow
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", existing.CurrentContainerID,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime container cleaned up", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// recordReplayNoOp records the idempotent replay outcome and returns the
|
||||
// existing record unchanged.
|
||||
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindCleanupContainer,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: existing.CurrentImageRef,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: startruntime.ErrorCodeReplayNoOp,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeSuccess), string(input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", existing.CurrentContainerID,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime cleanup replay no-op", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: existing,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: startruntime.ErrorCodeReplayNoOp,
|
||||
}
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
containerID string
|
||||
imageRef string
|
||||
}
|
||||
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindCleanupContainer,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: fc.input.SourceRef,
|
||||
ImageRef: fc.imageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordCleanupOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime cleanup failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
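// Release on a fresh background context bounded by leaseReleaseTimeout
// so the lease is still freed when the caller's request context has
// already been cancelled; the warning below keeps the original ctx so
// log correlation attributes survive.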
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
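// Operation-log writes are best-effort: a failed append is logged at
// error level but never changes the outcome of the lifecycle operation
// that produced the entry.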
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func defaultTokenGenerator() func() string {
|
||||
return func() string {
|
||||
var buf [32]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
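// crypto/rand failing is effectively unrecoverable; fall back to a fixed
// sentinel token instead of aborting the lifecycle operation.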
|
||||
return "rtmanager-fallback-token"
|
||||
}
|
||||
return base64.RawURLEncoding.EncodeToString(buf[:])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,382 @@
|
||||
package cleanupcontainer_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/docker/mocks"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/cleanupcontainer"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// --- shared fake doubles ----------------------------------------------
|
||||
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
getErr error
|
||||
updateStatusErr error
|
||||
|
||||
updates []ports.UpdateStatusInput
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
|
||||
return errors.New("not used in cleanup tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
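// Mirrors the store's compare-and-swap contract: the transition is only
// applied when the stored status (and, when provided, the container id)
// matches the expected values.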
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.updates = append(s.updates, input)
|
||||
if s.updateStatusErr != nil {
|
||||
return s.updateStatusErr
|
||||
}
|
||||
record, ok := s.stored[input.GameID]
|
||||
if !ok {
|
||||
return runtime.ErrNotFound
|
||||
}
|
||||
if record.Status != input.ExpectedFrom {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
record.Status = input.To
|
||||
record.LastOpAt = input.Now
|
||||
if input.To == runtime.StatusRemoved {
|
||||
removedAt := input.Now
|
||||
record.RemovedAt = &removedAt
|
||||
record.CurrentContainerID = ""
|
||||
}
|
||||
s.stored[input.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in cleanup tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in cleanup tests")
|
||||
}
|
||||
|
||||
type fakeOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
|
||||
appendErr error
|
||||
appends []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.appendErr != nil {
|
||||
return 0, s.appendErr
|
||||
}
|
||||
s.appends = append(s.appends, entry)
|
||||
return int64(len(s.appends)), nil
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in cleanup tests")
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if len(s.appends) == 0 {
|
||||
return operation.OperationEntry{}, false
|
||||
}
|
||||
return s.appends[len(s.appends)-1], true
|
||||
}
|
||||
|
||||
type fakeLeases struct {
|
||||
mu sync.Mutex
|
||||
|
||||
acquired bool
|
||||
acquireErr error
|
||||
releaseErr error
|
||||
|
||||
acquires []string
|
||||
releases []string
|
||||
}
|
||||
|
||||
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.acquires = append(l.acquires, token)
|
||||
if l.acquireErr != nil {
|
||||
return false, l.acquireErr
|
||||
}
|
||||
return l.acquired, nil
|
||||
}
|
||||
|
||||
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.releases = append(l.releases, token)
|
||||
return l.releaseErr
|
||||
}
|
||||
|
||||
// --- harness ----------------------------------------------------------
|
||||
|
||||
type harness struct {
|
||||
records *fakeRuntimeRecords
|
||||
operationLogs *fakeOperationLogs
|
||||
docker *mocks.MockDockerClient
|
||||
leases *fakeLeases
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
now time.Time
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
return &harness{
|
||||
records: newFakeRuntimeRecords(),
|
||||
operationLogs: &fakeOperationLogs{},
|
||||
docker: mocks.NewMockDockerClient(ctrl),
|
||||
leases: &fakeLeases{acquired: true},
|
||||
telemetry: telemetryRuntime,
|
||||
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *harness) build(t *testing.T) *cleanupcontainer.Service {
|
||||
t.Helper()
|
||||
service, err := cleanupcontainer.NewService(cleanupcontainer.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "token-A" },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
|
||||
func basicInput() cleanupcontainer.Input {
|
||||
return cleanupcontainer.Input{
|
||||
GameID: "game-1",
|
||||
OpSource: operation.OpSourceAdminRest,
|
||||
SourceRef: "rest-cleanup-1",
|
||||
}
|
||||
}
|
||||
|
||||
func stoppedRecord(now time.Time) runtime.RuntimeRecord {
|
||||
startedAt := now.Add(-2 * time.Hour)
|
||||
stoppedAt := now.Add(-time.Hour)
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-1",
|
||||
Status: runtime.StatusStopped,
|
||||
CurrentContainerID: "ctr-old",
|
||||
CurrentImageRef: "registry.example.com/galaxy/game:1.4.7",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: "/var/lib/galaxy/games/game-1",
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
StoppedAt: &stoppedAt,
|
||||
LastOpAt: stoppedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
}
|
||||
|
||||
// --- happy path -----------------------------------------------------
|
||||
|
||||
func TestHandleCleanupHappyPath(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = stoppedRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
|
||||
assert.Empty(t, result.Record.CurrentContainerID)
|
||||
|
||||
require.Len(t, h.records.updates, 1)
|
||||
assert.Equal(t, runtime.StatusStopped, h.records.updates[0].ExpectedFrom)
|
||||
assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To)
|
||||
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OpKindCleanupContainer, last.OpKind)
|
||||
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
|
||||
assert.Empty(t, last.ErrorCode)
|
||||
}
|
||||
|
||||
// --- replay ---------------------------------------------------------
|
||||
|
||||
func TestHandleReplayNoOpForRemovedRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
removed := stoppedRecord(h.now)
|
||||
removed.Status = runtime.StatusRemoved
|
||||
removed.CurrentContainerID = ""
|
||||
removedAt := h.now.Add(-30 * time.Minute)
|
||||
removed.RemovedAt = &removedAt
|
||||
h.records.stored["game-1"] = removed
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
|
||||
assert.Empty(t, h.records.updates)
|
||||
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = stoppedRecord(h.now)
|
||||
h.records.updateStatusErr = runtime.ErrConflict
|
||||
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- failure paths --------------------------------------------------
|
||||
|
||||
func TestHandleConflictOnRunningRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
running := stoppedRecord(h.now)
|
||||
running.Status = runtime.StatusRunning
|
||||
startedAt := h.now.Add(-time.Hour)
|
||||
running.StartedAt = &startedAt
|
||||
running.StoppedAt = nil
|
||||
h.records.stored["game-1"] = running
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
assert.Contains(t, result.ErrorMessage, "stop the runtime first")
|
||||
}
|
||||
|
||||
func TestHandleNotFoundForMissingRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = stoppedRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
assert.Empty(t, h.records.updates, "no record mutation on docker remove failure")
|
||||
}
|
||||
|
||||
func TestHandleInternalErrorOnGenericUpdateError(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = stoppedRecord(h.now)
|
||||
h.records.updateStatusErr = errors.New("postgres down")
|
||||
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquired = false
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- input validation ----------------------------------------------
|
||||
|
||||
func TestHandleRejectsInvalidInput(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t)
|
||||
|
||||
cases := []cleanupcontainer.Input{
|
||||
{GameID: "", OpSource: operation.OpSourceAdminRest},
|
||||
{GameID: "g", OpSource: operation.OpSource("bogus")},
|
||||
}
|
||||
for _, input := range cases {
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
|
||||
}
|
||||
}
|
||||
|
||||
// --- constructor ---------------------------------------------------
|
||||
|
||||
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
deps := cleanupcontainer.Dependencies{
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
}
|
||||
_, err := cleanupcontainer.NewService(deps)
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
package patchruntime
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/distribution/reference"
|
||||
"golang.org/x/mod/semver"
|
||||
)
|
||||
|
||||
// errImageRefNoTag reports that an image reference does not declare a
|
||||
// tag. The patch service maps it to `image_ref_not_semver` because a
|
||||
// digest-only or tagless reference cannot carry a semver-comparable
|
||||
// version.
|
||||
var errImageRefNoTag = errors.New("image reference is missing a tag")
|
||||
|
||||
// extractSemverTag returns the canonical semver string ("v1.4.7") for
|
||||
// imageRef, ready to feed into golang.org/x/mod/semver. The leading "v"
|
||||
// is added when the underlying tag omits it.
|
||||
//
|
||||
// Errors returned by this function are pre-formatted for inclusion in
|
||||
// the patch service's `image_ref_not_semver` failure message.
|
||||
func extractSemverTag(imageRef string) (string, error) {
|
||||
parsed, err := reference.ParseNormalizedNamed(imageRef)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse image reference %q: %w", imageRef, err)
|
||||
}
|
||||
tagged, ok := parsed.(reference.NamedTagged)
|
||||
if !ok {
|
||||
return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef)
|
||||
}
|
||||
tag := strings.TrimSpace(tagged.Tag())
|
||||
if tag == "" {
|
||||
return "", fmt.Errorf("%w: %q", errImageRefNoTag, imageRef)
|
||||
}
|
||||
candidate := tag
|
||||
if !strings.HasPrefix(candidate, "v") {
|
||||
candidate = "v" + candidate
|
||||
}
|
||||
if !semver.IsValid(candidate) {
|
||||
return "", fmt.Errorf("tag %q on image reference %q is not a valid semver", tag, imageRef)
|
||||
}
|
||||
return candidate, nil
|
||||
}
|
||||
|
||||
// samePatchSeries reports whether two canonical semver strings (with
|
||||
// the leading "v") share their major and minor components. The third
|
||||
// component (patch) and any pre-release / build metadata are ignored.
|
||||
func samePatchSeries(currentSemver, newSemver string) bool {
|
||||
return semver.MajorMinor(currentSemver) == semver.MajorMinor(newSemver)
|
||||
}
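// exampleSemverPreCheck is an illustrative sketch (not wired into the
// service; the real pre-check lives in the patch service's Handle)
// showing how the two helpers above combine: 1.4.7 -> 1.4.8 passes,
// while 1.4.7 -> 1.5.0 or 2.0.0 is rejected as a major/minor bump.
func exampleSemverPreCheck(currentImageRef, newImageRef string) error {
// Both references must carry semver-comparable tags.
currentSemver, err := extractSemverTag(currentImageRef)
if err != nil {
return fmt.Errorf("current image_ref: %w", err)
}
newSemver, err := extractSemverTag(newImageRef)
if err != nil {
return fmt.Errorf("new image_ref: %w", err)
}
// Only the patch component may change.
if !samePatchSeries(currentSemver, newSemver) {
return fmt.Errorf("patch must keep major.minor; current=%s new=%s", currentSemver, newSemver)
}
return nil
}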
|
||||
@@ -0,0 +1,483 @@
|
||||
// Package patchruntime implements the `patch` lifecycle operation owned
|
||||
// by Runtime Manager. Patch is restart with a new `image_ref`: under
|
||||
// one outer per-game lease the service runs the stop service, removes
|
||||
// the container, and runs the start service with the new image. The
|
||||
// engine reads its state from the bind-mount on startup, so any data
|
||||
// written before the patch survives.
|
||||
//
|
||||
// The new and current image references must both parse as semver tags
|
||||
// and share their major and minor components. A new tag that bumps the
|
||||
// major or the minor surfaces as `semver_patch_only`; a tag that is
|
||||
// not parseable as semver surfaces as `image_ref_not_semver`. These
|
||||
// pre-checks run before any Docker work so a rejected patch never
|
||||
// disturbs the running runtime.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Patch`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`.
|
||||
package patchruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
const leaseReleaseTimeout = 5 * time.Second
|
||||
|
||||
// Input stores the per-call arguments for one patch operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game to patch.
|
||||
GameID string
|
||||
|
||||
// NewImageRef stores the new Docker reference the patch installs.
|
||||
// Must be a valid Docker reference whose tag parses as semver.
|
||||
NewImageRef string
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference. When
|
||||
// non-empty it is reused as the correlation id linking the outer
|
||||
// patch entry to the inner stop and start log entries.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate checks the structural invariants the service requires and
// returns an error describing the first violation. Image-reference
// shape and semver checks happen later inside Handle so that they run
// after the runtime record has been loaded.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(input.NewImageRef) == "" {
|
||||
return fmt.Errorf("new image ref must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the inner start on
|
||||
// success; zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
OperationLogs ports.OperationLogStore
|
||||
Docker ports.DockerClient
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
// StopService runs the inner stop step.
|
||||
StopService *stopruntime.Service
|
||||
// StartService runs the inner start step with the new image_ref.
|
||||
StartService *startruntime.Service
|
||||
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
Telemetry *telemetry.Runtime
|
||||
Logger *slog.Logger
|
||||
Clock func() time.Time
|
||||
NewToken func() string
|
||||
}
|
||||
|
||||
// Service executes the patch lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
stopService *stopruntime.Service
|
||||
startService *startruntime.Service
|
||||
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new patch runtime service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new patch runtime service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new patch runtime service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new patch runtime service: nil lease store")
|
||||
case deps.StopService == nil:
|
||||
return nil, errors.New("new patch runtime service: nil stop service")
|
||||
case deps.StartService == nil:
|
||||
return nil, errors.New("new patch runtime service: nil start service")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new patch runtime service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new patch runtime service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.patchruntime")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
stopService: deps.StopService,
|
||||
startService: deps.StartService,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one patch operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — success or any of the stable
|
||||
// failure codes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("patch runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("patch runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInvalidRequest,
|
||||
errorMessage: err.Error(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
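// exampleHandleCaller is an illustrative sketch (hypothetical, not part
// of the service wiring) of how a transport adapter consumes Handle:
// the Go error covers only non-business failures, while every business
// outcome, including stable codes such as
// startruntime.ErrorCodeSemverPatchOnly, travels on Result.
func exampleHandleCaller(ctx context.Context, service *Service, input Input) (Result, error) {
result, err := service.Handle(ctx, input)
if err != nil {
// Non-business failure (nil receiver / nil context): surface as-is.
return Result{}, err
}
if result.Outcome == operation.OutcomeFailure {
// Stable business failure: ErrorCode / ErrorMessage carry everything
// a REST or queue adapter needs to report.
return result, nil
}
// Success: Record holds the runtime record installed by the inner start.
return result, nil
}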
|
||||
|
||||
// runUnderLease executes the lease-protected patch sequence: load the
|
||||
// runtime record, validate semver compatibility, run inner stop,
|
||||
// remove the container, run inner start with the new image.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if existing.Status == runtime.StatusRemoved {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot patch", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if strings.TrimSpace(existing.CurrentImageRef) == "" {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q has no current image_ref", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
|
||||
currentSemver, err := extractSemverTag(existing.CurrentImageRef)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeImageRefNotSemver,
|
||||
errorMessage: fmt.Sprintf("current image_ref: %s", err.Error()),
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
newSemver, err := extractSemverTag(input.NewImageRef)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeImageRefNotSemver,
|
||||
errorMessage: fmt.Sprintf("new image_ref: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
if !samePatchSeries(currentSemver, newSemver) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeSemverPatchOnly,
|
||||
errorMessage: fmt.Sprintf(
|
||||
"patch must keep major.minor; current=%s new=%s",
|
||||
currentSemver, newSemver,
|
||||
),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
correlationRef := input.SourceRef
|
||||
if correlationRef == "" {
|
||||
correlationRef = service.newToken()
|
||||
}
|
||||
containerID := existing.CurrentContainerID
|
||||
|
||||
stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
|
||||
GameID: input.GameID,
|
||||
Reason: stopruntime.StopReasonAdminRequest,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
if stopResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: stopResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
|
||||
imageRef: input.NewImageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
|
||||
if containerID != "" {
|
||||
if err := service.docker.Remove(ctx, containerID); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
}
|
||||
|
||||
startResult, err := service.startService.Run(ctx, startruntime.Input{
|
||||
GameID: input.GameID,
|
||||
ImageRef: input.NewImageRef,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
if startResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindPatch,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
ImageRef: input.NewImageRef,
|
||||
ContainerID: startResult.Record.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeSuccess), "")
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"prev_image_ref", existing.CurrentImageRef,
|
||||
"new_image_ref", input.NewImageRef,
|
||||
"prev_container_id", containerID,
|
||||
"new_container_id", startResult.Record.CurrentContainerID,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime patched", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: startResult.Record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
imageRef string
|
||||
containerID string
|
||||
}
|
||||
|
||||
// recordFailure writes the outer failure operation_log entry and emits
// telemetry. When the failure came from an inner stop or start, those
// services have already recorded their own entries; this outer entry
// summarises the patch as a whole.
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindPatch,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: fc.input.SourceRef,
|
||||
ImageRef: fc.imageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"image_ref", fc.imageRef,
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime patch failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
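// Release on a fresh background context bounded by leaseReleaseTimeout
// so the lease is still freed when the caller's request context has
// already been cancelled; the warning below keeps the original ctx so
// log correlation attributes survive.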
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
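// Operation-log writes are best-effort: a failed append is logged at
// error level but never changes the outcome of the lifecycle operation
// that produced the entry.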
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func defaultTokenGenerator() func() string {
|
||||
return func() string {
|
||||
var buf [32]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
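// crypto/rand failing is effectively unrecoverable; fall back to a fixed
// sentinel token instead of aborting the lifecycle operation.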
|
||||
return "rtmanager-fallback-token"
|
||||
}
|
||||
return base64.RawURLEncoding.EncodeToString(buf[:])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,597 @@
|
||||
package patchruntime_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/adapters/docker/mocks"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/patchruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// --- shared fake doubles (mirror the restartruntime test pattern) ---
|
||||
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
getErr error
|
||||
upsertErr error
|
||||
updateStatusErr error
|
||||
|
||||
upserts []runtime.RuntimeRecord
|
||||
updates []ports.UpdateStatusInput
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.upsertErr != nil {
|
||||
return s.upsertErr
|
||||
}
|
||||
s.upserts = append(s.upserts, record)
|
||||
s.stored[record.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
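// Mirrors the store's compare-and-swap contract: the transition is only
// applied when the stored status (and, when provided, the container id)
// matches the expected values.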
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.updates = append(s.updates, input)
|
||||
if s.updateStatusErr != nil {
|
||||
return s.updateStatusErr
|
||||
}
|
||||
record, ok := s.stored[input.GameID]
|
||||
if !ok {
|
||||
return runtime.ErrNotFound
|
||||
}
|
||||
if record.Status != input.ExpectedFrom {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
record.Status = input.To
|
||||
record.LastOpAt = input.Now
|
||||
switch input.To {
|
||||
case runtime.StatusStopped:
|
||||
stoppedAt := input.Now
|
||||
record.StoppedAt = &stoppedAt
|
||||
case runtime.StatusRemoved:
|
||||
removedAt := input.Now
|
||||
record.RemovedAt = &removedAt
|
||||
record.CurrentContainerID = ""
|
||||
}
|
||||
s.stored[input.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in patch tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in patch tests")
|
||||
}
|
||||
|
||||
type fakeOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
|
||||
appendErr error
|
||||
appends []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.appendErr != nil {
|
||||
return 0, s.appendErr
|
||||
}
|
||||
s.appends = append(s.appends, entry)
|
||||
return int64(len(s.appends)), nil
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in patch tests")
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
out := []operation.OperationEntry{}
|
||||
for _, entry := range s.appends {
|
||||
if entry.OpKind == kind {
|
||||
out = append(out, entry)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
type fakeLeases struct {
|
||||
mu sync.Mutex
|
||||
|
||||
acquired bool
|
||||
acquireErr error
|
||||
releaseErr error
|
||||
|
||||
acquires []string
|
||||
releases []string
|
||||
}
|
||||
|
||||
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.acquires = append(l.acquires, token)
|
||||
if l.acquireErr != nil {
|
||||
return false, l.acquireErr
|
||||
}
|
||||
return l.acquired, nil
|
||||
}
|
||||
|
||||
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.releases = append(l.releases, token)
|
||||
return l.releaseErr
|
||||
}
|
||||
|
||||
type fakeHealthEvents struct {
|
||||
mu sync.Mutex
|
||||
envelopes []ports.HealthEventEnvelope
|
||||
}
|
||||
|
||||
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
h.envelopes = append(h.envelopes, envelope)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeNotifications struct {
|
||||
mu sync.Mutex
|
||||
intents []notificationintent.Intent
|
||||
}
|
||||
|
||||
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
n.intents = append(n.intents, intent)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeLobby struct{}
|
||||
|
||||
func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
|
||||
return ports.LobbyGameRecord{}, nil
|
||||
}
|
||||
|
||||
// --- harness ---------------------------------------------------------
|
||||
|
||||
type harness struct {
|
||||
records *fakeRuntimeRecords
|
||||
operationLogs *fakeOperationLogs
|
||||
docker *mocks.MockDockerClient
|
||||
leases *fakeLeases
|
||||
healthEvents *fakeHealthEvents
|
||||
notifications *fakeNotifications
|
||||
lobby *fakeLobby
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
now time.Time
|
||||
stateDir string
|
||||
|
||||
startService *startruntime.Service
|
||||
stopService *stopruntime.Service
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
h := &harness{
|
||||
records: newFakeRuntimeRecords(),
|
||||
operationLogs: &fakeOperationLogs{},
|
||||
docker: mocks.NewMockDockerClient(ctrl),
|
||||
leases: &fakeLeases{acquired: true},
|
||||
healthEvents: &fakeHealthEvents{},
|
||||
notifications: &fakeNotifications{},
|
||||
lobby: &fakeLobby{},
|
||||
telemetry: telemetryRuntime,
|
||||
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
stateDir: "/var/lib/galaxy/games/game-1",
|
||||
}
|
||||
|
||||
containerCfg := config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
}
|
||||
dockerCfg := config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
}
|
||||
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
|
||||
|
||||
startService, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Notifications: h.notifications,
|
||||
Lobby: h.lobby,
|
||||
Container: containerCfg,
|
||||
DockerCfg: dockerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "inner-start-token" },
|
||||
PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
h.startService = startService
|
||||
|
||||
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Container: containerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "inner-stop-token" },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
h.stopService = stopService
|
||||
|
||||
return h
|
||||
}
|
||||
|
||||
func (h *harness) build(t *testing.T, tokens ...string) *patchruntime.Service {
|
||||
t.Helper()
|
||||
tokenIdx := 0
|
||||
tokenGen := func() string {
|
||||
if tokenIdx >= len(tokens) {
|
||||
return "outer-fallback"
|
||||
}
|
||||
t := tokens[tokenIdx]
|
||||
tokenIdx++
|
||||
return t
|
||||
}
|
||||
service, err := patchruntime.NewService(patchruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
StopService: h.stopService,
|
||||
StartService: h.startService,
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: tokenGen,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
|
||||
const (
|
||||
currentImage = "registry.example.com/galaxy/game:1.4.7"
|
||||
patchImage = "registry.example.com/galaxy/game:1.4.8"
|
||||
majorBump = "registry.example.com/galaxy/game:2.0.0"
|
||||
tagless = "registry.example.com/galaxy/game"
|
||||
notSemver = "registry.example.com/galaxy/game:latest"
|
||||
)
|
||||
|
||||
func runningRecord(now time.Time) runtime.RuntimeRecord {
|
||||
startedAt := now.Add(-time.Hour)
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-1",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-old",
|
||||
CurrentImageRef: currentImage,
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: "/var/lib/galaxy/games/game-1",
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func basicInput() patchruntime.Input {
|
||||
return patchruntime.Input{
|
||||
GameID: "game-1",
|
||||
NewImageRef: patchImage,
|
||||
OpSource: operation.OpSourceGMRest,
|
||||
SourceRef: "rest-req-99",
|
||||
}
|
||||
}
|
||||
|
||||
func sampleRunResult(now time.Time) ports.RunResult {
|
||||
return ports.RunResult{
|
||||
ContainerID: "ctr-new",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StartedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
func expectInnerStart(h *harness, image string) {
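// Registers the Docker calls the inner start service makes on a
// successful start: ensure the network, pull and inspect the image,
// then run the container.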
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), image, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), image).Return(ports.ImageInspect{Ref: image}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
}
|
||||
|
||||
// --- happy path -----------------------------------------------------
|
||||
|
||||
func TestHandlePatchHappyPath(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h, patchImage)
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, patchImage, result.Record.CurrentImageRef)
|
||||
|
||||
patches := h.operationLogs.byKind(operation.OpKindPatch)
|
||||
require.Len(t, patches, 1)
|
||||
assert.Equal(t, "rest-req-99", patches[0].SourceRef)
|
||||
assert.Equal(t, patchImage, patches[0].ImageRef)
|
||||
assert.Equal(t, "ctr-new", patches[0].ContainerID)
|
||||
|
||||
assert.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
|
||||
assert.Len(t, h.operationLogs.byKind(operation.OpKindStart), 1)
|
||||
}
|
||||
|
||||
func TestHandlePatchSameImageProceedsAsRecreate(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h, currentImage)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = currentImage
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindPatch), 1, "patch entry recorded even when image is unchanged")
|
||||
}
|
||||
|
||||
// --- semver pre-checks ---------------------------------------------
|
||||
|
||||
func TestHandleImageRefNotSemverWhenNewIsTagless(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = tagless
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
|
||||
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop), "no inner stop on pre-check failure")
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
|
||||
}
|
||||
|
||||
func TestHandleImageRefNotSemverWhenNewIsNonSemver(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = notSemver
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleImageRefNotSemverWhenCurrentIsTagless(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
record := runningRecord(h.now)
|
||||
record.CurrentImageRef = tagless
|
||||
h.records.stored["game-1"] = record
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeImageRefNotSemver, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleSemverPatchOnlyOnMajorBump(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = majorBump
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode)
|
||||
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop))
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
|
||||
}
|
||||
|
||||
func TestHandleSemverPatchOnlyOnMinorBump(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
input := basicInput()
|
||||
input.NewImageRef = "registry.example.com/galaxy/game:1.5.0"
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeSemverPatchOnly, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- record state checks -------------------------------------------
|
||||
|
||||
func TestHandleNotFoundForMissingRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleConflictForRemovedRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
removed := runningRecord(h.now)
|
||||
removed.Status = runtime.StatusRemoved
|
||||
removed.CurrentContainerID = ""
|
||||
removedAt := h.now.Add(-time.Hour)
|
||||
removed.RemovedAt = &removedAt
|
||||
h.records.stored["game-1"] = removed
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- failures from inner ops ---------------------------------------
|
||||
|
||||
func TestHandlePropagatesInnerStopFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandlePropagatesInnerStartFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), patchImage, gomock.Any()).Return(errors.New("manifest unknown"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- conflicts ------------------------------------------------------
|
||||
|
||||
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquired = false
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- input validation ----------------------------------------------
|
||||
|
||||
func TestHandleRejectsInvalidInput(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
cases := []patchruntime.Input{
|
||||
{GameID: "", NewImageRef: patchImage, OpSource: operation.OpSourceGMRest},
|
||||
{GameID: "g", NewImageRef: "", OpSource: operation.OpSourceGMRest},
|
||||
{GameID: "g", NewImageRef: patchImage, OpSource: operation.OpSource("bogus")},
|
||||
}
|
||||
for _, input := range cases {
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
|
||||
}
|
||||
}
|
||||
|
||||
// --- constructor ---------------------------------------------------
|
||||
|
||||
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
deps := patchruntime.Dependencies{
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
}
|
||||
_, err := patchruntime.NewService(deps)
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,482 @@
|
||||
// Package restartruntime implements the `restart` lifecycle operation
|
||||
// owned by Runtime Manager. Restart is a recreate: under one outer
|
||||
// per-game lease the service runs the stop service, removes the
|
||||
// container with `docker rm`, and runs the start service with the
|
||||
// runtime's current `image_ref`. The hostname / engine endpoint stays
|
||||
// stable across the recreate; `container_id` changes.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Restart`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`, in particular the lease-sharing
|
||||
// pattern with `startruntime.Service.Run` / `stopruntime.Service.Run`,
|
||||
// the correlation-id reuse on `source_ref`, and the
|
||||
// inner-stop-then-rm-failure recovery rule.
|
||||
package restartruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
// leaseReleaseTimeout bounds the deferred lease-release call.
|
||||
const leaseReleaseTimeout = 5 * time.Second
|
||||
|
||||
// Input stores the per-call arguments for one restart operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game to restart.
|
||||
GameID string
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
// Required: every operation_log entry carries an op_source.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference (REST
|
||||
// request id, admin user id). When non-empty it is reused as the
|
||||
// correlation id linking the outer restart entry to the inner stop
|
||||
// and start log entries.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the inner start on
|
||||
// success; zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure. Empty for
|
||||
// success.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty for success.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
// RuntimeRecords reads the runtime record at the start of restart
|
||||
// to capture the current image_ref and container_id.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// OperationLogs records the outer restart audit entry. Inner stop
|
||||
// and start services append their own entries through their own
|
||||
// stores.
|
||||
OperationLogs ports.OperationLogStore
|
||||
|
||||
// Docker drives the docker rm step between the inner stop and
|
||||
// inner start.
|
||||
Docker ports.DockerClient
|
||||
|
||||
// Leases serialises operations against the same game id. The outer
|
||||
// lease is held for the entire stop + rm + start sequence.
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
// StopService runs the inner stop step under the outer lease.
|
||||
StopService *stopruntime.Service
|
||||
|
||||
// StartService runs the inner start step under the outer lease.
|
||||
StartService *startruntime.Service
|
||||
|
||||
// Coordination supplies the per-game lease TTL.
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
// Telemetry records restart outcomes and lease latency. Required.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Logger records structured service-level events. Defaults to
|
||||
// `slog.Default()` when nil.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Clock supplies the wall-clock used for operation timestamps.
|
||||
// Defaults to `time.Now` when nil.
|
||||
Clock func() time.Time
|
||||
|
||||
// NewToken supplies a unique opaque token. Used both for the lease
|
||||
// and for the correlation id when Input.SourceRef is empty.
|
||||
// Defaults to a 32-byte random base64url string when nil.
|
||||
NewToken func() string
|
||||
}
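// An illustrative construction of the service from these dependencies,
// mirroring the wiring the tests in this commit use; every variable name
// below is a placeholder, not an identifier defined by this package:
//
//    svc, err := NewService(Dependencies{
//        RuntimeRecords: records,
//        OperationLogs:  opLogs,
//        Docker:         dockerClient,
//        Leases:         leases,
//        StopService:    stopSvc,
//        StartService:   startSvc,
//        Coordination:   config.CoordinationConfig{GameLeaseTTL: time.Minute},
//        Telemetry:      tel,
//    })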
|
||||
|
||||
// Service executes the restart lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
stopService *stopruntime.Service
|
||||
startService *startruntime.Service
|
||||
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new restart runtime service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new restart runtime service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new restart runtime service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new restart runtime service: nil lease store")
|
||||
case deps.StopService == nil:
|
||||
return nil, errors.New("new restart runtime service: nil stop service")
|
||||
case deps.StartService == nil:
|
||||
return nil, errors.New("new restart runtime service: nil start service")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new restart runtime service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new restart runtime service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.restartruntime")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
stopService: deps.StopService,
|
||||
startService: deps.StartService,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one restart operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — success or any of the stable
|
||||
// failure codes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("restart runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("restart runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInvalidRequest,
|
||||
errorMessage: err.Error(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
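// Callers therefore branch on Result rather than on the returned error.
// A minimal, illustrative caller follows; the handler variables and the
// response helpers are assumptions, not identifiers from this package:
//
//    result, err := restartSvc.Handle(ctx, Input{
//        GameID:   gameID,
//        OpSource: operation.OpSourceGMRest,
//    })
//    if err != nil {
//        return err // wiring bug (nil service / nil context), not a business failure
//    }
//    if result.Outcome == operation.OutcomeFailure {
//        return respondError(result.ErrorCode, result.ErrorMessage) // hypothetical helper
//    }
//    return respondRecord(result.Record) // hypothetical helper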
|
||||
|
||||
// runUnderLease executes the lease-protected restart sequence: it loads
|
||||
// the runtime record, runs the inner stop, removes the container, then
|
||||
// runs the inner start.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if existing.Status == runtime.StatusRemoved {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot restart", input.GameID),
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
if strings.TrimSpace(existing.CurrentImageRef) == "" {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q has no image_ref to restart with", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
|
||||
correlationRef := input.SourceRef
|
||||
if correlationRef == "" {
|
||||
correlationRef = service.newToken()
|
||||
}
|
||||
containerID := existing.CurrentContainerID
|
||||
imageRef := existing.CurrentImageRef
|
||||
|
||||
stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
|
||||
GameID: input.GameID,
|
||||
Reason: stopruntime.StopReasonAdminRequest,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
|
||||
imageRef: imageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
if stopResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: stopResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
|
||||
imageRef: imageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
|
||||
if containerID != "" {
|
||||
if err := service.docker.Remove(ctx, containerID); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
|
||||
imageRef: imageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
}
|
||||
|
||||
startResult, err := service.startService.Run(ctx, startruntime.Input{
|
||||
GameID: input.GameID,
|
||||
ImageRef: imageRef,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
|
||||
imageRef: imageRef,
|
||||
}), nil
|
||||
}
|
||||
if startResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
|
||||
imageRef: imageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindRestart,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
ImageRef: imageRef,
|
||||
ContainerID: startResult.Record.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeSuccess), "")
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"prev_container_id", containerID,
|
||||
"new_container_id", startResult.Record.CurrentContainerID,
|
||||
"image_ref", imageRef,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime restarted", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: startResult.Record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
imageRef string
|
||||
containerID string
|
||||
}
|
||||
|
||||
// recordFailure records the outer failure operation_log entry and emits
|
||||
// telemetry. When the inner stop / start services ran, they already
|
||||
// recorded their own entries; this entry is only the outer summary.
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindRestart,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: correlationRefOrEmpty(fc.input),
|
||||
ImageRef: fc.imageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordRestartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"image_ref", fc.imageRef,
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime restart failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// correlationRefOrEmpty returns the original Input.SourceRef for the
|
||||
// outer entry. Outer-failure paths that did not yet generate a
|
||||
// correlation id (input validation, lease busy) keep the original
|
||||
// `source_ref`, which is the caller-supplied actor reference.
|
||||
func correlationRefOrEmpty(input Input) string {
|
||||
return input.SourceRef
|
||||
}
|
||||
|
||||
// releaseLease releases the per-game lease in a fresh background context
// so the release still runs when the request context is already canceled.
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortAppend writes one outer operation_log entry. Inner ops have
|
||||
// already appended their own; a failure here only loses the outer
|
||||
// summary, which is acceptable.
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// defaultTokenGenerator returns a function that produces base64url-encoded
|
||||
// tokens derived from 32 random bytes.
|
||||
func defaultTokenGenerator() func() string {
|
||||
return func() string {
|
||||
var buf [32]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
return "rtmanager-fallback-token"
|
||||
}
|
||||
return base64.RawURLEncoding.EncodeToString(buf[:])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,584 @@
|
||||
package restartruntime_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/adapters/docker/mocks"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/restartruntime"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// --- shared fake doubles ----------------------------------------------
|
||||
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
getErr error
|
||||
upsertErr error
|
||||
updateStatusErr error
|
||||
|
||||
upserts []runtime.RuntimeRecord
|
||||
updates []ports.UpdateStatusInput
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.upsertErr != nil {
|
||||
return s.upsertErr
|
||||
}
|
||||
s.upserts = append(s.upserts, record)
|
||||
s.stored[record.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.updates = append(s.updates, input)
|
||||
if s.updateStatusErr != nil {
|
||||
return s.updateStatusErr
|
||||
}
|
||||
record, ok := s.stored[input.GameID]
|
||||
if !ok {
|
||||
return runtime.ErrNotFound
|
||||
}
|
||||
if record.Status != input.ExpectedFrom {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
record.Status = input.To
|
||||
record.LastOpAt = input.Now
|
||||
switch input.To {
|
||||
case runtime.StatusStopped:
|
||||
stoppedAt := input.Now
|
||||
record.StoppedAt = &stoppedAt
|
||||
case runtime.StatusRemoved:
|
||||
removedAt := input.Now
|
||||
record.RemovedAt = &removedAt
|
||||
record.CurrentContainerID = ""
|
||||
}
|
||||
s.stored[input.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in restart tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in restart tests")
|
||||
}
|
||||
|
||||
type fakeOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
|
||||
appendErr error
|
||||
appends []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.appendErr != nil {
|
||||
return 0, s.appendErr
|
||||
}
|
||||
s.appends = append(s.appends, entry)
|
||||
return int64(len(s.appends)), nil
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in restart tests")
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) byKind(kind operation.OpKind) []operation.OperationEntry {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
out := []operation.OperationEntry{}
|
||||
for _, entry := range s.appends {
|
||||
if entry.OpKind == kind {
|
||||
out = append(out, entry)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
type fakeLeases struct {
|
||||
mu sync.Mutex
|
||||
|
||||
acquired bool
|
||||
acquireErr error
|
||||
releaseErr error
|
||||
|
||||
acquires []string
|
||||
releases []string
|
||||
}
|
||||
|
||||
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.acquires = append(l.acquires, token)
|
||||
if l.acquireErr != nil {
|
||||
return false, l.acquireErr
|
||||
}
|
||||
return l.acquired, nil
|
||||
}
|
||||
|
||||
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.releases = append(l.releases, token)
|
||||
return l.releaseErr
|
||||
}
|
||||
|
||||
type fakeHealthEvents struct {
|
||||
mu sync.Mutex
|
||||
|
||||
publishErr error
|
||||
envelopes []ports.HealthEventEnvelope
|
||||
}
|
||||
|
||||
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
if h.publishErr != nil {
|
||||
return h.publishErr
|
||||
}
|
||||
h.envelopes = append(h.envelopes, envelope)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeNotifications struct {
|
||||
mu sync.Mutex
|
||||
|
||||
publishErr error
|
||||
intents []notificationintent.Intent
|
||||
}
|
||||
|
||||
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
if n.publishErr != nil {
|
||||
return n.publishErr
|
||||
}
|
||||
n.intents = append(n.intents, intent)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeLobby struct {
|
||||
record ports.LobbyGameRecord
|
||||
err error
|
||||
}
|
||||
|
||||
func (l *fakeLobby) GetGame(_ context.Context, _ string) (ports.LobbyGameRecord, error) {
|
||||
if l.err != nil {
|
||||
return ports.LobbyGameRecord{}, l.err
|
||||
}
|
||||
return l.record, nil
|
||||
}
|
||||
|
||||
// --- harness ----------------------------------------------------------
|
||||
|
||||
type harness struct {
|
||||
records *fakeRuntimeRecords
|
||||
operationLogs *fakeOperationLogs
|
||||
docker *mocks.MockDockerClient
|
||||
leases *fakeLeases
|
||||
healthEvents *fakeHealthEvents
|
||||
notifications *fakeNotifications
|
||||
lobby *fakeLobby
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
now time.Time
|
||||
stateDir string
|
||||
|
||||
startService *startruntime.Service
|
||||
stopService *stopruntime.Service
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
h := &harness{
|
||||
records: newFakeRuntimeRecords(),
|
||||
operationLogs: &fakeOperationLogs{},
|
||||
docker: mocks.NewMockDockerClient(ctrl),
|
||||
leases: &fakeLeases{acquired: true},
|
||||
healthEvents: &fakeHealthEvents{},
|
||||
notifications: &fakeNotifications{},
|
||||
lobby: &fakeLobby{},
|
||||
telemetry: telemetryRuntime,
|
||||
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
stateDir: "/var/lib/galaxy/games/game-1",
|
||||
}
|
||||
|
||||
containerCfg := config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
}
|
||||
dockerCfg := config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
}
|
||||
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
|
||||
|
||||
startService, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Notifications: h.notifications,
|
||||
Lobby: h.lobby,
|
||||
Container: containerCfg,
|
||||
DockerCfg: dockerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "inner-start-token" },
|
||||
PrepareStateDir: func(_ string) (string, error) { return h.stateDir, nil },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
h.startService = startService
|
||||
|
||||
stopService, err := stopruntime.NewService(stopruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Container: containerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "inner-stop-token" },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
h.stopService = stopService
|
||||
|
||||
return h
|
||||
}
|
||||
|
||||
func (h *harness) build(t *testing.T, tokens ...string) *restartruntime.Service {
|
||||
t.Helper()
|
||||
tokenIdx := 0
|
||||
tokenGen := func() string {
|
||||
if tokenIdx >= len(tokens) {
|
||||
return "outer-fallback"
|
||||
}
|
||||
t := tokens[tokenIdx]
|
||||
tokenIdx++
|
||||
return t
|
||||
}
|
||||
service, err := restartruntime.NewService(restartruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
StopService: h.stopService,
|
||||
StartService: h.startService,
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: tokenGen,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
|
||||
const imageRef = "registry.example.com/galaxy/game:1.4.7"
|
||||
|
||||
func runningRecord(now time.Time) runtime.RuntimeRecord {
|
||||
startedAt := now.Add(-time.Hour)
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-1",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-old",
|
||||
CurrentImageRef: imageRef,
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: "/var/lib/galaxy/games/game-1",
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func basicInput() restartruntime.Input {
|
||||
return restartruntime.Input{
|
||||
GameID: "game-1",
|
||||
OpSource: operation.OpSourceGMRest,
|
||||
SourceRef: "rest-req-42",
|
||||
}
|
||||
}
|
||||
|
||||
func sampleRunResult(now time.Time) ports.RunResult {
|
||||
return ports.RunResult{
|
||||
ContainerID: "ctr-new",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StartedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
func expectInnerStart(h *harness) {
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), imageRef).Return(ports.ImageInspect{Ref: imageRef}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
}
|
||||
|
||||
// --- happy path -------------------------------------------------------
|
||||
|
||||
func TestHandleRestartFromRunning(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h)
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
|
||||
assert.Equal(t, imageRef, result.Record.CurrentImageRef)
|
||||
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
|
||||
|
||||
stops := h.operationLogs.byKind(operation.OpKindStop)
|
||||
starts := h.operationLogs.byKind(operation.OpKindStart)
|
||||
restarts := h.operationLogs.byKind(operation.OpKindRestart)
|
||||
require.Len(t, stops, 1, "inner stop appended its own entry")
|
||||
require.Len(t, starts, 1, "inner start appended its own entry")
|
||||
require.Len(t, restarts, 1, "outer restart appended one summary entry")
|
||||
|
||||
assert.Equal(t, "rest-req-42", stops[0].SourceRef, "correlation id propagated to inner stop")
|
||||
assert.Equal(t, "rest-req-42", starts[0].SourceRef, "correlation id propagated to inner start")
|
||||
assert.Equal(t, "rest-req-42", restarts[0].SourceRef, "correlation id stored on outer restart")
|
||||
assert.Equal(t, "ctr-new", restarts[0].ContainerID)
|
||||
assert.Equal(t, imageRef, restarts[0].ImageRef)
|
||||
|
||||
assert.Equal(t, []string{"outer-token"}, h.leases.acquires)
|
||||
assert.Equal(t, []string{"outer-token"}, h.leases.releases)
|
||||
}
|
||||
|
||||
func TestHandleRestartFromStopped(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
stoppedRecord := runningRecord(h.now)
|
||||
stoppedRecord.Status = runtime.StatusStopped
|
||||
stoppedAt := h.now.Add(-30 * time.Minute)
|
||||
stoppedRecord.StoppedAt = &stoppedAt
|
||||
h.records.stored["game-1"] = stoppedRecord
|
||||
|
||||
// No docker.Stop because inner stop short-circuits via replay no-op.
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h)
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, "ctr-new", result.Record.CurrentContainerID)
|
||||
}
|
||||
|
||||
// --- correlation id fallback -----------------------------------------
|
||||
|
||||
func TestHandleGeneratesCorrelationWhenSourceRefEmpty(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
expectInnerStart(h)
|
||||
|
||||
input := basicInput()
|
||||
input.SourceRef = ""
|
||||
|
||||
// First newToken call yields the lease token, second yields the
|
||||
// correlation id fallback.
|
||||
service := h.build(t, "outer-token", "correlation-fallback")
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
|
||||
stops := h.operationLogs.byKind(operation.OpKindStop)
|
||||
starts := h.operationLogs.byKind(operation.OpKindStart)
|
||||
restarts := h.operationLogs.byKind(operation.OpKindRestart)
|
||||
require.Len(t, stops, 1)
|
||||
require.Len(t, starts, 1)
|
||||
require.Len(t, restarts, 1)
|
||||
assert.Equal(t, "correlation-fallback", stops[0].SourceRef)
|
||||
assert.Equal(t, "correlation-fallback", starts[0].SourceRef)
|
||||
assert.Equal(t, "correlation-fallback", restarts[0].SourceRef)
|
||||
}
|
||||
|
||||
// --- failure paths ---------------------------------------------------
|
||||
|
||||
func TestHandleNotFoundForMissingRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStop))
|
||||
assert.Empty(t, h.operationLogs.byKind(operation.OpKindStart))
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
|
||||
}
|
||||
|
||||
func TestHandleConflictForRemovedRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
removed := runningRecord(h.now)
|
||||
removed.Status = runtime.StatusRemoved
|
||||
removed.CurrentContainerID = ""
|
||||
removedAt := h.now.Add(-time.Hour)
|
||||
removed.RemovedAt = &removedAt
|
||||
h.records.stored["game-1"] = removed
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquired = false
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
|
||||
}
|
||||
|
||||
func TestHandlePropagatesInnerStopFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(errors.New("daemon unreachable"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
assert.Contains(t, result.ErrorMessage, "inner stop failed")
|
||||
}
|
||||
|
||||
func TestHandleServiceUnavailableOnDockerRemoveFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(errors.New("disk i/o"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
assert.Contains(t, result.ErrorMessage, "docker remove")
|
||||
// Inner stop succeeded and wrote its log entry; the outer restart records the failure.
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindStop), 1)
|
||||
require.Len(t, h.operationLogs.byKind(operation.OpKindRestart), 1)
|
||||
}
|
||||
|
||||
func TestHandlePropagatesInnerStartFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-old", 30*time.Second).Return(nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-old").Return(nil)
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), imageRef, gomock.Any()).Return(errors.New("manifest unknown"))
|
||||
|
||||
service := h.build(t, "outer-token")
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
|
||||
assert.Contains(t, result.ErrorMessage, "inner start failed")
|
||||
}
|
||||
|
||||
// --- input validation ------------------------------------------------
|
||||
|
||||
func TestHandleRejectsInvalidInput(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t, "outer-token")
|
||||
|
||||
cases := []restartruntime.Input{
|
||||
{GameID: "", OpSource: operation.OpSourceGMRest},
|
||||
{GameID: "g", OpSource: operation.OpSource("bogus")},
|
||||
}
|
||||
for _, input := range cases {
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
|
||||
}
|
||||
}
|
||||
|
||||
// --- constructor -----------------------------------------------------
|
||||
|
||||
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
deps := restartruntime.Dependencies{
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
}
|
||||
_, err := restartruntime.NewService(deps)
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package startruntime
|
||||
|
||||
// Stable error codes returned in `Result.ErrorCode`. The values match the
|
||||
// vocabulary frozen by `rtmanager/README.md §Error Model`,
|
||||
// `rtmanager/api/internal-openapi.yaml`, and
|
||||
// `rtmanager/api/runtime-jobs-asyncapi.yaml`. Although the constants live
|
||||
// in the start-service package, they are the canonical set for every
|
||||
// lifecycle service in `internal/service/`. Stop, restart, patch,
|
||||
// cleanup, the REST handlers, and the stream consumers import these
|
||||
// names rather than redeclare them; renaming any of them is a contract
|
||||
// change.
|
||||
const (
|
||||
// ErrorCodeReplayNoOp reports that the request was an idempotent
|
||||
// replay against an already-running record with the same image_ref.
|
||||
ErrorCodeReplayNoOp = "replay_no_op"
|
||||
|
||||
// ErrorCodeStartConfigInvalid reports that the start request was
|
||||
// rejected before any Docker work because of a validation failure
|
||||
// (invalid image_ref shape, missing Docker network, unwritable state
|
||||
// directory).
|
||||
ErrorCodeStartConfigInvalid = "start_config_invalid"
|
||||
|
||||
// ErrorCodeImagePullFailed reports that the image pull stage failed.
|
||||
ErrorCodeImagePullFailed = "image_pull_failed"
|
||||
|
||||
// ErrorCodeContainerStartFailed reports that `docker create` or
|
||||
// `docker start` failed, or that the runtime record could not be
|
||||
// installed after a successful Run.
|
||||
ErrorCodeContainerStartFailed = "container_start_failed"
|
||||
|
||||
// ErrorCodeConflict reports an operation incompatible with the
|
||||
// current runtime state (lease busy, running record with a different
|
||||
// image_ref, cleanup attempted on a running runtime, restart or
|
||||
// patch attempted on a removed record).
|
||||
ErrorCodeConflict = "conflict"
|
||||
|
||||
// ErrorCodeServiceUnavailable reports that a steady-state dependency
|
||||
// (Docker daemon, PostgreSQL, Redis) was unreachable for this call.
|
||||
ErrorCodeServiceUnavailable = "service_unavailable"
|
||||
|
||||
// ErrorCodeInternal reports an unexpected error not classified by
|
||||
// the other codes.
|
||||
ErrorCodeInternal = "internal_error"
|
||||
|
||||
// ErrorCodeInvalidRequest reports that the request was rejected
|
||||
// because of structural input validation (empty required fields,
|
||||
// unknown enum values). Used by the stop / restart / patch /
|
||||
// cleanup services for malformed Input. The start service uses the
|
||||
// stricter `start_config_invalid` code instead because every start
|
||||
// validation failure also raises an admin notification intent.
|
||||
ErrorCodeInvalidRequest = "invalid_request"
|
||||
|
||||
// ErrorCodeNotFound reports that the runtime record requested by a
|
||||
// stop, restart, patch or cleanup operation does not exist. Those
|
||||
// services raise it; the start service never does (start installs
|
||||
// the record on first call).
|
||||
ErrorCodeNotFound = "not_found"
|
||||
|
||||
// ErrorCodeImageRefNotSemver reports that a patch operation was
|
||||
// rejected because either the current or the new image reference
|
||||
// could not be parsed as a semver tag.
|
||||
ErrorCodeImageRefNotSemver = "image_ref_not_semver"
|
||||
|
||||
// ErrorCodeSemverPatchOnly reports that a patch operation was
|
||||
// rejected because the major or minor component differs between the
|
||||
// current and new image references.
|
||||
ErrorCodeSemverPatchOnly = "semver_patch_only"
|
||||
)
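// A sketch of how a synchronous REST handler might translate these codes
// into HTTP statuses. Illustrative only — the actual mapping is fixed by
// `rtmanager/api/internal-openapi.yaml` and the handler package, not by
// this file:
//
//    switch result.ErrorCode {
//    case "", ErrorCodeReplayNoOp:
//        status = http.StatusOK
//    case ErrorCodeInvalidRequest, ErrorCodeStartConfigInvalid,
//        ErrorCodeImageRefNotSemver, ErrorCodeSemverPatchOnly:
//        status = http.StatusBadRequest
//    case ErrorCodeNotFound:
//        status = http.StatusNotFound
//    case ErrorCodeConflict:
//        status = http.StatusConflict
//    case ErrorCodeServiceUnavailable:
//        status = http.StatusServiceUnavailable
//    default: // image_pull_failed, container_start_failed, internal_error
//        status = http.StatusInternalServerError
//    }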
|
||||
@@ -0,0 +1,940 @@
|
||||
// Package startruntime implements the `start` lifecycle operation owned
|
||||
// by Runtime Manager. The service is the single orchestrator behind
|
||||
// both the asynchronous `runtime:start_jobs` consumer and the
|
||||
// synchronous `POST /api/v1/internal/runtimes/{game_id}/start` REST
|
||||
// handler; both callers obtain a deterministic Result with a stable
|
||||
// `Outcome` / `ErrorCode` pair.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Start`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`.
|
||||
package startruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/distribution/reference"
|
||||
)
|
||||
|
||||
// Container labels applied to every engine container created by the
|
||||
// start service. Frozen by `rtmanager/README.md §Container Model`.
|
||||
const (
|
||||
LabelOwner = "com.galaxy.owner"
|
||||
LabelOwnerValue = "rtmanager"
|
||||
LabelKind = "com.galaxy.kind"
|
||||
LabelKindValue = "game-engine"
|
||||
LabelGameID = "com.galaxy.game_id"
|
||||
LabelEngineImageRef = "com.galaxy.engine_image_ref"
|
||||
LabelStartedAtMs = "com.galaxy.started_at_ms"
|
||||
|
||||
// Image labels read at start time to derive resource limits.
|
||||
imageLabelCPUQuota = "com.galaxy.cpu_quota"
|
||||
imageLabelMemory = "com.galaxy.memory"
|
||||
imageLabelPIDsLimit = "com.galaxy.pids_limit"
|
||||
|
||||
// HostnamePrefix is the constant prefix used to build the per-game
|
||||
// container hostname (`galaxy-game-{game_id}`). The full hostname
|
||||
// also forms the container name; restart and patch keep the same
|
||||
// value so the engine endpoint stays stable across container
|
||||
// recreates.
|
||||
HostnamePrefix = "galaxy-game-"
|
||||
|
||||
// EngineStateBackCompatEnvName is the secondary env var name v1
|
||||
// engines accept for the bind-mounted state directory. Always set
|
||||
// alongside the configured primary name to honour the v1 backward
|
||||
// compatibility commitment in `rtmanager/README.md §Container Model`.
|
||||
EngineStateBackCompatEnvName = "STORAGE_PATH"
|
||||
|
||||
// leaseReleaseTimeout bounds the deferred lease-release call. A
|
||||
// fresh background context is used so the release runs even when
|
||||
// the request context was already canceled.
|
||||
leaseReleaseTimeout = 5 * time.Second
|
||||
)
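// The image labels above are read at start time to derive per-container
// resource limits in place of the configured defaults. An engine image that
// sets them might carry values like the following; the values are
// illustrative only and simply mirror the shape of the matching
// config.ContainerConfig defaults, not any real engine image:
//
//    labels := map[string]string{
//        imageLabelCPUQuota:  "1.5",  // same shape as DefaultCPUQuota
//        imageLabelMemory:    "768m", // same shape as DefaultMemory
//        imageLabelPIDsLimit: "1024", // same shape as DefaultPIDsLimit
//    }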
|
||||
|
||||
// Input stores the per-call arguments for one start operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game to start.
|
||||
GameID string
|
||||
|
||||
// ImageRef stores the producer-resolved Docker reference of the
|
||||
// engine image. Validated against `distribution/reference` before
|
||||
// any Docker work.
|
||||
ImageRef string
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
// Required: every operation_log entry carries an op_source.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference
|
||||
// (Redis Stream entry id, REST request id, admin user id). Empty
|
||||
// when the caller does not provide one.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(input.ImageRef) == "" {
|
||||
return fmt.Errorf("image ref must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the operation.
|
||||
// Populated on success and on idempotent replay (`replay_no_op`);
|
||||
// zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure, or
|
||||
// `replay_no_op` on idempotent replay. Empty for fresh successes.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty for successes.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
// RuntimeRecords reads and installs the durable runtime record.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// OperationLogs records the success / failure audit entry.
|
||||
OperationLogs ports.OperationLogStore
|
||||
|
||||
// Docker drives the Docker daemon (network check, pull, inspect,
|
||||
// run, remove).
|
||||
Docker ports.DockerClient
|
||||
|
||||
// Leases serialises operations against the same game id.
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
// HealthEvents publishes `runtime:health_events` and upserts the
|
||||
// matching `health_snapshots` row.
|
||||
HealthEvents ports.HealthEventPublisher
|
||||
|
||||
// Notifications publishes admin-only failure intents.
|
||||
Notifications ports.NotificationIntentPublisher
|
||||
|
||||
// Lobby provides best-effort diagnostic context for the started
|
||||
// game. May be nil; the start operation does not depend on it.
|
||||
Lobby ports.LobbyInternalClient
|
||||
|
||||
// Container groups the per-container defaults and state-directory
|
||||
// settings consumed at start time.
|
||||
Container config.ContainerConfig
|
||||
|
||||
// DockerCfg groups the Docker daemon settings (network, log driver,
|
||||
// pull policy) consumed at start time.
|
||||
DockerCfg config.DockerConfig
|
||||
|
||||
// Coordination supplies the per-game lease TTL.
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
// Telemetry records start outcomes, lease latency, and health
|
||||
// event counters. Required.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Logger records structured service-level events. Defaults to
|
||||
// `slog.Default()` when nil.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Clock supplies the wall-clock used for operation timestamps.
|
||||
// Defaults to `time.Now` when nil.
|
||||
Clock func() time.Time
|
||||
|
||||
// NewToken supplies a unique opaque lease token. Defaults to a
|
||||
// 32-byte random base64url string when nil. Tests may override.
|
||||
NewToken func() string
|
||||
|
||||
// PrepareStateDir creates the per-game state directory and
|
||||
// returns its absolute host path. Defaults to a real-filesystem
|
||||
// implementation that honours Container.GameStateRoot,
|
||||
// Container.GameStateDirMode, and Container.GameStateOwner{UID,GID}.
|
||||
// Tests override to point at a temporary directory.
|
||||
PrepareStateDir func(gameID string) (string, error)
|
||||
}
|
||||
|
||||
// Service executes the start lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
healthEvents ports.HealthEventPublisher
|
||||
notifications ports.NotificationIntentPublisher
|
||||
lobby ports.LobbyInternalClient
|
||||
|
||||
containerCfg config.ContainerConfig
|
||||
dockerCfg config.DockerConfig
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
prepareStateDir func(gameID string) (string, error)
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new start runtime service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new start runtime service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new start runtime service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new start runtime service: nil lease store")
|
||||
case deps.HealthEvents == nil:
|
||||
return nil, errors.New("new start runtime service: nil health events publisher")
|
||||
case deps.Notifications == nil:
|
||||
return nil, errors.New("new start runtime service: nil notification publisher")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new start runtime service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Container.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new start runtime service: container config: %w", err)
|
||||
}
|
||||
if err := deps.DockerCfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new start runtime service: docker config: %w", err)
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new start runtime service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.startruntime")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
prepareStateDir := deps.PrepareStateDir
|
||||
if prepareStateDir == nil {
|
||||
prepareStateDir = newDefaultStateDirPreparer(deps.Container)
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
healthEvents: deps.HealthEvents,
|
||||
notifications: deps.Notifications,
|
||||
lobby: deps.Lobby,
|
||||
containerCfg: deps.Container,
|
||||
dockerCfg: deps.DockerCfg,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
prepareStateDir: prepareStateDir,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one start operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — fresh success, idempotent
|
||||
// replay, or any of the stable failure modes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("start runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("start runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
|
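// Illustrative sketch only (svc and the literal values are hypothetical,
// borrowed from the unit tests in this commit): callers are expected to
// branch on Result rather than on the Go error, which only reports
// programming mistakes.
//
//	result, err := svc.Handle(ctx, startruntime.Input{
//		GameID:   "game-1",
//		ImageRef: "registry.example.com/galaxy/game:1.4.7",
//		OpSource: operation.OpSourceLobbyStream,
//	})
//	if err != nil {
//		return err // nil receiver / nil context only
//	}
//	if result.Outcome == operation.OutcomeFailure {
//		// result.ErrorCode carries a stable code such as ErrorCodeConflict.
//	}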
||||
|
||||
// Run executes the start lifecycle assuming the per-game lease is
|
||||
// already held by the caller. The method is reserved for orchestrator
|
||||
// services in `internal/service/` that compose start with another
|
||||
// operation under a single outer lease (restart and patch). External
|
||||
// callers must use Handle, which acquires and releases the lease
|
||||
// itself.
|
||||
//
|
||||
// Run still validates input and reports business outcomes through
|
||||
// Result; the Go-level error return is reserved for non-business
|
||||
// failures (nil context, nil receiver). Operation log entries,
|
||||
// telemetry counters, health events and admin-only notification
|
||||
// intents fire identically to Handle.
|
||||
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("start runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("start runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
|
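// Illustrative sketch only (stopSvc, startSvc and the inputs are
// hypothetical): an orchestrator such as restart holds the outer per-game
// lease itself and composes the inner operations through Run.
//
//	// outer lease for the game already acquired by the orchestrator
//	stopResult, err := stopSvc.Run(ctx, stopInput)
//	if err != nil || stopResult.Outcome == operation.OutcomeFailure {
//		return stopResult, err
//	}
//	return startSvc.Run(ctx, startInput)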
||||
|
||||
// runUnderLease executes the post-validation, lease-protected start
|
||||
// steps shared by Handle and Run. Callers must validate input and
|
||||
// acquire the lease (when applicable) before invocation.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, hasExisting, err := service.loadExisting(ctx, input.GameID)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if hasExisting && existing.Status == runtime.StatusRunning {
|
||||
if existing.CurrentImageRef == input.ImageRef {
|
||||
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
|
||||
}
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeConflict,
|
||||
errorMessage: fmt.Sprintf("runtime already running with image_ref %q", existing.CurrentImageRef),
|
||||
}), nil
|
||||
}
|
||||
|
||||
service.fetchLobbyDiagnostic(ctx, input.GameID)
|
||||
|
||||
if err := validateImageRef(input.ImageRef); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: fmt.Sprintf("invalid image_ref: %s", err.Error()),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
if err := service.docker.EnsureNetwork(ctx, service.dockerCfg.Network); err != nil {
|
||||
if errors.Is(err, ports.ErrNetworkMissing) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: fmt.Sprintf("docker network %q is missing", service.dockerCfg.Network),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("ensure docker network: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
|
||||
if err := service.docker.PullImage(ctx, input.ImageRef, ports.PullPolicy(service.dockerCfg.PullPolicy)); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeImagePullFailed,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
|
||||
}), nil
|
||||
}
|
||||
|
||||
imageInspect, err := service.docker.InspectImage(ctx, input.ImageRef)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeImagePullFailed,
|
||||
errorMessage: fmt.Sprintf("inspect image: %s", err.Error()),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeImagePullFailed,
|
||||
}), nil
|
||||
}
|
||||
cpuQuota, memory, pidsLimit := service.resolveLimits(imageInspect.Labels)
|
||||
|
||||
statePath, err := service.prepareStateDir(input.GameID)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeStartConfigInvalid,
|
||||
errorMessage: fmt.Sprintf("prepare state directory: %s", err.Error()),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeStartConfigInvalid,
|
||||
}), nil
|
||||
}
|
||||
|
||||
hostname := containerHostname(input.GameID)
|
||||
spec := ports.RunSpec{
|
||||
Name: hostname,
|
||||
Image: input.ImageRef,
|
||||
Hostname: hostname,
|
||||
Network: service.dockerCfg.Network,
|
||||
Env: service.buildEnv(),
|
||||
Labels: service.buildLabels(input.GameID, input.ImageRef, opStartedAt),
|
||||
BindMounts: []ports.BindMount{{
|
||||
HostPath: statePath,
|
||||
MountPath: service.containerCfg.EngineStateMountPath,
|
||||
ReadOnly: false,
|
||||
}},
|
||||
LogDriver: service.dockerCfg.LogDriver,
|
||||
LogOpts: parseLogOpts(service.dockerCfg.LogOpts),
|
||||
CPUQuota: cpuQuota,
|
||||
Memory: memory,
|
||||
PIDsLimit: pidsLimit,
|
||||
}
|
||||
runResult, err := service.docker.Run(ctx, spec)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeContainerStartFailed,
|
||||
errorMessage: err.Error(),
|
||||
notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
|
||||
}), nil
|
||||
}
|
||||
|
||||
createdAt := opStartedAt
|
||||
if hasExisting && !existing.CreatedAt.IsZero() {
|
||||
createdAt = existing.CreatedAt
|
||||
}
|
||||
startedAt := runResult.StartedAt
|
||||
record := runtime.RuntimeRecord{
|
||||
GameID: input.GameID,
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: runResult.ContainerID,
|
||||
CurrentImageRef: input.ImageRef,
|
||||
EngineEndpoint: runResult.EngineEndpoint,
|
||||
StatePath: statePath,
|
||||
DockerNetwork: service.dockerCfg.Network,
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: createdAt,
|
||||
}
|
||||
if err := service.runtimeRecords.Upsert(ctx, record); err != nil {
|
||||
service.bestEffortRemove(input.GameID, runResult.ContainerID)
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: ErrorCodeContainerStartFailed,
|
||||
errorMessage: fmt.Sprintf("upsert runtime record: %s", err.Error()),
|
||||
containerID: runResult.ContainerID,
|
||||
notificationType: notificationintent.NotificationTypeRuntimeContainerStartFailed,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindStart,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: input.ImageRef,
|
||||
ContainerID: runResult.ContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
|
||||
GameID: input.GameID,
|
||||
ContainerID: runResult.ContainerID,
|
||||
EventType: health.EventTypeContainerStarted,
|
||||
OccurredAt: startedAt,
|
||||
Details: containerStartedDetails(input.ImageRef),
|
||||
})
|
||||
|
||||
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), "", string(input.OpSource))
|
||||
service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerStarted))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", runResult.ContainerID,
|
||||
"image_ref", input.ImageRef,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime started", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure so its call sites
|
||||
// stay readable.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
containerID string
|
||||
notificationType notificationintent.NotificationType
|
||||
}
|
||||
|
||||
// recordFailure records the failure operation_log entry, publishes the
|
||||
// matching admin-only notification intent (when applicable), and emits
|
||||
// telemetry. All side effects are best-effort; a downstream failure is
|
||||
// logged but does not change the returned Result.
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindStart,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: fc.input.SourceRef,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
|
||||
if fc.notificationType != "" {
|
||||
service.bestEffortNotify(ctx, fc)
|
||||
}
|
||||
|
||||
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode, string(fc.input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"image_ref", fc.input.ImageRef,
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime start failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// recordReplayNoOp records the idempotent replay outcome and returns
|
||||
// the existing record. The operation_log entry is appended best-effort
|
||||
// so audit history captures the replay; telemetry counts the call as a
|
||||
// successful start with `error_code=replay_no_op`.
|
||||
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindStart,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: input.ImageRef,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: ErrorCodeReplayNoOp,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordStartOutcome(ctx, string(operation.OutcomeSuccess), ErrorCodeReplayNoOp, string(input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", existing.CurrentContainerID,
|
||||
"image_ref", input.ImageRef,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime start replay no-op", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: existing,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: ErrorCodeReplayNoOp,
|
||||
}
|
||||
}
|
||||
|
||||
// loadExisting reads the runtime record for gameID. The boolean return
|
||||
// reports whether a record exists; ErrNotFound is translated to
|
||||
// (zero, false, nil) so the caller does not branch on the sentinel
|
||||
// elsewhere.
|
||||
func (service *Service) loadExisting(ctx context.Context, gameID string) (runtime.RuntimeRecord, bool, error) {
|
||||
record, err := service.runtimeRecords.Get(ctx, gameID)
|
||||
switch {
|
||||
case errors.Is(err, runtime.ErrNotFound):
|
||||
return runtime.RuntimeRecord{}, false, nil
|
||||
case err != nil:
|
||||
return runtime.RuntimeRecord{}, false, err
|
||||
default:
|
||||
return record, true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// fetchLobbyDiagnostic best-effort enriches the request log with the
|
||||
// Lobby-side game record. A nil Lobby client or any transport failure
|
||||
// is logged and the start operation continues.
|
||||
func (service *Service) fetchLobbyDiagnostic(ctx context.Context, gameID string) {
|
||||
if service.lobby == nil {
|
||||
return
|
||||
}
|
||||
record, err := service.lobby.GetGame(ctx, gameID)
|
||||
if err != nil {
|
||||
service.logger.DebugContext(ctx, "lobby diagnostic fetch failed",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
service.logger.DebugContext(ctx, "lobby diagnostic fetched",
|
||||
"game_id", gameID,
|
||||
"lobby_status", record.Status,
|
||||
"lobby_target_engine_version", record.TargetEngineVersion,
|
||||
)
|
||||
}
|
||||
|
||||
// resolveLimits derives the per-container resource limits from the
|
||||
// resolved image's labels with config-driven fallbacks. Unparseable
|
||||
// label values silently fall back to the configured default; operators
|
||||
// see the chosen value through `rtmanager.docker_op_latency` and start
|
||||
// logs.
|
||||
func (service *Service) resolveLimits(labels map[string]string) (cpuQuota float64, memory string, pidsLimit int) {
|
||||
cpuQuota = service.containerCfg.DefaultCPUQuota
|
||||
memory = service.containerCfg.DefaultMemory
|
||||
pidsLimit = service.containerCfg.DefaultPIDsLimit
|
||||
|
||||
if raw, ok := labels[imageLabelCPUQuota]; ok {
|
||||
if value, err := strconv.ParseFloat(raw, 64); err == nil && value > 0 {
|
||||
cpuQuota = value
|
||||
}
|
||||
}
|
||||
if raw, ok := labels[imageLabelMemory]; ok && strings.TrimSpace(raw) != "" {
|
||||
memory = raw
|
||||
}
|
||||
if raw, ok := labels[imageLabelPIDsLimit]; ok {
|
||||
if value, err := strconv.Atoi(raw); err == nil && value > 0 {
|
||||
pidsLimit = value
|
||||
}
|
||||
}
|
||||
return cpuQuota, memory, pidsLimit
|
||||
}
|
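// Illustrative example (label keys and values as exercised by the unit
// tests in this commit): an image built with
//
//	LABEL com.galaxy.cpu_quota="0.5" com.galaxy.memory="256m" com.galaxy.pids_limit="256"
//
// overrides the configured defaults, while missing or unparseable labels
// leave the corresponding default untouched.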
||||
|
||||
// buildEnv assembles the env-var map handed to the engine. Both the
|
||||
// configured primary name and `STORAGE_PATH` are set per
|
||||
// `rtmanager/README.md §Container Model` for v1 backward compatibility.
|
||||
func (service *Service) buildEnv() map[string]string {
|
||||
mount := service.containerCfg.EngineStateMountPath
|
||||
env := map[string]string{
|
||||
service.containerCfg.EngineStateEnvName: mount,
|
||||
EngineStateBackCompatEnvName: mount,
|
||||
}
|
||||
return env
|
||||
}
|
||||
|
||||
// buildLabels assembles the container labels per
|
||||
// `rtmanager/README.md §Container Model`.
|
||||
func (service *Service) buildLabels(gameID, imageRef string, startedAt time.Time) map[string]string {
|
||||
return map[string]string{
|
||||
LabelOwner: LabelOwnerValue,
|
||||
LabelKind: LabelKindValue,
|
||||
LabelGameID: gameID,
|
||||
LabelEngineImageRef: imageRef,
|
||||
LabelStartedAtMs: strconv.FormatInt(startedAt.UTC().UnixMilli(), 10),
|
||||
}
|
||||
}
|
||||
|
||||
// releaseLease releases the per-game lease in a fresh background
|
||||
// context so a canceled request context does not leave the lease
|
||||
// pinned for its TTL.
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortAppend writes one operation_log entry. A failure is logged
|
||||
// and discarded; the durable runtime record (or its absence) remains
|
||||
// the source of truth.
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortPublishHealth emits one health event + snapshot upsert.
|
||||
// Failures degrade silently per `rtmanager/README.md §Notification
|
||||
// Contracts`; the runtime record remains the source of truth.
|
||||
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
|
||||
if err := service.healthEvents.Publish(ctx, envelope); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish health event",
|
||||
"game_id", envelope.GameID,
|
||||
"container_id", envelope.ContainerID,
|
||||
"event_type", string(envelope.EventType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortNotify publishes one admin-only failure intent. Failures
|
||||
// degrade silently because the source business state already reflects
|
||||
// the outcome.
|
||||
func (service *Service) bestEffortNotify(ctx context.Context, fc failureCtx) {
|
||||
intent, err := buildFailureIntent(fc, service.clock().UTC())
|
||||
if err != nil {
|
||||
service.logger.ErrorContext(ctx, "build notification intent",
|
||||
"game_id", fc.input.GameID,
|
||||
"notification_type", string(fc.notificationType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
if err := service.notifications.Publish(ctx, intent); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish notification intent",
|
||||
"game_id", fc.input.GameID,
|
||||
"notification_type", string(fc.notificationType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
return
|
||||
}
|
||||
service.telemetry.RecordNotificationIntent(ctx, string(fc.notificationType))
|
||||
}
|
||||
|
||||
// bestEffortRemove forces removal of a container left running by a
|
||||
// failed start that progressed past Run but failed to register the
|
||||
// runtime record. Failures degrade silently; the reconciler's periodic
|
||||
// pass adopts any orphaned container it observes.
|
||||
func (service *Service) bestEffortRemove(gameID, containerID string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.docker.Remove(cleanupCtx, containerID); err != nil {
|
||||
service.logger.ErrorContext(cleanupCtx, "rollback container after upsert failure",
|
||||
"game_id", gameID,
|
||||
"container_id", containerID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// containerHostname builds the per-game hostname that doubles as the
|
||||
// Docker container name.
|
||||
func containerHostname(gameID string) string {
|
||||
return HostnamePrefix + gameID
|
||||
}
|
||||
|
||||
// containerStartedDetails builds the `details` payload required by the
|
||||
// `container_started` AsyncAPI variant.
|
||||
func containerStartedDetails(imageRef string) json.RawMessage {
|
||||
payload := map[string]string{"image_ref": imageRef}
|
||||
encoded, _ := json.Marshal(payload)
|
||||
return encoded
|
||||
}
|
||||
|
||||
// validateImageRef rejects malformed Docker references before any
|
||||
// daemon round-trip. The validation surfaces as `start_config_invalid`;
|
||||
// daemon-side rejections after a valid parse are reported as
|
||||
// `image_pull_failed`.
|
||||
func validateImageRef(ref string) error {
|
||||
if strings.TrimSpace(ref) == "" {
|
||||
return fmt.Errorf("image ref must not be empty")
|
||||
}
|
||||
if _, err := reference.ParseNormalizedNamed(ref); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
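// Illustrative examples (values borrowed from the unit tests):
//
//	validateImageRef("registry.example.com/galaxy/game:1.4.7") // nil: well-formed reference
//	validateImageRef("::not a docker reference::")             // parse error, reported as start_config_invalid
//	validateImageRef("")                                       // "image ref must not be empty"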
||||
|
||||
// parseLogOpts turns the `key=value,key2=value2` shape of the
|
||||
// `RTMANAGER_DOCKER_LOG_OPTS` config into a map suitable for the
|
||||
// Docker SDK. Empty input returns nil so the SDK uses driver defaults.
|
||||
func parseLogOpts(raw string) map[string]string {
|
||||
if strings.TrimSpace(raw) == "" {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]string)
|
||||
for part := range strings.SplitSeq(raw, ",") {
|
||||
entry := strings.TrimSpace(part)
|
||||
if entry == "" {
|
||||
continue
|
||||
}
|
||||
index := strings.IndexByte(entry, '=')
|
||||
if index <= 0 {
|
||||
continue
|
||||
}
|
||||
out[entry[:index]] = entry[index+1:]
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
return out
|
||||
}
|
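// Illustrative example (the option names are typical json-file driver
// options, not mandated by this package):
//
//	parseLogOpts("max-size=10m,max-file=3")
//	// yields map[string]string{"max-size": "10m", "max-file": "3"}
//	parseLogOpts("")   // nil
//	parseLogOpts(",,") // nil: no key=value entries survive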
||||
|
||||
// buildFailureIntent constructs the admin-only notification intent for
|
||||
// fc. The idempotency key is scoped per (notification_type, game_id,
|
||||
// attempted_at_ms) so the same failure observed twice is
|
||||
// recognised as a duplicate by Notification Service.
|
||||
func buildFailureIntent(fc failureCtx, attemptedAt time.Time) (notificationintent.Intent, error) {
|
||||
attemptedAtMs := attemptedAt.UnixMilli()
|
||||
idempotencyKey := fmt.Sprintf("%s.%s.%d", fc.notificationType, fc.input.GameID, attemptedAtMs)
|
||||
metadata := notificationintent.Metadata{
|
||||
IdempotencyKey: idempotencyKey,
|
||||
OccurredAt: attemptedAt,
|
||||
}
|
||||
|
||||
switch fc.notificationType {
|
||||
case notificationintent.NotificationTypeRuntimeImagePullFailed:
|
||||
return notificationintent.NewRuntimeImagePullFailedIntent(metadata, notificationintent.RuntimeImagePullFailedPayload{
|
||||
GameID: fc.input.GameID,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
AttemptedAtMs: attemptedAtMs,
|
||||
})
|
||||
case notificationintent.NotificationTypeRuntimeContainerStartFailed:
|
||||
return notificationintent.NewRuntimeContainerStartFailedIntent(metadata, notificationintent.RuntimeContainerStartFailedPayload{
|
||||
GameID: fc.input.GameID,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
AttemptedAtMs: attemptedAtMs,
|
||||
})
|
||||
case notificationintent.NotificationTypeRuntimeStartConfigInvalid:
|
||||
return notificationintent.NewRuntimeStartConfigInvalidIntent(metadata, notificationintent.RuntimeStartConfigInvalidPayload{
|
||||
GameID: fc.input.GameID,
|
||||
ImageRef: fc.input.ImageRef,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
AttemptedAtMs: attemptedAtMs,
|
||||
})
|
||||
default:
|
||||
return notificationintent.Intent{}, fmt.Errorf("unsupported notification type %q", fc.notificationType)
|
||||
}
|
||||
}
|
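// Illustrative example (the rendered notification_type string is left as a
// placeholder; only the "%s.%s.%d" key shape is fixed by the code above): a
// failure for game "game-1" attempted at Unix millisecond 1767225600000
// carries an idempotency key of the form
//
//	<notification_type>.game-1.1767225600000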
||||
|
||||
// defaultTokenGenerator returns a function that produces 32-byte
|
||||
// base64url-encoded tokens. The randomness source is `crypto/rand`;
|
||||
// failures fall back to a fixed, non-random token so
|
||||
// the caller observes a TryAcquire collision rather than a panic on a
|
||||
// degraded entropy source.
|
||||
func defaultTokenGenerator() func() string {
|
||||
return func() string {
|
||||
var buf [32]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
return "rtmanager-fallback-token"
|
||||
}
|
||||
return base64.RawURLEncoding.EncodeToString(buf[:])
|
||||
}
|
||||
}
|
||||
|
||||
// newDefaultStateDirPreparer returns a function that creates the
|
||||
// per-game state directory under cfg.GameStateRoot with the configured
|
||||
// permissions and ownership. The function is overridable through
|
||||
// Dependencies.PrepareStateDir; tests inject a temporary-dir fake.
|
||||
func newDefaultStateDirPreparer(cfg config.ContainerConfig) func(gameID string) (string, error) {
|
||||
mode := os.FileMode(cfg.GameStateDirMode)
|
||||
uid := cfg.GameStateOwnerUID
|
||||
gid := cfg.GameStateOwnerGID
|
||||
root := cfg.GameStateRoot
|
||||
return func(gameID string) (string, error) {
|
||||
path := filepath.Join(root, gameID)
|
||||
if err := os.MkdirAll(path, mode); err != nil {
|
||||
return "", fmt.Errorf("create state dir %q: %w", path, err)
|
||||
}
|
||||
if err := os.Chmod(path, mode); err != nil {
|
||||
return "", fmt.Errorf("chmod state dir %q: %w", path, err)
|
||||
}
|
||||
if err := os.Chown(path, uid, gid); err != nil {
|
||||
return "", fmt.Errorf("chown state dir %q: %w", path, err)
|
||||
}
|
||||
return path, nil
|
||||
}
|
||||
}
|
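// Illustrative example (paths taken from the test harness configuration):
// with GameStateRoot "/var/lib/galaxy/games", game id "game-1" resolves to
//
//	/var/lib/galaxy/games/game-1
//
// created (or reused) with GameStateDirMode and chowned to
// GameStateOwnerUID / GameStateOwnerGID.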
||||
@@ -0,0 +1,693 @@
|
||||
package startruntime_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/internal/adapters/docker/mocks"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// --- test doubles -----------------------------------------------------
|
||||
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
getErr error
|
||||
upsertErr error
|
||||
upserts []runtime.RuntimeRecord
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, record runtime.RuntimeRecord) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.upsertErr != nil {
|
||||
return s.upsertErr
|
||||
}
|
||||
s.upserts = append(s.upserts, record)
|
||||
s.stored[record.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, _ ports.UpdateStatusInput) error {
|
||||
return errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
type fakeOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
appendErr error
|
||||
appends []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.appendErr != nil {
|
||||
return 0, s.appendErr
|
||||
}
|
||||
s.appends = append(s.appends, entry)
|
||||
return int64(len(s.appends)), nil
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in start tests")
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if len(s.appends) == 0 {
|
||||
return operation.OperationEntry{}, false
|
||||
}
|
||||
return s.appends[len(s.appends)-1], true
|
||||
}
|
||||
|
||||
type fakeLeases struct {
|
||||
acquired bool
|
||||
acquireErr error
|
||||
releaseErr error
|
||||
|
||||
mu sync.Mutex
|
||||
acquires []string
|
||||
releases []string
|
||||
}
|
||||
|
||||
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.acquires = append(l.acquires, token)
|
||||
if l.acquireErr != nil {
|
||||
return false, l.acquireErr
|
||||
}
|
||||
return l.acquired, nil
|
||||
}
|
||||
|
||||
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.releases = append(l.releases, token)
|
||||
return l.releaseErr
|
||||
}
|
||||
|
||||
type fakeHealthEvents struct {
|
||||
mu sync.Mutex
|
||||
publishErr error
|
||||
envelopes []ports.HealthEventEnvelope
|
||||
}
|
||||
|
||||
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
if h.publishErr != nil {
|
||||
return h.publishErr
|
||||
}
|
||||
h.envelopes = append(h.envelopes, envelope)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeNotifications struct {
|
||||
mu sync.Mutex
|
||||
publishErr error
|
||||
intents []notificationintent.Intent
|
||||
}
|
||||
|
||||
func (n *fakeNotifications) Publish(_ context.Context, intent notificationintent.Intent) error {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
if n.publishErr != nil {
|
||||
return n.publishErr
|
||||
}
|
||||
n.intents = append(n.intents, intent)
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeLobby struct {
|
||||
record ports.LobbyGameRecord
|
||||
err error
|
||||
|
||||
mu sync.Mutex
|
||||
calls []string
|
||||
}
|
||||
|
||||
func (l *fakeLobby) GetGame(_ context.Context, gameID string) (ports.LobbyGameRecord, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.calls = append(l.calls, gameID)
|
||||
if l.err != nil {
|
||||
return ports.LobbyGameRecord{}, l.err
|
||||
}
|
||||
return l.record, nil
|
||||
}
|
||||
|
||||
// --- harness ----------------------------------------------------------
|
||||
|
||||
type harness struct {
|
||||
records *fakeRuntimeRecords
|
||||
operationLogs *fakeOperationLogs
|
||||
docker *mocks.MockDockerClient
|
||||
leases *fakeLeases
|
||||
healthEvents *fakeHealthEvents
|
||||
notifications *fakeNotifications
|
||||
lobby *fakeLobby
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
now time.Time
|
||||
stateDir string
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
return &harness{
|
||||
records: newFakeRuntimeRecords(),
|
||||
operationLogs: &fakeOperationLogs{},
|
||||
docker: mocks.NewMockDockerClient(ctrl),
|
||||
leases: &fakeLeases{acquired: true},
|
||||
healthEvents: &fakeHealthEvents{},
|
||||
notifications: &fakeNotifications{},
|
||||
lobby: &fakeLobby{},
|
||||
telemetry: telemetryRuntime,
|
||||
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
stateDir: "/var/lib/galaxy/games/game-1",
|
||||
}
|
||||
}
|
||||
|
||||
func (h *harness) build(t *testing.T) *startruntime.Service {
|
||||
t.Helper()
|
||||
|
||||
containerCfg := config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
}
|
||||
dockerCfg := config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
}
|
||||
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
|
||||
|
||||
service, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Notifications: h.notifications,
|
||||
Lobby: h.lobby,
|
||||
Container: containerCfg,
|
||||
DockerCfg: dockerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "token-A" },
|
||||
PrepareStateDir: func(_ string) (string, error) {
|
||||
return h.stateDir, nil
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
|
||||
func basicInput() startruntime.Input {
|
||||
return startruntime.Input{
|
||||
GameID: "game-1",
|
||||
ImageRef: "registry.example.com/galaxy/game:1.4.7",
|
||||
OpSource: operation.OpSourceLobbyStream,
|
||||
SourceRef: "1700000000000-0",
|
||||
}
|
||||
}
|
||||
|
||||
func sampleRunResult(now time.Time) ports.RunResult {
|
||||
return ports.RunResult{
|
||||
ContainerID: "ctr-123",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StartedAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
// --- happy path -------------------------------------------------------
|
||||
|
||||
func TestHandleHappyPath(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, ports.PullPolicy(config.ImagePullPolicyIfMissing)).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{
|
||||
Ref: input.ImageRef,
|
||||
Labels: map[string]string{
|
||||
"com.galaxy.cpu_quota": "0.5",
|
||||
"com.galaxy.memory": "256m",
|
||||
"com.galaxy.pids_limit": "256",
|
||||
},
|
||||
}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).DoAndReturn(func(_ context.Context, spec ports.RunSpec) (ports.RunResult, error) {
|
||||
assert.Equal(t, "galaxy-game-game-1", spec.Name)
|
||||
assert.Equal(t, "galaxy-game-game-1", spec.Hostname)
|
||||
assert.Equal(t, input.ImageRef, spec.Image)
|
||||
assert.Equal(t, "galaxy-net", spec.Network)
|
||||
assert.Equal(t, "json-file", spec.LogDriver)
|
||||
assert.InDelta(t, 0.5, spec.CPUQuota, 0)
|
||||
assert.Equal(t, "256m", spec.Memory)
|
||||
assert.Equal(t, 256, spec.PIDsLimit)
|
||||
assert.Equal(t, h.stateDir, spec.BindMounts[0].HostPath)
|
||||
assert.Equal(t, "/var/lib/galaxy-game", spec.BindMounts[0].MountPath)
|
||||
assert.Equal(t, "/var/lib/galaxy-game", spec.Env["GAME_STATE_PATH"])
|
||||
assert.Equal(t, "/var/lib/galaxy-game", spec.Env["STORAGE_PATH"])
|
||||
assert.Equal(t, "rtmanager", spec.Labels[startruntime.LabelOwner])
|
||||
assert.Equal(t, "game-engine", spec.Labels[startruntime.LabelKind])
|
||||
assert.Equal(t, input.GameID, spec.Labels[startruntime.LabelGameID])
|
||||
assert.Equal(t, input.ImageRef, spec.Labels[startruntime.LabelEngineImageRef])
|
||||
return sampleRunResult(h.now), nil
|
||||
})
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
|
||||
assert.Equal(t, "ctr-123", result.Record.CurrentContainerID)
|
||||
assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
|
||||
assert.Equal(t, "http://galaxy-game-game-1:8080", result.Record.EngineEndpoint)
|
||||
assert.Equal(t, h.stateDir, result.Record.StatePath)
|
||||
assert.Equal(t, "galaxy-net", result.Record.DockerNetwork)
|
||||
require.NotNil(t, result.Record.StartedAt)
|
||||
assert.Equal(t, h.now, *result.Record.StartedAt)
|
||||
assert.Equal(t, h.now, result.Record.LastOpAt)
|
||||
assert.Equal(t, h.now, result.Record.CreatedAt)
|
||||
|
||||
require.Len(t, h.records.upserts, 1)
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OpKindStart, last.OpKind)
|
||||
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
|
||||
assert.Empty(t, last.ErrorCode)
|
||||
assert.Equal(t, "ctr-123", last.ContainerID)
|
||||
|
||||
require.Len(t, h.healthEvents.envelopes, 1)
|
||||
assert.Equal(t, health.EventTypeContainerStarted, h.healthEvents.envelopes[0].EventType)
|
||||
var details map[string]string
|
||||
require.NoError(t, json.Unmarshal(h.healthEvents.envelopes[0].Details, &details))
|
||||
assert.Equal(t, input.ImageRef, details["image_ref"])
|
||||
|
||||
assert.Empty(t, h.notifications.intents, "no notification intent expected on success")
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.acquires)
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.releases)
|
||||
assert.Equal(t, []string{input.GameID}, h.lobby.calls)
|
||||
}
|
||||
|
||||
// --- idempotent replay ------------------------------------------------
|
||||
|
||||
func TestHandleReplayNoOpForRunningRecordWithSameImageRef(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
startedAt := h.now.Add(-time.Hour)
|
||||
h.records.stored[input.GameID] = runtime.RuntimeRecord{
|
||||
GameID: input.GameID,
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-prev",
|
||||
CurrentImageRef: input.ImageRef,
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: h.stateDir,
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
|
||||
assert.Equal(t, "ctr-prev", result.Record.CurrentContainerID)
|
||||
|
||||
assert.Empty(t, h.records.upserts, "replay must not Upsert a fresh record")
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
|
||||
assert.Equal(t, "ctr-prev", last.ContainerID)
|
||||
assert.Empty(t, h.notifications.intents)
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.releases, "lease must be released after replay no-op")
|
||||
}
|
||||
|
||||
// --- conflicts --------------------------------------------------------
|
||||
|
||||
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquired = false
|
||||
input := basicInput()
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)
|
||||
|
||||
assert.Empty(t, h.notifications.intents, "lease conflicts must not raise admin notifications")
|
||||
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
|
||||
}
|
||||
|
||||
func TestHandleConflictWhenRunningWithDifferentImageRef(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
startedAt := h.now.Add(-time.Hour)
|
||||
h.records.stored[input.GameID] = runtime.RuntimeRecord{
|
||||
GameID: input.GameID,
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-prev",
|
||||
CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: h.stateDir,
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, last.ErrorCode)
|
||||
assert.Empty(t, h.notifications.intents)
|
||||
assert.Empty(t, h.records.upserts)
|
||||
}
|
||||
|
||||
// --- start_config_invalid ---------------------------------------------
|
||||
|
||||
func TestHandleStartConfigInvalidWhenImageRefMalformed(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
input.ImageRef = "::not a docker reference::"
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
|
||||
|
||||
require.Len(t, h.notifications.intents, 1)
|
||||
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
|
||||
}
|
||||
|
||||
func TestHandleStartConfigInvalidWhenNetworkMissing(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(ports.ErrNetworkMissing)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
|
||||
require.Len(t, h.notifications.intents, 1)
|
||||
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
|
||||
}
|
||||
|
||||
func TestHandleStartConfigInvalidWhenStateDirFails(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
|
||||
|
||||
service, err := startruntime.NewService(startruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Notifications: h.notifications,
|
||||
Lobby: h.lobby,
|
||||
Container: config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
},
|
||||
DockerCfg: config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
},
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "token-A" },
|
||||
PrepareStateDir: func(_ string) (string, error) {
|
||||
return "", errors.New("disk full")
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
|
||||
require.Len(t, h.notifications.intents, 1)
|
||||
assert.Equal(t, notificationintent.NotificationTypeRuntimeStartConfigInvalid, h.notifications.intents[0].NotificationType)
|
||||
}
|
||||
|
||||
// --- image_pull_failed ------------------------------------------------
|
||||
|
||||
func TestHandleImagePullFailed(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(errors.New("manifest unknown"))
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, result.ErrorCode)
|
||||
require.Len(t, h.notifications.intents, 1)
|
||||
assert.Equal(t, notificationintent.NotificationTypeRuntimeImagePullFailed, h.notifications.intents[0].NotificationType)
|
||||
assert.Empty(t, h.records.upserts)
|
||||
}
|
||||
|
||||
// --- container_start_failed ------------------------------------------
|
||||
|
||||
func TestHandleContainerStartFailedOnRunError(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(ports.RunResult{}, errors.New("container name conflict"))
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
|
||||
require.Len(t, h.notifications.intents, 1)
|
||||
assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
|
||||
assert.Empty(t, h.records.upserts)
|
||||
}
|
||||
|
||||
func TestHandleRollsBackContainerWhenUpsertFails(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.upsertErr = errors.New("connection refused")
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
h.docker.EXPECT().Remove(gomock.Any(), "ctr-123").Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeContainerStartFailed, result.ErrorCode)
|
||||
require.Len(t, h.notifications.intents, 1)
|
||||
assert.Equal(t, notificationintent.NotificationTypeRuntimeContainerStartFailed, h.notifications.intents[0].NotificationType)
|
||||
}
|
||||
|
||||
// --- best-effort degradation -----------------------------------------
|
||||
|
||||
func TestHandleSuccessSurvivesOperationLogFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.operationLogs.appendErr = errors.New("postgres down")
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Len(t, h.records.upserts, 1)
|
||||
}
|
||||
|
||||
func TestHandleSuccessSurvivesHealthPublishFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.healthEvents.publishErr = errors.New("redis down")
|
||||
input := basicInput()
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Len(t, h.records.upserts, 1)
|
||||
}
|
||||
|
||||
// --- pre-existing stopped record proceeds with fresh start ----------
|
||||
|
||||
func TestHandlePreservesCreatedAtForExistingRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
input := basicInput()
|
||||
originalCreatedAt := h.now.Add(-72 * time.Hour)
|
||||
stoppedAt := h.now.Add(-time.Hour)
|
||||
h.records.stored[input.GameID] = runtime.RuntimeRecord{
|
||||
GameID: input.GameID,
|
||||
Status: runtime.StatusStopped,
|
||||
CurrentImageRef: "registry.example.com/galaxy/game:1.4.6",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: h.stateDir,
|
||||
DockerNetwork: "galaxy-net",
|
||||
StoppedAt: &stoppedAt,
|
||||
LastOpAt: stoppedAt,
|
||||
CreatedAt: originalCreatedAt,
|
||||
}
|
||||
|
||||
h.docker.EXPECT().EnsureNetwork(gomock.Any(), "galaxy-net").Return(nil)
|
||||
h.docker.EXPECT().PullImage(gomock.Any(), input.ImageRef, gomock.Any()).Return(nil)
|
||||
h.docker.EXPECT().InspectImage(gomock.Any(), input.ImageRef).Return(ports.ImageInspect{Ref: input.ImageRef}, nil)
|
||||
h.docker.EXPECT().Run(gomock.Any(), gomock.Any()).Return(sampleRunResult(h.now), nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, originalCreatedAt, result.Record.CreatedAt, "created_at must be preserved across re-starts")
|
||||
assert.Equal(t, runtime.StatusRunning, result.Record.Status)
|
||||
assert.Equal(t, input.ImageRef, result.Record.CurrentImageRef)
|
||||
}
|
||||
|
||||
// --- input validation -----------------------------------------------
|
||||
|
||||
func TestHandleRejectsInvalidInput(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t)
|
||||
|
||||
cases := []startruntime.Input{
|
||||
{GameID: "", ImageRef: "x", OpSource: operation.OpSourceLobbyStream},
|
||||
{GameID: "g", ImageRef: "", OpSource: operation.OpSourceLobbyStream},
|
||||
{GameID: "g", ImageRef: "x", OpSource: operation.OpSource("bogus")},
|
||||
}
|
||||
for _, input := range cases {
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeStartConfigInvalid, result.ErrorCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
deps := startruntime.Dependencies{
|
||||
Container: config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
},
|
||||
DockerCfg: config.DockerConfig{
|
||||
Host: "unix:///var/run/docker.sock",
|
||||
Network: "galaxy-net",
|
||||
LogDriver: "json-file",
|
||||
PullPolicy: config.ImagePullPolicyIfMissing,
|
||||
},
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
}
|
||||
_, err := startruntime.NewService(deps)
|
||||
require.Error(t, err)
|
||||
}
|
||||
@@ -0,0 +1,612 @@
|
||||
// Package stopruntime implements the `stop` lifecycle operation owned by
|
||||
// Runtime Manager. The service is the single orchestrator behind both
|
||||
// the asynchronous `runtime:stop_jobs` consumer and the synchronous
|
||||
// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is
|
||||
// also the inner stop step of the restart and patch services, which
|
||||
// call Run while holding the outer per-game lease.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Stop`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`.
|
||||
package stopruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
|
||||
// background context is used so the release runs even when the request
|
||||
// context was already canceled.
|
||||
const leaseReleaseTimeout = 5 * time.Second
|
||||
|
||||
// Input stores the per-call arguments for one stop operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game to stop.
|
||||
GameID string
|
||||
|
||||
// Reason classifies the trigger of the stop. Required.
|
||||
Reason StopReason
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
// Required: every operation_log entry carries an op_source.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference (Redis
|
||||
// Stream entry id, REST request id, admin user id). Empty when the
|
||||
// caller does not provide one. For inner calls invoked by the
|
||||
// restart and patch orchestrators it carries the outer correlation
|
||||
// id so the three operation_log entries share it.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
if err := input.Reason.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
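
// An illustrative sketch (the literal values below mirror the test fixtures
// and are not part of the frozen contract): a caller translating one
// `runtime:stop_jobs` entry would typically build and validate an Input like
// this before invoking Handle:
//
//	input := Input{
//		GameID:    "game-1",
//		Reason:    StopReasonCancelled,
//		OpSource:  operation.OpSourceLobbyStream,
//		SourceRef: "1700000000000-0", // Redis Stream entry id
//	}
//	if err := input.Validate(); err != nil {
//		// reject the job with startruntime.ErrorCodeInvalidRequest
//	}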
|
||||
|
||||
// Result stores the deterministic outcome of one Handle / Run call.
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the operation.
|
||||
// Populated on success and on idempotent replay; zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure, or
|
||||
// `replay_no_op` on idempotent replay. Empty for fresh successes.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
// Empty for successes.
|
||||
ErrorMessage string
|
||||
}
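
// The three terminal shapes a caller can observe (literal error-code strings
// are shown for illustration only; the constants live in the startruntime
// package):
//
//	fresh success:     Outcome == success, ErrorCode == ""
//	idempotent replay: Outcome == success, ErrorCode == "replay_no_op"
//	stable failure:    Outcome == failure, ErrorCode set (e.g. not found), ErrorMessage set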
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
// RuntimeRecords reads and updates the durable runtime record.
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
|
||||
// OperationLogs records the success / failure audit entry.
|
||||
OperationLogs ports.OperationLogStore
|
||||
|
||||
// Docker drives the Docker daemon (container stop).
|
||||
Docker ports.DockerClient
|
||||
|
||||
// Leases serialises operations against the same game id.
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
// HealthEvents publishes `runtime:health_events` and upserts the
|
||||
// matching `health_snapshots` row. Used on the vanished-container
|
||||
// path to emit `container_disappeared`.
|
||||
HealthEvents ports.HealthEventPublisher
|
||||
|
||||
// Container groups the per-container settings consumed at stop time
|
||||
// (the graceful stop timeout).
|
||||
Container config.ContainerConfig
|
||||
|
||||
// Coordination supplies the per-game lease TTL.
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
// Telemetry records stop outcomes and lease latency. Required.
|
||||
Telemetry *telemetry.Runtime
|
||||
|
||||
// Logger records structured service-level events. Defaults to
|
||||
// `slog.Default()` when nil.
|
||||
Logger *slog.Logger
|
||||
|
||||
// Clock supplies the wall-clock used for operation timestamps.
|
||||
// Defaults to `time.Now` when nil.
|
||||
Clock func() time.Time
|
||||
|
||||
// NewToken supplies a unique opaque lease token. Defaults to a
|
||||
// 32-byte random base64url string when nil. Tests may override.
|
||||
NewToken func() string
|
||||
}
|
||||
|
||||
// Service executes the stop lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
healthEvents ports.HealthEventPublisher
|
||||
|
||||
stopTimeout time.Duration
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new stop runtime service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new stop runtime service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new stop runtime service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new stop runtime service: nil lease store")
|
||||
case deps.HealthEvents == nil:
|
||||
return nil, errors.New("new stop runtime service: nil health events publisher")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new stop runtime service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Container.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new stop runtime service: container config: %w", err)
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.stopruntime")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
healthEvents: deps.HealthEvents,
|
||||
stopTimeout: deps.Container.StopTimeout,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
}, nil
|
||||
}
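
// A minimal wiring sketch, assuming the Postgres, Redis, and Docker adapters
// have already been constructed elsewhere (all variable names below are
// hypothetical):
//
//	service, err := NewService(Dependencies{
//		RuntimeRecords: recordStore,
//		OperationLogs:  operationLogStore,
//		Docker:         dockerAdapter,
//		Leases:         leaseStore,
//		HealthEvents:   healthPublisher,
//		Container:      cfg.Container,
//		Coordination:   cfg.Coordination,
//		Telemetry:      telemetryRuntime,
//		Logger:         logger,
//	})
//	if err != nil {
//		return fmt.Errorf("wire stop runtime service: %w", err)
//	}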
|
||||
|
||||
// Handle executes one stop operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — success, idempotent replay, or
|
||||
// any of the stable failure modes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("stop runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("stop runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInvalidRequest,
|
||||
errorMessage: err.Error(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
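
// Callers branch on Result rather than on the Go error. A hedged sketch of
// how a hypothetical REST handler could map the outcome (the status codes
// are illustrative, not taken from the frozen API spec):
//
//	result, err := service.Handle(ctx, input)
//	if err != nil {
//		// programming error (nil service or nil context): respond 500
//	}
//	switch {
//	case result.Outcome == operation.OutcomeSuccess:
//		// 200, covering fresh success and the replay_no_op error code
//	case result.ErrorCode == startruntime.ErrorCodeNotFound:
//		// 404
//	case result.ErrorCode == startruntime.ErrorCodeConflict:
//		// 409
//	default:
//		// 5xx with the stable error code and message in the body
//	}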
|
||||
|
||||
// Run executes the stop lifecycle assuming the per-game lease is
|
||||
// already held by the caller. The method is reserved for orchestrator
|
||||
// services in `internal/service/` that compose stop with another
|
||||
// operation under a single outer lease (restart and patch). External
|
||||
// callers must use Handle.
|
||||
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("stop runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("stop runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInvalidRequest,
|
||||
errorMessage: err.Error(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
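
// A sketch of the inner-call convention described above, assuming a
// hypothetical restart orchestrator that already holds the outer per-game
// lease. It forwards its own correlation id as SourceRef so the three
// operation_log entries share it; the reason value it supplies is an
// assumption of this sketch.
//
//	stopResult, err := stopService.Run(ctx, Input{
//		GameID:    outerInput.GameID,
//		Reason:    outerStopReason,
//		OpSource:  outerInput.OpSource,
//		SourceRef: outerCorrelationID,
//	})
//	if err != nil || stopResult.Outcome != operation.OutcomeSuccess {
//		// abort the restart; the caller still owns the outer lease
//	}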
|
||||
|
||||
// runUnderLease executes the post-validation, lease-protected stop
|
||||
// steps shared by Handle and Run.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
|
||||
switch existing.Status {
|
||||
case runtime.StatusStopped, runtime.StatusRemoved:
|
||||
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
|
||||
case runtime.StatusRunning:
|
||||
// proceed
|
||||
default:
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
|
||||
}), nil
|
||||
}
|
||||
|
||||
if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil {
|
||||
if errors.Is(err, ports.ErrContainerNotFound) {
|
||||
return service.handleVanished(ctx, input, opStartedAt, existing), nil
|
||||
}
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("docker stop: %s", err.Error()),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
updateNow := service.clock().UTC()
|
||||
err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
ExpectedContainerID: existing.CurrentContainerID,
|
||||
To: runtime.StatusStopped,
|
||||
Now: updateNow,
|
||||
})
|
||||
if errors.Is(err, runtime.ErrConflict) {
|
||||
// CAS race: a concurrent reconciler / restart already moved the
|
||||
// record. The desired terminal state was reached by another path.
|
||||
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
|
||||
}
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindStop,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: existing.CurrentImageRef,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
|
||||
|
||||
record := existing
|
||||
record.Status = runtime.StatusStopped
|
||||
stoppedAt := updateNow
|
||||
record.StoppedAt = &stoppedAt
|
||||
record.LastOpAt = updateNow
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", existing.CurrentContainerID,
|
||||
"reason", string(input.Reason),
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime stopped", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// handleVanished handles the case where docker stop reports the container
// as already gone: it moves the record to removed, publishes
// container_disappeared, and records a success outcome. A CAS conflict on
// the record degrades to the idempotent replay result; a store error maps
// to a stable failure.
|
||||
func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result {
|
||||
updateNow := service.clock().UTC()
|
||||
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
|
||||
GameID: input.GameID,
|
||||
ExpectedFrom: runtime.StatusRunning,
|
||||
ExpectedContainerID: existing.CurrentContainerID,
|
||||
To: runtime.StatusRemoved,
|
||||
Now: updateNow,
|
||||
})
|
||||
if errors.Is(err, runtime.ErrConflict) {
|
||||
return service.recordReplayNoOp(ctx, opStartedAt, input, existing)
|
||||
}
|
||||
if err != nil && !errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()),
|
||||
containerID: existing.CurrentContainerID,
|
||||
imageRef: existing.CurrentImageRef,
|
||||
})
|
||||
}
|
||||
|
||||
service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
|
||||
GameID: input.GameID,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
EventType: health.EventTypeContainerDisappeared,
|
||||
OccurredAt: updateNow,
|
||||
Details: emptyHealthDetails(),
|
||||
})
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindStop,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: existing.CurrentImageRef,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
|
||||
service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared))
|
||||
|
||||
record := existing
|
||||
record.Status = runtime.StatusRemoved
|
||||
record.CurrentContainerID = ""
|
||||
removedAt := updateNow
|
||||
record.RemovedAt = &removedAt
|
||||
record.LastOpAt = updateNow
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", existing.CurrentContainerID,
|
||||
"reason", string(input.Reason),
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}
|
||||
}
|
||||
|
||||
// recordReplayNoOp records the idempotent replay outcome and returns the
|
||||
// existing record unchanged.
|
||||
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindStop,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: input.SourceRef,
|
||||
ImageRef: existing.CurrentImageRef,
|
||||
ContainerID: existing.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: startruntime.ErrorCodeReplayNoOp,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"container_id", existing.CurrentContainerID,
|
||||
"reason", string(input.Reason),
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime stop replay no-op", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: existing,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
ErrorCode: startruntime.ErrorCodeReplayNoOp,
|
||||
}
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure so the runUnderLease
|
||||
// method stays readable.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
containerID string
|
||||
imageRef string
|
||||
}
|
||||
|
||||
// recordFailure records the failure operation_log entry and emits
|
||||
// telemetry. The runtime record stays untouched.
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindStop,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: fc.input.SourceRef,
|
||||
ImageRef: fc.imageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource))
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"reason", string(fc.input.Reason),
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime stop failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
// releaseLease releases the per-game lease in a fresh background context
|
||||
// so a canceled request context does not leave the lease pinned for its
|
||||
// TTL.
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortAppend writes one operation_log entry. A failure is logged
|
||||
// and discarded; the durable runtime record (or its absence) remains
|
||||
// the source of truth.
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// bestEffortPublishHealth emits one health event + snapshot upsert. A
// publish failure is logged and otherwise degrades silently per
// `rtmanager/README.md` §Notification Contracts; the runtime record
// remains the source of truth.
|
||||
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
|
||||
if err := service.healthEvents.Publish(ctx, envelope); err != nil {
|
||||
service.logger.ErrorContext(ctx, "publish health event",
|
||||
"game_id", envelope.GameID,
|
||||
"container_id", envelope.ContainerID,
|
||||
"event_type", string(envelope.EventType),
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// defaultTokenGenerator returns a function that produces 32-byte
|
||||
// base64url-encoded tokens. Mirrors the start service: a degraded
|
||||
// entropy source falls back to a sentinel token so the next TryAcquire
|
||||
// observes a collision rather than a panic.
|
||||
func defaultTokenGenerator() func() string {
|
||||
return func() string {
|
||||
var buf [32]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
return "rtmanager-fallback-token"
|
||||
}
|
||||
return base64.RawURLEncoding.EncodeToString(buf[:])
|
||||
}
|
||||
}
|
||||
|
||||
// emptyHealthDetails returns the canonical empty-object payload required
|
||||
// by the `container_disappeared` AsyncAPI variant.
|
||||
func emptyHealthDetails() json.RawMessage {
|
||||
return json.RawMessage("{}")
|
||||
}
|
||||
@@ -0,0 +1,537 @@
|
||||
package stopruntime_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/adapters/docker/mocks"
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
)
|
||||
|
||||
// --- test doubles -----------------------------------------------------
|
||||
|
||||
type fakeRuntimeRecords struct {
|
||||
mu sync.Mutex
|
||||
|
||||
stored map[string]runtime.RuntimeRecord
|
||||
getErr error
|
||||
updateStatusErr error
|
||||
|
||||
updates []ports.UpdateStatusInput
|
||||
}
|
||||
|
||||
func newFakeRuntimeRecords() *fakeRuntimeRecords {
|
||||
return &fakeRuntimeRecords{stored: map[string]runtime.RuntimeRecord{}}
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Get(_ context.Context, gameID string) (runtime.RuntimeRecord, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.getErr != nil {
|
||||
return runtime.RuntimeRecord{}, s.getErr
|
||||
}
|
||||
record, ok := s.stored[gameID]
|
||||
if !ok {
|
||||
return runtime.RuntimeRecord{}, runtime.ErrNotFound
|
||||
}
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) Upsert(_ context.Context, _ runtime.RuntimeRecord) error {
|
||||
return errors.New("not used in stop tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) UpdateStatus(_ context.Context, input ports.UpdateStatusInput) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.updates = append(s.updates, input)
|
||||
if s.updateStatusErr != nil {
|
||||
return s.updateStatusErr
|
||||
}
|
||||
record, ok := s.stored[input.GameID]
|
||||
if !ok {
|
||||
return runtime.ErrNotFound
|
||||
}
|
||||
if record.Status != input.ExpectedFrom {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
if input.ExpectedContainerID != "" && record.CurrentContainerID != input.ExpectedContainerID {
|
||||
return runtime.ErrConflict
|
||||
}
|
||||
record.Status = input.To
|
||||
record.LastOpAt = input.Now
|
||||
switch input.To {
|
||||
case runtime.StatusStopped:
|
||||
stoppedAt := input.Now
|
||||
record.StoppedAt = &stoppedAt
|
||||
case runtime.StatusRemoved:
|
||||
removedAt := input.Now
|
||||
record.RemovedAt = &removedAt
|
||||
record.CurrentContainerID = ""
|
||||
}
|
||||
s.stored[input.GameID] = record
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) ListByStatus(_ context.Context, _ runtime.Status) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in stop tests")
|
||||
}
|
||||
|
||||
func (s *fakeRuntimeRecords) List(_ context.Context) ([]runtime.RuntimeRecord, error) {
|
||||
return nil, errors.New("not used in stop tests")
|
||||
}
|
||||
|
||||
type fakeOperationLogs struct {
|
||||
mu sync.Mutex
|
||||
|
||||
appendErr error
|
||||
appends []operation.OperationEntry
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) Append(_ context.Context, entry operation.OperationEntry) (int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if s.appendErr != nil {
|
||||
return 0, s.appendErr
|
||||
}
|
||||
s.appends = append(s.appends, entry)
|
||||
return int64(len(s.appends)), nil
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) ListByGame(_ context.Context, _ string, _ int) ([]operation.OperationEntry, error) {
|
||||
return nil, errors.New("not used in stop tests")
|
||||
}
|
||||
|
||||
func (s *fakeOperationLogs) lastAppend() (operation.OperationEntry, bool) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if len(s.appends) == 0 {
|
||||
return operation.OperationEntry{}, false
|
||||
}
|
||||
return s.appends[len(s.appends)-1], true
|
||||
}
|
||||
|
||||
type fakeLeases struct {
|
||||
acquired bool
|
||||
acquireErr error
|
||||
releaseErr error
|
||||
|
||||
mu sync.Mutex
|
||||
acquires []string
|
||||
releases []string
|
||||
}
|
||||
|
||||
func (l *fakeLeases) TryAcquire(_ context.Context, _, token string, _ time.Duration) (bool, error) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.acquires = append(l.acquires, token)
|
||||
if l.acquireErr != nil {
|
||||
return false, l.acquireErr
|
||||
}
|
||||
return l.acquired, nil
|
||||
}
|
||||
|
||||
func (l *fakeLeases) Release(_ context.Context, _, token string) error {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
l.releases = append(l.releases, token)
|
||||
return l.releaseErr
|
||||
}
|
||||
|
||||
type fakeHealthEvents struct {
|
||||
mu sync.Mutex
|
||||
|
||||
publishErr error
|
||||
envelopes []ports.HealthEventEnvelope
|
||||
}
|
||||
|
||||
func (h *fakeHealthEvents) Publish(_ context.Context, envelope ports.HealthEventEnvelope) error {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
if h.publishErr != nil {
|
||||
return h.publishErr
|
||||
}
|
||||
h.envelopes = append(h.envelopes, envelope)
|
||||
return nil
|
||||
}
|
||||
|
||||
// --- harness ----------------------------------------------------------
|
||||
|
||||
type harness struct {
|
||||
records *fakeRuntimeRecords
|
||||
operationLogs *fakeOperationLogs
|
||||
docker *mocks.MockDockerClient
|
||||
leases *fakeLeases
|
||||
healthEvents *fakeHealthEvents
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
|
||||
now time.Time
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *harness {
|
||||
t.Helper()
|
||||
ctrl := gomock.NewController(t)
|
||||
t.Cleanup(ctrl.Finish)
|
||||
|
||||
telemetryRuntime, err := telemetry.NewWithProviders(nil, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
return &harness{
|
||||
records: newFakeRuntimeRecords(),
|
||||
operationLogs: &fakeOperationLogs{},
|
||||
docker: mocks.NewMockDockerClient(ctrl),
|
||||
leases: &fakeLeases{acquired: true},
|
||||
healthEvents: &fakeHealthEvents{},
|
||||
telemetry: telemetryRuntime,
|
||||
now: time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *harness) build(t *testing.T) *stopruntime.Service {
|
||||
t.Helper()
|
||||
|
||||
containerCfg := config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
}
|
||||
coordinationCfg := config.CoordinationConfig{GameLeaseTTL: time.Minute}
|
||||
|
||||
service, err := stopruntime.NewService(stopruntime.Dependencies{
|
||||
RuntimeRecords: h.records,
|
||||
OperationLogs: h.operationLogs,
|
||||
Docker: h.docker,
|
||||
Leases: h.leases,
|
||||
HealthEvents: h.healthEvents,
|
||||
Container: containerCfg,
|
||||
Coordination: coordinationCfg,
|
||||
Telemetry: h.telemetry,
|
||||
Clock: func() time.Time { return h.now },
|
||||
NewToken: func() string { return "token-A" },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return service
|
||||
}
|
||||
|
||||
func basicInput() stopruntime.Input {
|
||||
return stopruntime.Input{
|
||||
GameID: "game-1",
|
||||
Reason: stopruntime.StopReasonCancelled,
|
||||
OpSource: operation.OpSourceLobbyStream,
|
||||
SourceRef: "1700000000000-0",
|
||||
}
|
||||
}
|
||||
|
||||
func runningRecord(now time.Time) runtime.RuntimeRecord {
|
||||
startedAt := now.Add(-time.Hour)
|
||||
return runtime.RuntimeRecord{
|
||||
GameID: "game-1",
|
||||
Status: runtime.StatusRunning,
|
||||
CurrentContainerID: "ctr-123",
|
||||
CurrentImageRef: "registry.example.com/galaxy/game:1.4.7",
|
||||
EngineEndpoint: "http://galaxy-game-game-1:8080",
|
||||
StatePath: "/var/lib/galaxy/games/game-1",
|
||||
DockerNetwork: "galaxy-net",
|
||||
StartedAt: &startedAt,
|
||||
LastOpAt: startedAt,
|
||||
CreatedAt: startedAt,
|
||||
}
|
||||
}
|
||||
|
||||
// --- happy path -------------------------------------------------------
|
||||
|
||||
func TestHandleHappyPath(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Equal(t, runtime.StatusStopped, result.Record.Status)
|
||||
require.NotNil(t, result.Record.StoppedAt)
|
||||
assert.Equal(t, h.now, *result.Record.StoppedAt)
|
||||
assert.Equal(t, h.now, result.Record.LastOpAt)
|
||||
|
||||
require.Len(t, h.records.updates, 1)
|
||||
assert.Equal(t, runtime.StatusRunning, h.records.updates[0].ExpectedFrom)
|
||||
assert.Equal(t, runtime.StatusStopped, h.records.updates[0].To)
|
||||
assert.Equal(t, "ctr-123", h.records.updates[0].ExpectedContainerID)
|
||||
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OpKindStop, last.OpKind)
|
||||
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
|
||||
assert.Empty(t, last.ErrorCode)
|
||||
assert.Equal(t, "ctr-123", last.ContainerID)
|
||||
|
||||
assert.Empty(t, h.healthEvents.envelopes)
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.acquires)
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.releases)
|
||||
}
|
||||
|
||||
// --- replay ----------------------------------------------------------
|
||||
|
||||
func TestHandleReplayNoOpForStoppedRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
stoppedRecord := runningRecord(h.now)
|
||||
stoppedRecord.Status = runtime.StatusStopped
|
||||
stoppedAt := h.now.Add(-time.Minute)
|
||||
stoppedRecord.StoppedAt = &stoppedAt
|
||||
h.records.stored["game-1"] = stoppedRecord
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
|
||||
assert.Equal(t, runtime.StatusStopped, result.Record.Status)
|
||||
|
||||
assert.Empty(t, h.records.updates)
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, last.ErrorCode)
|
||||
assert.Equal(t, []string{"token-A"}, h.leases.releases)
|
||||
}
|
||||
|
||||
func TestHandleReplayNoOpForRemovedRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
removed := runningRecord(h.now)
|
||||
removed.Status = runtime.StatusRemoved
|
||||
removed.CurrentContainerID = ""
|
||||
removedAt := h.now.Add(-time.Minute)
|
||||
removed.RemovedAt = &removedAt
|
||||
h.records.stored["game-1"] = removed
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- vanished container ----------------------------------------------
|
||||
|
||||
func TestHandleVanishedContainerMarksRemoved(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, result.ErrorCode)
|
||||
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
|
||||
assert.Empty(t, result.Record.CurrentContainerID)
|
||||
|
||||
require.Len(t, h.records.updates, 1)
|
||||
assert.Equal(t, runtime.StatusRemoved, h.records.updates[0].To)
|
||||
|
||||
require.Len(t, h.healthEvents.envelopes, 1)
|
||||
assert.Equal(t, health.EventTypeContainerDisappeared, h.healthEvents.envelopes[0].EventType)
|
||||
|
||||
require.Len(t, h.operationLogs.appends, 1)
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OutcomeSuccess, last.Outcome)
|
||||
assert.Empty(t, last.ErrorCode)
|
||||
}
|
||||
|
||||
// --- failure paths ---------------------------------------------------
|
||||
|
||||
func TestHandleNotFoundForMissingRecord(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeNotFound, result.ErrorCode)
|
||||
assert.Empty(t, h.healthEvents.envelopes)
|
||||
assert.Empty(t, h.records.updates)
|
||||
}
|
||||
|
||||
func TestHandleServiceUnavailableOnDockerError(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(errors.New("docker daemon timeout"))
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
|
||||
last, _ := h.operationLogs.lastAppend()
|
||||
assert.Equal(t, operation.OutcomeFailure, last.Outcome)
|
||||
assert.Equal(t, "ctr-123", last.ContainerID)
|
||||
assert.Empty(t, h.records.updates, "no record mutation on docker stop failure")
|
||||
}
|
||||
|
||||
func TestHandleReplayNoOpOnUpdateStatusConflict(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
h.records.updateStatusErr = runtime.ErrConflict
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeReplayNoOp, result.ErrorCode)
|
||||
}
|
||||
|
||||
func TestHandleInternalErrorOnUpdateStatusGenericError(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
h.records.updateStatusErr = errors.New("postgres down")
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeInternal, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- conflicts -------------------------------------------------------
|
||||
|
||||
func TestHandleConflictWhenLeaseBusy(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquired = false
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeConflict, result.ErrorCode)
|
||||
|
||||
assert.Empty(t, h.leases.releases, "release must not run when acquire returned false")
|
||||
}
|
||||
|
||||
func TestHandleServiceUnavailableOnLeaseError(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.leases.acquireErr = errors.New("redis timeout")
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeFailure, result.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeServiceUnavailable, result.ErrorCode)
|
||||
}
|
||||
|
||||
// --- input validation ------------------------------------------------
|
||||
|
||||
func TestHandleRejectsInvalidInput(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
service := h.build(t)
|
||||
|
||||
cases := []stopruntime.Input{
|
||||
{GameID: "", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSourceLobbyStream},
|
||||
{GameID: "g", Reason: "", OpSource: operation.OpSourceLobbyStream},
|
||||
{GameID: "g", Reason: stopruntime.StopReason("bogus"), OpSource: operation.OpSourceLobbyStream},
|
||||
{GameID: "g", Reason: stopruntime.StopReasonCancelled, OpSource: operation.OpSource("bogus")},
|
||||
}
|
||||
for _, input := range cases {
|
||||
result, err := service.Handle(context.Background(), input)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, startruntime.ErrorCodeInvalidRequest, result.ErrorCode)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Run path (no-lease) ---------------------------------------------
|
||||
|
||||
func TestRunSkipsLease(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
h.leases.acquired = false // would block Handle; Run must ignore
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Run(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Empty(t, h.leases.acquires, "Run must not touch the lease store")
|
||||
assert.Empty(t, h.leases.releases)
|
||||
}
|
||||
|
||||
// --- best-effort degradation ----------------------------------------
|
||||
|
||||
func TestHandleSurvivesOperationLogFailure(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
h.operationLogs.appendErr = errors.New("postgres down")
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(nil)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
}
|
||||
|
||||
func TestHandleSurvivesHealthPublishFailureOnVanished(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.records.stored["game-1"] = runningRecord(h.now)
|
||||
h.healthEvents.publishErr = errors.New("redis down")
|
||||
|
||||
h.docker.EXPECT().Stop(gomock.Any(), "ctr-123", 30*time.Second).Return(ports.ErrContainerNotFound)
|
||||
|
||||
service := h.build(t)
|
||||
result, err := service.Handle(context.Background(), basicInput())
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, operation.OutcomeSuccess, result.Outcome)
|
||||
assert.Equal(t, runtime.StatusRemoved, result.Record.Status)
|
||||
}
|
||||
|
||||
// --- constructor -----------------------------------------------------
|
||||
|
||||
func TestNewServiceRejectsMissingDependencies(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
deps := stopruntime.Dependencies{
|
||||
Container: config.ContainerConfig{
|
||||
DefaultCPUQuota: 1.0,
|
||||
DefaultMemory: "512m",
|
||||
DefaultPIDsLimit: 512,
|
||||
StopTimeout: 30 * time.Second,
|
||||
Retention: 30 * 24 * time.Hour,
|
||||
EngineStateMountPath: "/var/lib/galaxy-game",
|
||||
EngineStateEnvName: "GAME_STATE_PATH",
|
||||
GameStateDirMode: 0o750,
|
||||
GameStateRoot: "/var/lib/galaxy/games",
|
||||
},
|
||||
Coordination: config.CoordinationConfig{GameLeaseTTL: time.Minute},
|
||||
Telemetry: h.telemetry,
|
||||
}
|
||||
_, err := stopruntime.NewService(deps)
|
||||
require.Error(t, err)
|
||||
}
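
// TestAllStopReasonsValidateCleanly is an illustrative addition: it sketches
// how the frozen stop-reason vocabulary declared in stopreason.go can be
// covered so IsKnown, Validate, and AllStopReasons cannot drift apart.
func TestAllStopReasonsValidateCleanly(t *testing.T) {
	for _, reason := range stopruntime.AllStopReasons() {
		assert.True(t, reason.IsKnown(), "reason %q must be known", reason)
		require.NoError(t, reason.Validate())
	}
}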
|
||||
@@ -0,0 +1,82 @@
|
||||
package stopruntime
|
||||
|
||||
import "fmt"
|
||||
|
||||
// StopReason classifies why a caller is asking Runtime Manager to stop a
|
||||
// game container. The enum is part of the `runtime:stop_jobs` envelope
|
||||
// produced by Game Lobby and the body of the `POST
|
||||
// /api/v1/internal/runtimes/{game_id}/stop` REST endpoint, and mirrors
|
||||
// the AsyncAPI contract frozen in
|
||||
// `rtmanager/api/runtime-jobs-asyncapi.yaml`.
|
||||
//
|
||||
// The vocabulary is shared with `lobby/internal/ports/runtimemanager.go`;
|
||||
// the two declarations stay byte-identical and adding a new value
|
||||
// requires a coordinated contract bump on both sides.
|
||||
type StopReason string
|
||||
|
||||
// StopReason enum values. Adding a new value is a contract change that
|
||||
// touches the AsyncAPI spec, the Lobby producer, and every Runtime
|
||||
// Manager consumer.
|
||||
const (
|
||||
// StopReasonOrphanCleanup releases a container whose post-start
|
||||
// metadata persistence failed in Lobby.
|
||||
StopReasonOrphanCleanup StopReason = "orphan_cleanup"
|
||||
|
||||
// StopReasonCancelled covers user-lifecycle cascade and explicit
|
||||
// cancel paths for in-flight games.
|
||||
StopReasonCancelled StopReason = "cancelled"
|
||||
|
||||
// StopReasonFinished is reserved for engine-driven game finish flows.
|
||||
StopReasonFinished StopReason = "finished"
|
||||
|
||||
// StopReasonAdminRequest is reserved for admin-initiated stop paths.
|
||||
StopReasonAdminRequest StopReason = "admin_request"
|
||||
|
||||
// StopReasonTimeout is reserved for timeout-driven stop paths.
|
||||
StopReasonTimeout StopReason = "timeout"
|
||||
)
|
||||
|
||||
// IsKnown reports whether reason belongs to the frozen stop-reason
|
||||
// vocabulary.
|
||||
func (reason StopReason) IsKnown() bool {
|
||||
switch reason {
|
||||
case StopReasonOrphanCleanup,
|
||||
StopReasonCancelled,
|
||||
StopReasonFinished,
|
||||
StopReasonAdminRequest,
|
||||
StopReasonTimeout:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AllStopReasons returns the frozen list of every stop-reason value. The
|
||||
// slice order is stable across calls and matches the AsyncAPI enum order.
|
||||
func AllStopReasons() []StopReason {
|
||||
return []StopReason{
|
||||
StopReasonOrphanCleanup,
|
||||
StopReasonCancelled,
|
||||
StopReasonFinished,
|
||||
StopReasonAdminRequest,
|
||||
StopReasonTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
// String returns reason as its stored enum value. Useful in log fields
|
||||
// and telemetry attributes.
|
||||
func (reason StopReason) String() string {
|
||||
return string(reason)
|
||||
}
|
||||
|
||||
// Validate reports whether reason carries one of the five values fixed
|
||||
// by the AsyncAPI contract.
|
||||
func (reason StopReason) Validate() error {
|
||||
if reason == "" {
|
||||
return fmt.Errorf("stop reason must not be empty")
|
||||
}
|
||||
if !reason.IsKnown() {
|
||||
return fmt.Errorf("stop reason %q is unsupported", reason)
|
||||
}
|
||||
return nil
|
||||
}
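
// An illustrative decode-and-validate sketch (the envelope shape below is an
// assumption; the authoritative schema lives in
// `rtmanager/api/runtime-jobs-asyncapi.yaml`):
//
//	var job struct {
//		GameID string     `json:"game_id"`
//		Reason StopReason `json:"reason"`
//	}
//	if err := json.Unmarshal(payload, &job); err != nil {
//		// malformed envelope
//	}
//	if err := job.Reason.Validate(); err != nil {
//		// unsupported or empty reason → stable invalid-request failure
//	}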
|
||||
@@ -0,0 +1,651 @@
|
||||
// Package telemetry provides lightweight OpenTelemetry helpers and
|
||||
// low-cardinality Runtime Manager instruments used by the runnable
|
||||
// skeleton. Later stages emit into the instruments declared here without
|
||||
// touching this package.
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
|
||||
"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
oteltrace "go.opentelemetry.io/otel/trace"
|
||||
)
|
||||
|
||||
const meterName = "galaxy/rtmanager"
|
||||
|
||||
const (
|
||||
defaultServiceName = "galaxy-rtmanager"
|
||||
|
||||
processExporterNone = "none"
|
||||
processExporterOTLP = "otlp"
|
||||
processProtocolHTTPProtobuf = "http/protobuf"
|
||||
processProtocolGRPC = "grpc"
|
||||
)
|
||||
|
||||
// ProcessConfig configures the process-wide OpenTelemetry runtime.
|
||||
type ProcessConfig struct {
|
||||
// ServiceName overrides the default OpenTelemetry service name.
|
||||
ServiceName string
|
||||
|
||||
// TracesExporter selects the external traces exporter. Supported values
|
||||
// are `none` and `otlp`.
|
||||
TracesExporter string
|
||||
|
||||
// MetricsExporter selects the external metrics exporter. Supported
|
||||
// values are `none` and `otlp`.
|
||||
MetricsExporter string
|
||||
|
||||
// TracesProtocol selects the OTLP traces protocol when TracesExporter is
|
||||
// `otlp`.
|
||||
TracesProtocol string
|
||||
|
||||
// MetricsProtocol selects the OTLP metrics protocol when
|
||||
// MetricsExporter is `otlp`.
|
||||
MetricsProtocol string
|
||||
|
||||
// StdoutTracesEnabled enables the additional stdout trace exporter used
|
||||
// for local development and debugging.
|
||||
StdoutTracesEnabled bool
|
||||
|
||||
// StdoutMetricsEnabled enables the additional stdout metric exporter
|
||||
// used for local development and debugging.
|
||||
StdoutMetricsEnabled bool
|
||||
}
|
||||
|
||||
// Validate reports whether cfg contains a supported OpenTelemetry exporter
|
||||
// configuration.
|
||||
func (cfg ProcessConfig) Validate() error {
|
||||
switch cfg.TracesExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported traces exporter %q", cfg.TracesExporter)
|
||||
}
|
||||
|
||||
switch cfg.MetricsExporter {
|
||||
case processExporterNone, processExporterOTLP:
|
||||
default:
|
||||
return fmt.Errorf("unsupported metrics exporter %q", cfg.MetricsExporter)
|
||||
}
|
||||
|
||||
if cfg.TracesProtocol != "" && cfg.TracesProtocol != processProtocolHTTPProtobuf && cfg.TracesProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP traces protocol %q", cfg.TracesProtocol)
|
||||
}
|
||||
if cfg.MetricsProtocol != "" && cfg.MetricsProtocol != processProtocolHTTPProtobuf && cfg.MetricsProtocol != processProtocolGRPC {
|
||||
return fmt.Errorf("unsupported OTLP metrics protocol %q", cfg.MetricsProtocol)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
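
// A hedged example of a production-style configuration (the OTLP endpoint is
// expected to come from the standard OTEL_* environment variables handled by
// the exporters; the literals below are illustrative):
//
//	cfg := ProcessConfig{
//		ServiceName:     "galaxy-rtmanager",
//		TracesExporter:  "otlp",
//		MetricsExporter: "otlp",
//		TracesProtocol:  "http/protobuf",
//		MetricsProtocol: "grpc",
//	}
//	if err := cfg.Validate(); err != nil {
//		// reject the process configuration at startup
//	}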
|
||||
|
||||
// Runtime owns the Runtime Manager OpenTelemetry providers and the
|
||||
// low-cardinality custom instruments listed in `rtmanager/README.md`
|
||||
// §Observability.
|
||||
type Runtime struct {
|
||||
tracerProvider oteltrace.TracerProvider
|
||||
meterProvider metric.MeterProvider
|
||||
meter metric.Meter
|
||||
|
||||
shutdownMu sync.Mutex
|
||||
shutdownDone bool
|
||||
shutdownErr error
|
||||
shutdownFns []func(context.Context) error
|
||||
|
||||
internalHTTPRequests metric.Int64Counter
|
||||
internalHTTPDuration metric.Float64Histogram
|
||||
|
||||
startOutcomes metric.Int64Counter
|
||||
stopOutcomes metric.Int64Counter
|
||||
restartOutcomes metric.Int64Counter
|
||||
patchOutcomes metric.Int64Counter
|
||||
cleanupOutcomes metric.Int64Counter
|
||||
healthEvents metric.Int64Counter
|
||||
reconcileDrift metric.Int64Counter
|
||||
notificationIntents metric.Int64Counter
|
||||
dockerOpLatency metric.Float64Histogram
|
||||
leaseAcquireLatency metric.Float64Histogram
|
||||
|
||||
runtimeRecordsByStatus metric.Int64ObservableGauge
|
||||
|
||||
gaugeMu sync.Mutex
|
||||
gaugeRegistration metric.Registration
|
||||
}
|
||||
|
||||
// NewWithProviders constructs a telemetry runtime around explicitly supplied
// meterProvider and tracerProvider values. Nil providers fall back to the
// process-global OpenTelemetry providers.
|
||||
func NewWithProviders(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider) (*Runtime, error) {
|
||||
if meterProvider == nil {
|
||||
meterProvider = otel.GetMeterProvider()
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
tracerProvider = otel.GetTracerProvider()
|
||||
}
|
||||
if meterProvider == nil {
|
||||
return nil, errors.New("new rtmanager telemetry runtime: nil meter provider")
|
||||
}
|
||||
if tracerProvider == nil {
|
||||
return nil, errors.New("new rtmanager telemetry runtime: nil tracer provider")
|
||||
}
|
||||
|
||||
return buildRuntime(meterProvider, tracerProvider, nil)
|
||||
}
|
||||
|
||||
// NewProcess constructs the process-wide Runtime Manager OpenTelemetry
|
||||
// runtime from cfg, installs the resulting providers globally, and
|
||||
// returns the runtime.
|
||||
func NewProcess(ctx context.Context, cfg ProcessConfig, logger *slog.Logger) (*Runtime, error) {
|
||||
if ctx == nil {
|
||||
return nil, errors.New("new rtmanager telemetry process: nil context")
|
||||
}
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager telemetry process: %w", err)
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
serviceName := strings.TrimSpace(cfg.ServiceName)
|
||||
if serviceName == "" {
|
||||
serviceName = defaultServiceName
|
||||
}
|
||||
|
||||
res := resource.NewSchemaless(attribute.String("service.name", serviceName))
|
||||
|
||||
tracerProvider, err := newTracerProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager telemetry process: tracer provider: %w", err)
|
||||
}
|
||||
meterProvider, err := newMeterProvider(ctx, res, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager telemetry process: meter provider: %w", err)
|
||||
}
|
||||
|
||||
otel.SetTracerProvider(tracerProvider)
|
||||
otel.SetMeterProvider(meterProvider)
|
||||
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
|
||||
propagation.TraceContext{},
|
||||
propagation.Baggage{},
|
||||
))
|
||||
|
||||
runtime, err := buildRuntime(meterProvider, tracerProvider, []func(context.Context) error{
|
||||
meterProvider.Shutdown,
|
||||
tracerProvider.Shutdown,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new rtmanager telemetry process: runtime: %w", err)
|
||||
}
|
||||
|
||||
logger.Info("rtmanager telemetry configured",
|
||||
"service_name", serviceName,
|
||||
"traces_exporter", cfg.TracesExporter,
|
||||
"metrics_exporter", cfg.MetricsExporter,
|
||||
)
|
||||
|
||||
return runtime, nil
|
||||
}
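
// A minimal bootstrap sketch, assuming the surrounding run function shape
// (cfg.Telemetry, logger, and the 5-second budget are illustrative names and
// values, not fixed by this package):
//
//	telemetryRuntime, err := telemetry.NewProcess(ctx, cfg.Telemetry, logger)
//	if err != nil {
//		return err
//	}
//	defer func() {
//		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
//		defer cancel()
//		if err := telemetryRuntime.Shutdown(shutdownCtx); err != nil {
//			logger.Warn("telemetry shutdown", "err", err.Error())
//		}
//	}()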
|
||||
|
||||
// TracerProvider returns the runtime tracer provider.
|
||||
func (runtime *Runtime) TracerProvider() oteltrace.TracerProvider {
|
||||
if runtime == nil || runtime.tracerProvider == nil {
|
||||
return otel.GetTracerProvider()
|
||||
}
|
||||
|
||||
return runtime.tracerProvider
|
||||
}
|
||||
|
||||
// MeterProvider returns the runtime meter provider.
|
||||
func (runtime *Runtime) MeterProvider() metric.MeterProvider {
|
||||
if runtime == nil || runtime.meterProvider == nil {
|
||||
return otel.GetMeterProvider()
|
||||
}
|
||||
|
||||
return runtime.meterProvider
|
||||
}
|
||||
|
||||
// Shutdown flushes and stops the configured telemetry providers. Shutdown
|
||||
// is idempotent.
|
||||
func (runtime *Runtime) Shutdown(ctx context.Context) error {
|
||||
if runtime == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
if runtime.shutdownDone {
|
||||
err := runtime.shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
return err
|
||||
}
|
||||
runtime.shutdownDone = true
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
runtime.gaugeMu.Lock()
|
||||
if runtime.gaugeRegistration != nil {
|
||||
_ = runtime.gaugeRegistration.Unregister()
|
||||
runtime.gaugeRegistration = nil
|
||||
}
|
||||
runtime.gaugeMu.Unlock()
|
||||
|
||||
var shutdownErr error
|
||||
for index := len(runtime.shutdownFns) - 1; index >= 0; index-- {
|
||||
shutdownErr = errors.Join(shutdownErr, runtime.shutdownFns[index](ctx))
|
||||
}
|
||||
|
||||
runtime.shutdownMu.Lock()
|
||||
runtime.shutdownErr = shutdownErr
|
||||
runtime.shutdownMu.Unlock()
|
||||
|
||||
return shutdownErr
|
||||
}
|
||||
|
||||
// RecordInternalHTTPRequest records one internal HTTP request outcome.
|
||||
func (runtime *Runtime) RecordInternalHTTPRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
|
||||
	if runtime == nil || runtime.internalHTTPRequests == nil || runtime.internalHTTPDuration == nil {
		return
	}
|
||||
|
||||
options := metric.WithAttributes(attrs...)
|
||||
runtime.internalHTTPRequests.Add(normalizeContext(ctx), 1, options)
|
||||
runtime.internalHTTPDuration.Record(normalizeContext(ctx), duration.Seconds()*1000, options)
|
||||
}
|
||||
|
||||
// RecordStartOutcome records one terminal outcome of the start operation.
|
||||
// outcome is `success` or `failure`; errorCode is `replay_no_op` or one of
|
||||
// the stable failure codes from `rtmanager/README.md` §Error Model;
|
||||
// opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
|
||||
func (runtime *Runtime) RecordStartOutcome(ctx context.Context, outcome, errorCode, opSource string) {
|
||||
if runtime == nil || runtime.startOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.startOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
attribute.String("op_source", opSource),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordStopOutcome records one terminal outcome of the stop operation.
|
||||
// reason is the value carried on `runtime:stop_jobs` or the matching REST
|
||||
// reason; opSource is `lobby_stream`, `gm_rest`, or `admin_rest`.
|
||||
func (runtime *Runtime) RecordStopOutcome(ctx context.Context, outcome, reason, opSource string) {
|
||||
if runtime == nil || runtime.stopOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.stopOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("reason", reason),
|
||||
attribute.String("op_source", opSource),
|
||||
))
|
||||
}
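
// For example, the stop service records each terminal outcome with the
// low-cardinality attribute values fixed above (literal strings shown for
// illustration; callers normally pass the typed enum values through string
// conversion):
//
//	telemetryRuntime.RecordStopOutcome(ctx, "success", "cancelled", "lobby_stream")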
|
||||
|
||||
// RecordRestartOutcome records one terminal outcome of the restart
|
||||
// operation.
|
||||
func (runtime *Runtime) RecordRestartOutcome(ctx context.Context, outcome, errorCode string) {
|
||||
if runtime == nil || runtime.restartOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.restartOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordPatchOutcome records one terminal outcome of the patch operation.
|
||||
func (runtime *Runtime) RecordPatchOutcome(ctx context.Context, outcome, errorCode string) {
|
||||
if runtime == nil || runtime.patchOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.patchOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("error_code", errorCode),
|
||||
))
|
||||
}
|
||||
|
||||
// RecordCleanupOutcome records one terminal outcome of the cleanup
|
||||
// operation. opSource is `auto_ttl` for the periodic cleanup worker and
|
||||
// `admin_rest` for explicit administrative removal.
|
||||
func (runtime *Runtime) RecordCleanupOutcome(ctx context.Context, outcome, opSource string) {
|
||||
if runtime == nil || runtime.cleanupOutcomes == nil {
|
||||
return
|
||||
}
|
||||
runtime.cleanupOutcomes.Add(normalizeContext(ctx), 1, metric.WithAttributes(
|
||||
attribute.String("outcome", outcome),
|
||||
attribute.String("op_source", opSource),
|
||||
))
|
||||
}

// RecordHealthEvent records one technical runtime event published on
// `runtime:health_events`. eventType comes from the frozen vocabulary in
// `rtmanager/README.md` §Async Stream Contracts.
func (runtime *Runtime) RecordHealthEvent(ctx context.Context, eventType string) {
	if runtime == nil || runtime.healthEvents == nil {
		return
	}
	runtime.healthEvents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("event_type", eventType),
	))
}

// RecordReconcileDrift records one drift outcome from the reconciler. kind
// is `adopt`, `dispose`, or `observed_exited`.
func (runtime *Runtime) RecordReconcileDrift(ctx context.Context, kind string) {
	if runtime == nil || runtime.reconcileDrift == nil {
		return
	}
	runtime.reconcileDrift.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("kind", kind),
	))
}

// RecordNotificationIntent records one admin-only notification intent
// publish attempt. notificationType is `runtime.image_pull_failed`,
// `runtime.container_start_failed`, or `runtime.start_config_invalid`.
func (runtime *Runtime) RecordNotificationIntent(ctx context.Context, notificationType string) {
	if runtime == nil || runtime.notificationIntents == nil {
		return
	}
	runtime.notificationIntents.Add(normalizeContext(ctx), 1, metric.WithAttributes(
		attribute.String("notification_type", notificationType),
	))
}

// RecordDockerOpLatency records the wall-clock duration of one Docker SDK
// call. op is one of `pull`, `create`, `start`, `stop`, `rm`, `inspect`,
// `events`.
func (runtime *Runtime) RecordDockerOpLatency(ctx context.Context, op string, duration time.Duration) {
	if runtime == nil || runtime.dockerOpLatency == nil {
		return
	}
	runtime.dockerOpLatency.Record(normalizeContext(ctx), duration.Seconds()*1000, metric.WithAttributes(
		attribute.String("op", op),
	))
}
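
// Illustrative sketch only: the caller times each Docker SDK call itself
// and reports the elapsed duration. The `dockerClient` and `containerID`
// identifiers are placeholders for the example.
//
//	started := time.Now()
//	info, err := dockerClient.ContainerInspect(ctx, containerID)
//	runtime.RecordDockerOpLatency(ctx, "inspect", time.Since(started))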

// RecordLeaseAcquireLatency records the wall-clock latency of one
// per-game Redis lease acquisition.
func (runtime *Runtime) RecordLeaseAcquireLatency(ctx context.Context, duration time.Duration) {
	if runtime == nil || runtime.leaseAcquireLatency == nil {
		return
	}
	runtime.leaseAcquireLatency.Record(normalizeContext(ctx), duration.Seconds()*1000)
}

// RuntimeRecordsByStatusProbe reports the number of runtime_records rows
// per status. The production probe wraps the runtime record store; tests
// may pass a stub.
type RuntimeRecordsByStatusProbe interface {
	CountByStatus(ctx context.Context) (map[string]int, error)
}
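
// Illustrative sketch only: a test stub satisfying this one-method
// interface could look like the hypothetical type below.
//
//	type stubProbe struct{ counts map[string]int }
//
//	func (p stubProbe) CountByStatus(context.Context) (map[string]int, error) {
//		return p.counts, nil
//	}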

// GaugeDependencies groups the collaborators required by RegisterGauges.
type GaugeDependencies struct {
	// RuntimeRecordsByStatus probes the per-status row count for
	// `rtmanager.runtime_records_by_status`.
	RuntimeRecordsByStatus RuntimeRecordsByStatusProbe

	// Logger records non-fatal probe errors. Defaults to slog.Default
	// when nil.
	Logger *slog.Logger
}

// RegisterGauges installs the observable-gauge callback that reports
// `rtmanager.runtime_records_by_status`. It is safe to call once per
// Runtime; a second call replaces the previous registration. The runtime
// keeps no strong reference to deps beyond the callback closure.
//
// The wiring layer registers the gauge once the persistence adapters
// are constructed.
func (runtime *Runtime) RegisterGauges(deps GaugeDependencies) error {
	if runtime == nil {
		return errors.New("register rtmanager gauges: nil runtime")
	}
	if deps.RuntimeRecordsByStatus == nil {
		return errors.New("register rtmanager gauges: nil runtime records probe")
	}

	logger := deps.Logger
	if logger == nil {
		logger = slog.Default()
	}

	runtime.gaugeMu.Lock()
	defer runtime.gaugeMu.Unlock()

	if runtime.gaugeRegistration != nil {
		_ = runtime.gaugeRegistration.Unregister()
		runtime.gaugeRegistration = nil
	}

	callback := func(ctx context.Context, observer metric.Observer) error {
		counts, err := deps.RuntimeRecordsByStatus.CountByStatus(ctx)
		if err != nil {
			logger.WarnContext(ctx, "runtime records probe failed",
				"err", err.Error(),
			)
			return nil
		}
		for status, count := range counts {
			observer.ObserveInt64(runtime.runtimeRecordsByStatus, int64(count), metric.WithAttributes(
				attribute.String("status", status),
			))
		}
		return nil
	}

	registration, err := runtime.meter.RegisterCallback(callback, runtime.runtimeRecordsByStatus)
	if err != nil {
		return fmt.Errorf("register rtmanager gauges: %w", err)
	}
	runtime.gaugeRegistration = registration

	return nil
}
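
// Illustrative sketch only: wiring code would register the gauge once the
// persistence adapters exist. The `telemetryRuntime`, `runtimeRecordStore`,
// and `logger` identifiers are placeholders.
//
//	if err := telemetryRuntime.RegisterGauges(GaugeDependencies{
//		RuntimeRecordsByStatus: runtimeRecordStore,
//		Logger:                 logger,
//	}); err != nil {
//		return fmt.Errorf("wire rtmanager telemetry: %w", err)
//	}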

func buildRuntime(meterProvider metric.MeterProvider, tracerProvider oteltrace.TracerProvider, shutdownFns []func(context.Context) error) (*Runtime, error) {
	meter := meterProvider.Meter(meterName)
	runtime := &Runtime{
		tracerProvider: tracerProvider,
		meterProvider:  meterProvider,
		meter:          meter,
		shutdownFns:    append([]func(context.Context) error(nil), shutdownFns...),
	}

	internalHTTPRequests, err := meter.Int64Counter("rtmanager.internal_http.requests")
	if err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.requests: %w", err)
	}
	internalHTTPDuration, err := meter.Float64Histogram("rtmanager.internal_http.duration", metric.WithUnit("ms"))
	if err != nil {
		return nil, fmt.Errorf("build rtmanager telemetry runtime: internal_http.duration: %w", err)
	}
	runtime.internalHTTPRequests = internalHTTPRequests
	runtime.internalHTTPDuration = internalHTTPDuration

	if err := registerCounters(meter, runtime); err != nil {
		return nil, err
	}
	if err := registerHistograms(meter, runtime); err != nil {
		return nil, err
	}
	if err := registerObservableGauges(meter, runtime); err != nil {
		return nil, err
	}

	return runtime, nil
}

func registerCounters(meter metric.Meter, runtime *Runtime) error {
	specs := []struct {
		name   string
		target *metric.Int64Counter
	}{
		{"rtmanager.start_outcomes", &runtime.startOutcomes},
		{"rtmanager.stop_outcomes", &runtime.stopOutcomes},
		{"rtmanager.restart_outcomes", &runtime.restartOutcomes},
		{"rtmanager.patch_outcomes", &runtime.patchOutcomes},
		{"rtmanager.cleanup_outcomes", &runtime.cleanupOutcomes},
		{"rtmanager.health_events", &runtime.healthEvents},
		{"rtmanager.reconcile_drift", &runtime.reconcileDrift},
		{"rtmanager.notification_intents", &runtime.notificationIntents},
	}
	for _, spec := range specs {
		counter, err := meter.Int64Counter(spec.name)
		if err != nil {
			return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
		}
		*spec.target = counter
	}
	return nil
}

func registerHistograms(meter metric.Meter, runtime *Runtime) error {
	specs := []struct {
		name   string
		unit   string
		target *metric.Float64Histogram
	}{
		{"rtmanager.docker_op_latency", "ms", &runtime.dockerOpLatency},
		{"rtmanager.lease_acquire_latency", "ms", &runtime.leaseAcquireLatency},
	}
	for _, spec := range specs {
		options := []metric.Float64HistogramOption{}
		if spec.unit != "" {
			options = append(options, metric.WithUnit(spec.unit))
		}
		histogram, err := meter.Float64Histogram(spec.name, options...)
		if err != nil {
			return fmt.Errorf("build rtmanager telemetry runtime: %s: %w", spec.name, err)
		}
		*spec.target = histogram
	}
	return nil
}

func registerObservableGauges(meter metric.Meter, runtime *Runtime) error {
	gauge, err := meter.Int64ObservableGauge("rtmanager.runtime_records_by_status")
	if err != nil {
		return fmt.Errorf("build rtmanager telemetry runtime: runtime_records_by_status: %w", err)
	}
	runtime.runtimeRecordsByStatus = gauge
	return nil
}

func newTracerProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdktrace.TracerProvider, error) {
	options := []sdktrace.TracerProviderOption{
		sdktrace.WithResource(res),
	}

	if exporter, err := traceExporter(ctx, cfg); err != nil {
		return nil, err
	} else if exporter != nil {
		options = append(options, sdktrace.WithBatcher(exporter))
	}

	if cfg.StdoutTracesEnabled {
		exporter, err := stdouttrace.New(stdouttrace.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout traces exporter: %w", err)
		}
		options = append(options, sdktrace.WithBatcher(exporter))
	}

	return sdktrace.NewTracerProvider(options...), nil
}

func newMeterProvider(ctx context.Context, res *resource.Resource, cfg ProcessConfig) (*sdkmetric.MeterProvider, error) {
	options := []sdkmetric.Option{
		sdkmetric.WithResource(res),
	}

	if exporter, err := metricExporter(ctx, cfg); err != nil {
		return nil, err
	} else if exporter != nil {
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
	}

	if cfg.StdoutMetricsEnabled {
		exporter, err := stdoutmetric.New(stdoutmetric.WithWriter(os.Stdout))
		if err != nil {
			return nil, fmt.Errorf("stdout metrics exporter: %w", err)
		}
		options = append(options, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter)))
	}

	return sdkmetric.NewMeterProvider(options...), nil
}

func traceExporter(ctx context.Context, cfg ProcessConfig) (sdktrace.SpanExporter, error) {
	if cfg.TracesExporter != processExporterOTLP {
		return nil, nil
	}

	switch normalizeProtocol(cfg.TracesProtocol) {
	case processProtocolGRPC:
		exporter, err := otlptracegrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc traces exporter: %w", err)
		}
		return exporter, nil
	default:
		exporter, err := otlptracehttp.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp http traces exporter: %w", err)
		}
		return exporter, nil
	}
}

func metricExporter(ctx context.Context, cfg ProcessConfig) (sdkmetric.Exporter, error) {
	if cfg.MetricsExporter != processExporterOTLP {
		return nil, nil
	}

	switch normalizeProtocol(cfg.MetricsProtocol) {
	case processProtocolGRPC:
		exporter, err := otlpmetricgrpc.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp grpc metrics exporter: %w", err)
		}
		return exporter, nil
	default:
		exporter, err := otlpmetrichttp.New(ctx)
		if err != nil {
			return nil, fmt.Errorf("otlp http metrics exporter: %w", err)
		}
		return exporter, nil
	}
}
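
// Note: the OTLP exporters above are constructed without explicit options,
// so endpoint and header configuration comes from the standard
// OTEL_EXPORTER_OTLP_* environment variables honored by the OpenTelemetry
// Go exporters. Illustrative value only (the collector address is a
// placeholder):
//
//	OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318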

func normalizeProtocol(value string) string {
	switch strings.TrimSpace(value) {
	case processProtocolGRPC:
		return processProtocolGRPC
	default:
		return processProtocolHTTPProtobuf
	}
}

func normalizeContext(ctx context.Context) context.Context {
	if ctx == nil {
		return context.Background()
	}

	return ctx
}