feat: runtime manager
This commit is contained in:
@@ -0,0 +1,200 @@
|
||||
//go:build integration
|
||||
|
||||
package integration_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"galaxy/notificationintent"
|
||||
"galaxy/rtmanager/integration/harness"
|
||||
"galaxy/rtmanager/internal/domain/health"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
|
||||
dockercontainer "github.com/docker/docker/api/types/container"
|
||||
"github.com/docker/docker/api/types/network"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestHealth_ContainerDisappearedAndAdopt verifies the two
|
||||
// drift-detection paths. The Docker events listener emits
|
||||
// `container_disappeared` when a tracked container is destroyed
|
||||
// outside RTM, and the reconciler adopts a fresh container labelled
|
||||
// `com.galaxy.owner=rtmanager` that has no PG row.
|
||||
//
|
||||
// `runtime_records.status=removed` is terminal per
|
||||
// `runtime.AllowedTransitions`; the adoption path therefore uses a
|
||||
// **fresh** game_id rather than re-adopting the disposed one. That
|
||||
// matches the documented contract: reconciler adopts containers
|
||||
// labelled `com.galaxy.owner=rtmanager` for which no PG row exists.
|
||||
func TestHealth_ContainerDisappearedAndAdopt(t *testing.T) {
|
||||
env := harness.NewEnv(t, harness.EnvOptions{
|
||||
ReconcileInterval: 500 * time.Millisecond,
|
||||
})
|
||||
|
||||
// Step 1 — bring a game to running through the start consumer.
|
||||
disposalGameID := harness.IDFromTestName(t) + "-d"
|
||||
harness.XAddStartJob(t, env, disposalGameID, env.EngineImageRef)
|
||||
startResult := harness.WaitForJobResult(t, env,
|
||||
harness.JobOutcomeIs(disposalGameID, ports.JobOutcomeSuccess),
|
||||
30*time.Second,
|
||||
)
|
||||
originalContainerID := startResult.ContainerID
|
||||
require.NotEmpty(t, originalContainerID)
|
||||
|
||||
// Step 2 — externally remove the container; the events listener
|
||||
// should observe the destroy and publish `container_disappeared`.
|
||||
removeContainer(t, env, originalContainerID)
|
||||
disappeared := harness.WaitForHealthEvent(t, env,
|
||||
harness.HealthEventTypeIs(disposalGameID, string(health.EventTypeContainerDisappeared)),
|
||||
20*time.Second,
|
||||
)
|
||||
assert.Equal(t, originalContainerID, disappeared.ContainerID)
|
||||
|
||||
// The reconciler also marks the runtime record as removed within
|
||||
// one or two ticks (`reconcile_dispose`).
|
||||
harness.EventuallyRuntimeRecord(t, env, disposalGameID,
|
||||
func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRemoved },
|
||||
15*time.Second,
|
||||
)
|
||||
harness.EventuallyOperationKind(t, env, disposalGameID, operation.OpKindReconcileDispose, 5*time.Second)
|
||||
|
||||
// Step 3 — bring up an adoption candidate for an unseen game id
|
||||
// by hand. The reconciler must label-match it, find no record,
|
||||
// and insert one with status=running.
|
||||
adoptionGameID := harness.IDFromTestName(t) + "-a"
|
||||
manualContainerID := runManualEngineContainer(t, env, adoptionGameID)
|
||||
t.Logf("manual container id=%s", manualContainerID)
|
||||
|
||||
adopted := harness.EventuallyRuntimeRecord(t, env, adoptionGameID,
|
||||
func(r runtime.RuntimeRecord) bool {
|
||||
return r.Status == runtime.StatusRunning && r.CurrentContainerID == manualContainerID
|
||||
},
|
||||
20*time.Second,
|
||||
)
|
||||
assert.Equal(t, env.EngineImageRef, adopted.CurrentImageRef)
|
||||
|
||||
adoptEntry := harness.EventuallyOperationKind(t, env, adoptionGameID, operation.OpKindReconcileAdopt, 5*time.Second)
|
||||
assert.Equal(t, operation.OutcomeSuccess, adoptEntry.Outcome)
|
||||
assert.Equal(t, operation.OpSourceAutoReconcile, adoptEntry.OpSource)
|
||||
assert.Equal(t, manualContainerID, adoptEntry.ContainerID)
|
||||
}
|
||||
|
||||
// TestNotification_ImagePullFailed drives Runtime Manager with a
|
||||
// start envelope pointing at an unresolvable image reference. The
|
||||
// start service must surface the failure on `runtime:job_results` and
|
||||
// publish a `runtime.image_pull_failed` admin notification on
|
||||
// `notification:intents`.
|
||||
func TestNotification_ImagePullFailed(t *testing.T) {
|
||||
env := harness.NewEnv(t, harness.EnvOptions{})
|
||||
gameID := harness.IDFromTestName(t)
|
||||
|
||||
const missingImage = "galaxy/integration-missing:0.0.0"
|
||||
harness.XAddStartJob(t, env, gameID, missingImage)
|
||||
|
||||
// Job result publishes a failure with the stable image_pull_failed
|
||||
// code.
|
||||
jobResult := harness.WaitForJobResult(t, env,
|
||||
harness.JobOutcomeIs(gameID, ports.JobOutcomeFailure),
|
||||
60*time.Second,
|
||||
)
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, jobResult.ErrorCode)
|
||||
assert.Empty(t, jobResult.ContainerID, "failure must not surface a container id")
|
||||
assert.Empty(t, jobResult.EngineEndpoint, "failure must not surface an engine endpoint")
|
||||
assert.NotEmpty(t, jobResult.ErrorMessage, "failure must carry an operator-readable message")
|
||||
|
||||
// Notification stream carries the matching admin-only intent.
|
||||
intent := harness.WaitForNotificationIntent(t, env,
|
||||
func(entry harness.NotificationIntentEntry) bool {
|
||||
if entry.NotificationType != string(notificationintent.NotificationTypeRuntimeImagePullFailed) {
|
||||
return false
|
||||
}
|
||||
payloadGameID, _ := entry.Payload["game_id"].(string)
|
||||
return payloadGameID == gameID
|
||||
},
|
||||
30*time.Second,
|
||||
)
|
||||
require.NotNil(t, intent.Payload, "notification intent must carry a payload")
|
||||
assert.Equal(t, gameID, intent.Payload["game_id"])
|
||||
assert.Equal(t, missingImage, intent.Payload["image_ref"])
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, intent.Payload["error_code"])
|
||||
|
||||
// PG state: no running record was installed; operation_log
|
||||
// captures one failed start with the stable error code.
|
||||
_, err := harness.RuntimeRecord(t, env, gameID)
|
||||
if err == nil {
|
||||
// If an entry was upserted (rollback gap), it must not be
|
||||
// running.
|
||||
record := harness.MustRuntimeRecord(t, env, gameID)
|
||||
assert.NotEqual(t, runtime.StatusRunning, record.Status,
|
||||
"failed image pull must not leave a running record behind")
|
||||
}
|
||||
|
||||
failureEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
|
||||
assert.Equal(t, operation.OutcomeFailure, failureEntry.Outcome)
|
||||
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, failureEntry.ErrorCode)
|
||||
}
|
||||
|
||||
// removeContainer terminates and removes the container behind RTM's
|
||||
// back. Force=true is required because the engine has not received a
|
||||
// SIGTERM and stop signal handling is engine-internal.
|
||||
func removeContainer(t *testing.T, env *harness.Env, containerID string) {
|
||||
t.Helper()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
require.NoError(t, env.Docker.Client().ContainerRemove(ctx, containerID, dockercontainer.RemoveOptions{Force: true}))
|
||||
}
|
||||
|
||||
// runManualEngineContainer bypasses RTM and starts an engine container
|
||||
// directly through the Docker SDK. The container carries every label
|
||||
// the reconciler reads at adopt time (`com.galaxy.owner`,
|
||||
// `com.galaxy.kind`, `com.galaxy.game_id`, `com.galaxy.engine_image_ref`,
|
||||
// `com.galaxy.started_at_ms`) plus the per-game hostname so the
|
||||
// computed `engine_endpoint` matches what `rtmanager` would have
|
||||
// written.
|
||||
func runManualEngineContainer(t *testing.T, env *harness.Env, gameID string) string {
|
||||
t.Helper()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
hostname := "galaxy-game-" + gameID
|
||||
cfg := &dockercontainer.Config{
|
||||
Image: env.EngineImageRef,
|
||||
Hostname: hostname,
|
||||
Labels: map[string]string{
|
||||
"com.galaxy.owner": "rtmanager",
|
||||
"com.galaxy.kind": "game-engine",
|
||||
"com.galaxy.game_id": gameID,
|
||||
"com.galaxy.engine_image_ref": env.EngineImageRef,
|
||||
"com.galaxy.started_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10),
|
||||
},
|
||||
Env: []string{
|
||||
"GAME_STATE_PATH=/var/lib/galaxy-game",
|
||||
"STORAGE_PATH=/var/lib/galaxy-game",
|
||||
},
|
||||
}
|
||||
hostCfg := &dockercontainer.HostConfig{}
|
||||
netCfg := &network.NetworkingConfig{
|
||||
EndpointsConfig: map[string]*network.EndpointSettings{
|
||||
env.Network: {Aliases: []string{hostname}},
|
||||
},
|
||||
}
|
||||
containerName := fmt.Sprintf("galaxy-game-%s-manual", gameID)
|
||||
created, err := env.Docker.Client().ContainerCreate(ctx, cfg, hostCfg, netCfg, nil, containerName)
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(func() {
|
||||
removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer removeCancel()
|
||||
_ = env.Docker.Client().ContainerRemove(removeCtx, created.ID, dockercontainer.RemoveOptions{Force: true})
|
||||
})
|
||||
|
||||
require.NoError(t, env.Docker.Client().ContainerStart(ctx, created.ID, dockercontainer.StartOptions{}))
|
||||
return created.ID
|
||||
}
|
||||
Reference in New Issue
Block a user