//go:build integration

package integration_test

import (
	"context"
	"fmt"
	"strconv"
	"testing"
	"time"

	"galaxy/notificationintent"
	"galaxy/rtmanager/integration/harness"
	"galaxy/rtmanager/internal/domain/health"
	"galaxy/rtmanager/internal/domain/operation"
	"galaxy/rtmanager/internal/domain/runtime"
	"galaxy/rtmanager/internal/ports"
	"galaxy/rtmanager/internal/service/startruntime"

	dockercontainer "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/network"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestHealth_ContainerDisappearedAndAdopt verifies the two
// drift-detection paths. The Docker events listener emits
// `container_disappeared` when a tracked container is destroyed
// outside RTM, and the reconciler adopts a fresh container labelled
// `com.galaxy.owner=rtmanager` that has no PG row.
//
// `runtime_records.status=removed` is terminal per
// `runtime.AllowedTransitions`; the adoption path therefore uses a
// **fresh** game_id rather than re-adopting the disposed one. That
// matches the documented contract: the reconciler adopts containers
// labelled `com.galaxy.owner=rtmanager` for which no PG row exists.
func TestHealth_ContainerDisappearedAndAdopt(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{
		ReconcileInterval: 500 * time.Millisecond,
	})

	// Step 1 — bring a game to running through the start consumer.
	disposalGameID := harness.IDFromTestName(t) + "-d"
	harness.XAddStartJob(t, env, disposalGameID, env.EngineImageRef)
	startResult := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(disposalGameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)
	originalContainerID := startResult.ContainerID
	require.NotEmpty(t, originalContainerID)

	// Step 2 — externally remove the container; the events listener
	// should observe the destroy and publish `container_disappeared`.
	removeContainer(t, env, originalContainerID)

	disappeared := harness.WaitForHealthEvent(t, env,
		harness.HealthEventTypeIs(disposalGameID, string(health.EventTypeContainerDisappeared)),
		20*time.Second,
	)
	assert.Equal(t, originalContainerID, disappeared.ContainerID)

	// The reconciler also marks the runtime record as removed within
	// one or two ticks (`reconcile_dispose`).
	harness.EventuallyRuntimeRecord(t, env, disposalGameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRemoved },
		15*time.Second,
	)
	harness.EventuallyOperationKind(t, env, disposalGameID,
		operation.OpKindReconcileDispose, 5*time.Second)

	// Step 3 — bring up an adoption candidate for an unseen game id
	// by hand. The reconciler must label-match it, find no record,
	// and insert one with status=running.
	adoptionGameID := harness.IDFromTestName(t) + "-a"
	manualContainerID := runManualEngineContainer(t, env, adoptionGameID)
	t.Logf("manual container id=%s", manualContainerID)

	adopted := harness.EventuallyRuntimeRecord(t, env, adoptionGameID,
		func(r runtime.RuntimeRecord) bool {
			return r.Status == runtime.StatusRunning && r.CurrentContainerID == manualContainerID
		},
		20*time.Second,
	)
	assert.Equal(t, env.EngineImageRef, adopted.CurrentImageRef)

	adoptEntry := harness.EventuallyOperationKind(t, env, adoptionGameID,
		operation.OpKindReconcileAdopt, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, adoptEntry.Outcome)
	assert.Equal(t, operation.OpSourceAutoReconcile, adoptEntry.OpSource)
	assert.Equal(t, manualContainerID, adoptEntry.ContainerID)
}
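// The fresh-game_id argument in the doc comment above can be made
// concrete. sketchIsTerminal is a minimal local illustration, assuming
// the transition table is keyed the way `runtime.AllowedTransitions` is
// described (a status is terminal when it has no outgoing edges). The
// type name `runtime.Status` and the map shape are assumptions for
// illustration, not the runtime package's actual declarations; this
// function is not used by the tests.
func sketchIsTerminal(allowed map[runtime.Status][]runtime.Status, s runtime.Status) bool {
	// A disposed record sits at `removed`; with no outgoing edges it
	// can never re-enter `running`, so adoption must insert a new
	// record under a fresh game_id instead of reviving the old one.
	return len(allowed[s]) == 0
}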
// TestNotification_ImagePullFailed drives Runtime Manager with a
// start envelope pointing at an unresolvable image reference. The
// start service must surface the failure on `runtime:job_results` and
// publish a `runtime.image_pull_failed` admin notification on
// `notification:intents`.
func TestNotification_ImagePullFailed(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{})
	gameID := harness.IDFromTestName(t)
	const missingImage = "galaxy/integration-missing:0.0.0"

	harness.XAddStartJob(t, env, gameID, missingImage)

	// Job result publishes a failure with the stable image_pull_failed
	// code.
	jobResult := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeFailure),
		60*time.Second,
	)
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, jobResult.ErrorCode)
	assert.Empty(t, jobResult.ContainerID, "failure must not surface a container id")
	assert.Empty(t, jobResult.EngineEndpoint, "failure must not surface an engine endpoint")
	assert.NotEmpty(t, jobResult.ErrorMessage, "failure must carry an operator-readable message")

	// Notification stream carries the matching admin-only intent.
	intent := harness.WaitForNotificationIntent(t, env,
		func(entry harness.NotificationIntentEntry) bool {
			if entry.NotificationType != string(notificationintent.NotificationTypeRuntimeImagePullFailed) {
				return false
			}
			payloadGameID, _ := entry.Payload["game_id"].(string)
			return payloadGameID == gameID
		},
		30*time.Second,
	)
	require.NotNil(t, intent.Payload, "notification intent must carry a payload")
	assert.Equal(t, gameID, intent.Payload["game_id"])
	assert.Equal(t, missingImage, intent.Payload["image_ref"])
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, intent.Payload["error_code"])

	// PG state: no running record was installed; operation_log
	// captures one failed start with the stable error code.
	_, err := harness.RuntimeRecord(t, env, gameID)
	if err == nil {
		// If an entry was upserted (rollback gap), it must not be
		// running.
		record := harness.MustRuntimeRecord(t, env, gameID)
		assert.NotEqual(t, runtime.StatusRunning, record.Status,
			"failed image pull must not leave a running record behind")
	}

	failureEntry := harness.EventuallyOperationKind(t, env, gameID,
		operation.OpKindStart, 5*time.Second)
	assert.Equal(t, operation.OutcomeFailure, failureEntry.Outcome)
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, failureEntry.ErrorCode)
}
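// Both tests lean on the harness's Wait*/Eventually* helpers. The
// sketch below shows the polling shape those helpers are assumed to
// share: re-check a predicate on an interval until it holds or the
// deadline passes. The real harness implementations may differ, and
// this function is not used by the tests above.
func waitForCondition(t *testing.T, timeout, interval time.Duration, cond func() bool) {
	t.Helper()
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if cond() {
			return // condition observed before the deadline
		}
		time.Sleep(interval)
	}
	t.Fatalf("condition not met within %s", timeout)
}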
// removeContainer terminates and removes the container behind RTM's
// back. Force=true is required because the engine has not received a
// SIGTERM and stop signal handling is engine-internal.
func removeContainer(t *testing.T, env *harness.Env, containerID string) {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	require.NoError(t, env.Docker.Client().ContainerRemove(ctx, containerID,
		dockercontainer.RemoveOptions{Force: true}))
}

// runManualEngineContainer bypasses RTM and starts an engine container
// directly through the Docker SDK. The container carries every label
// the reconciler reads at adopt time (`com.galaxy.owner`,
// `com.galaxy.kind`, `com.galaxy.game_id`, `com.galaxy.engine_image_ref`,
// `com.galaxy.started_at_ms`) plus the per-game hostname so the
// computed `engine_endpoint` matches what `rtmanager` would have
// written.
func runManualEngineContainer(t *testing.T, env *harness.Env, gameID string) string {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	hostname := "galaxy-game-" + gameID
	cfg := &dockercontainer.Config{
		Image:    env.EngineImageRef,
		Hostname: hostname,
		Labels: map[string]string{
			"com.galaxy.owner":            "rtmanager",
			"com.galaxy.kind":             "game-engine",
			"com.galaxy.game_id":          gameID,
			"com.galaxy.engine_image_ref": env.EngineImageRef,
			"com.galaxy.started_at_ms":    strconv.FormatInt(time.Now().UnixMilli(), 10),
		},
		Env: []string{
			"GAME_STATE_PATH=/var/lib/galaxy-game",
			"STORAGE_PATH=/var/lib/galaxy-game",
		},
	}
	hostCfg := &dockercontainer.HostConfig{}
	netCfg := &network.NetworkingConfig{
		EndpointsConfig: map[string]*network.EndpointSettings{
			env.Network: {Aliases: []string{hostname}},
		},
	}
	containerName := fmt.Sprintf("galaxy-game-%s-manual", gameID)

	created, err := env.Docker.Client().ContainerCreate(ctx, cfg, hostCfg, netCfg, nil, containerName)
	require.NoError(t, err)
	t.Cleanup(func() {
		removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer removeCancel()
		_ = env.Docker.Client().ContainerRemove(removeCtx, created.ID,
			dockercontainer.RemoveOptions{Force: true})
	})

	require.NoError(t, env.Docker.Client().ContainerStart(ctx, created.ID, dockercontainer.StartOptions{}))
	return created.ID
}
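// inspectLabels is a small companion sketch: it reads back, through the
// same Docker SDK client the helpers above use, the labels the
// reconciler matches on at adopt time. It is not used by the tests; it
// documents the Docker-side shape of the adoption contract (labels live
// on the container's Config in the inspect response).
func inspectLabels(t *testing.T, env *harness.Env, containerID string) map[string]string {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	info, err := env.Docker.Client().ContainerInspect(ctx, containerID)
	require.NoError(t, err)
	return info.Config.Labels
}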