//go:build integration
|
|
|
|
package integration_test
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strconv"
|
|
"testing"
|
|
"time"
|
|
|
|
"galaxy/notificationintent"
|
|
"galaxy/rtmanager/integration/harness"
|
|
"galaxy/rtmanager/internal/domain/health"
|
|
"galaxy/rtmanager/internal/domain/operation"
|
|
"galaxy/rtmanager/internal/domain/runtime"
|
|
"galaxy/rtmanager/internal/ports"
|
|
"galaxy/rtmanager/internal/service/startruntime"
|
|
|
|
dockercontainer "github.com/docker/docker/api/types/container"
|
|
"github.com/docker/docker/api/types/network"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// TestHealth_ContainerDisappearedAndAdopt verifies the two
|
|
// drift-detection paths. The Docker events listener emits
|
|
// `container_disappeared` when a tracked container is destroyed
|
|
// outside RTM, and the reconciler adopts a fresh container labelled
|
|
// `com.galaxy.owner=rtmanager` that has no PG row.
|
|
//
|
|
// `runtime_records.status=removed` is terminal per
|
|
// `runtime.AllowedTransitions`; the adoption path therefore uses a
|
|
// **fresh** game_id rather than re-adopting the disposed one. That
|
|
// matches the documented contract: reconciler adopts containers
|
|
// labelled `com.galaxy.owner=rtmanager` for which no PG row exists.
|
|
func TestHealth_ContainerDisappearedAndAdopt(t *testing.T) {
|
|
env := harness.NewEnv(t, harness.EnvOptions{
|
|
ReconcileInterval: 500 * time.Millisecond,
|
|
})
|
|
|
|
// Step 1 — bring a game to running through the start consumer.
|
|
disposalGameID := harness.IDFromTestName(t) + "-d"
|
|
harness.XAddStartJob(t, env, disposalGameID, env.EngineImageRef)
|
|
startResult := harness.WaitForJobResult(t, env,
|
|
harness.JobOutcomeIs(disposalGameID, ports.JobOutcomeSuccess),
|
|
30*time.Second,
|
|
)
|
|
originalContainerID := startResult.ContainerID
|
|
require.NotEmpty(t, originalContainerID)
|
|
|
|
// Step 2 — externally remove the container; the events listener
|
|
// should observe the destroy and publish `container_disappeared`.
|
|
removeContainer(t, env, originalContainerID)
|
|
disappeared := harness.WaitForHealthEvent(t, env,
|
|
harness.HealthEventTypeIs(disposalGameID, string(health.EventTypeContainerDisappeared)),
|
|
20*time.Second,
|
|
)
|
|
assert.Equal(t, originalContainerID, disappeared.ContainerID)
|
|
|
|
// The reconciler also marks the runtime record as removed within
|
|
// one or two ticks (`reconcile_dispose`).
|
|
harness.EventuallyRuntimeRecord(t, env, disposalGameID,
|
|
func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRemoved },
|
|
15*time.Second,
|
|
)
|
|
harness.EventuallyOperationKind(t, env, disposalGameID, operation.OpKindReconcileDispose, 5*time.Second)
|
|
|
|
// Step 3 — bring up an adoption candidate for an unseen game id
|
|
// by hand. The reconciler must label-match it, find no record,
|
|
// and insert one with status=running.
|
|
adoptionGameID := harness.IDFromTestName(t) + "-a"
|
|
manualContainerID := runManualEngineContainer(t, env, adoptionGameID)
|
|
t.Logf("manual container id=%s", manualContainerID)
|
|
|
|
adopted := harness.EventuallyRuntimeRecord(t, env, adoptionGameID,
|
|
func(r runtime.RuntimeRecord) bool {
|
|
return r.Status == runtime.StatusRunning && r.CurrentContainerID == manualContainerID
|
|
},
|
|
20*time.Second,
|
|
)
|
|
assert.Equal(t, env.EngineImageRef, adopted.CurrentImageRef)
|
|
|
|
adoptEntry := harness.EventuallyOperationKind(t, env, adoptionGameID, operation.OpKindReconcileAdopt, 5*time.Second)
|
|
assert.Equal(t, operation.OutcomeSuccess, adoptEntry.Outcome)
|
|
assert.Equal(t, operation.OpSourceAutoReconcile, adoptEntry.OpSource)
|
|
assert.Equal(t, manualContainerID, adoptEntry.ContainerID)
|
|
}
|
|
|
|
// TestNotification_ImagePullFailed drives Runtime Manager with a
|
|
// start envelope pointing at an unresolvable image reference. The
|
|
// start service must surface the failure on `runtime:job_results` and
|
|
// publish a `runtime.image_pull_failed` admin notification on
|
|
// `notification:intents`.
|
|
func TestNotification_ImagePullFailed(t *testing.T) {
|
|
env := harness.NewEnv(t, harness.EnvOptions{})
|
|
gameID := harness.IDFromTestName(t)
|
|
|
|
const missingImage = "galaxy/integration-missing:0.0.0"
|
|
harness.XAddStartJob(t, env, gameID, missingImage)
|
|
|
|
// Job result publishes a failure with the stable image_pull_failed
|
|
// code.
|
|
jobResult := harness.WaitForJobResult(t, env,
|
|
harness.JobOutcomeIs(gameID, ports.JobOutcomeFailure),
|
|
60*time.Second,
|
|
)
|
|
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, jobResult.ErrorCode)
|
|
assert.Empty(t, jobResult.ContainerID, "failure must not surface a container id")
|
|
assert.Empty(t, jobResult.EngineEndpoint, "failure must not surface an engine endpoint")
|
|
assert.NotEmpty(t, jobResult.ErrorMessage, "failure must carry an operator-readable message")
|
|
|
|
// Notification stream carries the matching admin-only intent.
|
|
intent := harness.WaitForNotificationIntent(t, env,
|
|
func(entry harness.NotificationIntentEntry) bool {
|
|
if entry.NotificationType != string(notificationintent.NotificationTypeRuntimeImagePullFailed) {
|
|
return false
|
|
}
|
|
payloadGameID, _ := entry.Payload["game_id"].(string)
|
|
return payloadGameID == gameID
|
|
},
|
|
30*time.Second,
|
|
)
|
|
require.NotNil(t, intent.Payload, "notification intent must carry a payload")
|
|
assert.Equal(t, gameID, intent.Payload["game_id"])
|
|
assert.Equal(t, missingImage, intent.Payload["image_ref"])
|
|
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, intent.Payload["error_code"])
|
|
|
|
// PG state: no running record was installed; operation_log
|
|
// captures one failed start with the stable error code.
|
|
_, err := harness.RuntimeRecord(t, env, gameID)
|
|
if err == nil {
|
|
// If an entry was upserted (rollback gap), it must not be
|
|
// running.
|
|
record := harness.MustRuntimeRecord(t, env, gameID)
|
|
assert.NotEqual(t, runtime.StatusRunning, record.Status,
|
|
"failed image pull must not leave a running record behind")
|
|
}
|
|
|
|
failureEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
|
|
assert.Equal(t, operation.OutcomeFailure, failureEntry.Outcome)
|
|
assert.Equal(t, startruntime.ErrorCodeImagePullFailed, failureEntry.ErrorCode)
|
|
}
|
|
|
|
// removeContainer terminates and removes the container behind RTM's
|
|
// back. Force=true is required because the engine has not received a
|
|
// SIGTERM and stop signal handling is engine-internal.
|
|
func removeContainer(t *testing.T, env *harness.Env, containerID string) {
|
|
t.Helper()
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer cancel()
|
|
require.NoError(t, env.Docker.Client().ContainerRemove(ctx, containerID, dockercontainer.RemoveOptions{Force: true}))
|
|
}
|
|
|
|
// runManualEngineContainer bypasses RTM and starts an engine container
|
|
// directly through the Docker SDK. The container carries every label
|
|
// the reconciler reads at adopt time (`com.galaxy.owner`,
|
|
// `com.galaxy.kind`, `com.galaxy.game_id`, `com.galaxy.engine_image_ref`,
|
|
// `com.galaxy.started_at_ms`) plus the per-game hostname so the
|
|
// computed `engine_endpoint` matches what `rtmanager` would have
|
|
// written.
|
|
func runManualEngineContainer(t *testing.T, env *harness.Env, gameID string) string {
|
|
t.Helper()
|
|
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
|
defer cancel()
|
|
|
|
hostname := "galaxy-game-" + gameID
|
|
cfg := &dockercontainer.Config{
|
|
Image: env.EngineImageRef,
|
|
Hostname: hostname,
|
|
Labels: map[string]string{
|
|
"com.galaxy.owner": "rtmanager",
|
|
"com.galaxy.kind": "game-engine",
|
|
"com.galaxy.game_id": gameID,
|
|
"com.galaxy.engine_image_ref": env.EngineImageRef,
|
|
"com.galaxy.started_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10),
|
|
},
|
|
Env: []string{
|
|
"GAME_STATE_PATH=/var/lib/galaxy-game",
|
|
"STORAGE_PATH=/var/lib/galaxy-game",
|
|
},
|
|
}
|
|
hostCfg := &dockercontainer.HostConfig{}
|
|
netCfg := &network.NetworkingConfig{
|
|
EndpointsConfig: map[string]*network.EndpointSettings{
|
|
env.Network: {Aliases: []string{hostname}},
|
|
},
|
|
}
|
|
containerName := fmt.Sprintf("galaxy-game-%s-manual", gameID)
|
|
created, err := env.Docker.Client().ContainerCreate(ctx, cfg, hostCfg, netCfg, nil, containerName)
|
|
require.NoError(t, err)
|
|
t.Cleanup(func() {
|
|
removeCtx, removeCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer removeCancel()
|
|
_ = env.Docker.Client().ContainerRemove(removeCtx, created.ID, dockercontainer.RemoveOptions{Force: true})
|
|
})
|
|
|
|
require.NoError(t, env.Docker.Client().ContainerStart(ctx, created.ID, dockercontainer.StartOptions{}))
|
|
return created.ID
|
|
}
|