Files
galaxy-game/rtmanager/integration/monitoring_test.go
T
2026-04-28 20:39:18 +02:00

201 lines
8.2 KiB
Go

//go:build integration
package integration_test
import (
"context"
"fmt"
"strconv"
"testing"
"time"
"galaxy/notificationintent"
"galaxy/rtmanager/integration/harness"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
dockercontainer "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/network"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestHealth_ContainerDisappearedAndAdopt verifies the two
// drift-detection paths. The Docker events listener emits
// `container_disappeared` when a tracked container is destroyed
// outside RTM, and the reconciler adopts a fresh container labelled
// `com.galaxy.owner=rtmanager` that has no PG row.
//
// `runtime_records.status=removed` is terminal per
// `runtime.AllowedTransitions`; the adoption path therefore uses a
// **fresh** game_id rather than re-adopting the disposed one. That
// matches the documented contract: reconciler adopts containers
// labelled `com.galaxy.owner=rtmanager` for which no PG row exists.
func TestHealth_ContainerDisappearedAndAdopt(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{
		ReconcileInterval: 500 * time.Millisecond,
	})

	// Step 1 — drive a game to running through the start consumer.
	doomedGameID := harness.IDFromTestName(t) + "-d"
	harness.XAddStartJob(t, env, doomedGameID, env.EngineImageRef)
	started := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(doomedGameID, ports.JobOutcomeSuccess),
		30*time.Second,
	)
	trackedContainerID := started.ContainerID
	require.NotEmpty(t, trackedContainerID)

	// Step 2 — destroy the container behind RTM's back; the events
	// listener should observe the destroy and publish
	// `container_disappeared`.
	removeContainer(t, env, trackedContainerID)
	event := harness.WaitForHealthEvent(t, env,
		harness.HealthEventTypeIs(doomedGameID, string(health.EventTypeContainerDisappeared)),
		20*time.Second,
	)
	assert.Equal(t, trackedContainerID, event.ContainerID)

	// Within one or two ticks the reconciler also flips the runtime
	// record to removed (`reconcile_dispose`).
	harness.EventuallyRuntimeRecord(t, env, doomedGameID,
		func(r runtime.RuntimeRecord) bool { return r.Status == runtime.StatusRemoved },
		15*time.Second,
	)
	harness.EventuallyOperationKind(t, env, doomedGameID, operation.OpKindReconcileDispose, 5*time.Second)

	// Step 3 — hand-start an adoption candidate under an unseen game
	// id. The reconciler must label-match it, find no record, and
	// insert one with status=running.
	adoptGameID := harness.IDFromTestName(t) + "-a"
	candidateID := runManualEngineContainer(t, env, adoptGameID)
	t.Logf("manual container id=%s", candidateID)
	adoptedRecord := harness.EventuallyRuntimeRecord(t, env, adoptGameID,
		func(r runtime.RuntimeRecord) bool {
			return r.Status == runtime.StatusRunning && r.CurrentContainerID == candidateID
		},
		20*time.Second,
	)
	assert.Equal(t, env.EngineImageRef, adoptedRecord.CurrentImageRef)

	adoptOp := harness.EventuallyOperationKind(t, env, adoptGameID, operation.OpKindReconcileAdopt, 5*time.Second)
	assert.Equal(t, operation.OutcomeSuccess, adoptOp.Outcome)
	assert.Equal(t, operation.OpSourceAutoReconcile, adoptOp.OpSource)
	assert.Equal(t, candidateID, adoptOp.ContainerID)
}
// TestNotification_ImagePullFailed drives Runtime Manager with a
// start envelope pointing at an unresolvable image reference. The
// start service must surface the failure on `runtime:job_results` and
// publish a `runtime.image_pull_failed` admin notification on
// `notification:intents`.
func TestNotification_ImagePullFailed(t *testing.T) {
	env := harness.NewEnv(t, harness.EnvOptions{})
	gameID := harness.IDFromTestName(t)
	const missingImage = "galaxy/integration-missing:0.0.0"
	harness.XAddStartJob(t, env, gameID, missingImage)

	// Job result publishes a failure with the stable image_pull_failed
	// code.
	jobResult := harness.WaitForJobResult(t, env,
		harness.JobOutcomeIs(gameID, ports.JobOutcomeFailure),
		60*time.Second,
	)
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, jobResult.ErrorCode)
	assert.Empty(t, jobResult.ContainerID, "failure must not surface a container id")
	assert.Empty(t, jobResult.EngineEndpoint, "failure must not surface an engine endpoint")
	assert.NotEmpty(t, jobResult.ErrorMessage, "failure must carry an operator-readable message")

	// Notification stream carries the matching admin-only intent.
	intent := harness.WaitForNotificationIntent(t, env,
		func(entry harness.NotificationIntentEntry) bool {
			if entry.NotificationType != string(notificationintent.NotificationTypeRuntimeImagePullFailed) {
				return false
			}
			payloadGameID, _ := entry.Payload["game_id"].(string)
			return payloadGameID == gameID
		},
		30*time.Second,
	)
	require.NotNil(t, intent.Payload, "notification intent must carry a payload")
	assert.Equal(t, gameID, intent.Payload["game_id"])
	assert.Equal(t, missingImage, intent.Payload["image_ref"])
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, intent.Payload["error_code"])

	// PG state: no running record was installed; operation_log
	// captures one failed start with the stable error code. Reuse the
	// record returned by the first query instead of issuing the same
	// query a second time via MustRuntimeRecord.
	record, err := harness.RuntimeRecord(t, env, gameID)
	if err == nil {
		// If an entry was upserted (rollback gap), it must not be
		// running.
		assert.NotEqual(t, runtime.StatusRunning, record.Status,
			"failed image pull must not leave a running record behind")
	}
	failureEntry := harness.EventuallyOperationKind(t, env, gameID, operation.OpKindStart, 5*time.Second)
	assert.Equal(t, operation.OutcomeFailure, failureEntry.Outcome)
	assert.Equal(t, startruntime.ErrorCodeImagePullFailed, failureEntry.ErrorCode)
}
// removeContainer terminates and removes the container behind RTM's
// back. Force=true is required because the engine has not received a
// SIGTERM and stop signal handling is engine-internal.
func removeContainer(t *testing.T, env *harness.Env, containerID string) {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	opts := dockercontainer.RemoveOptions{Force: true}
	err := env.Docker.Client().ContainerRemove(ctx, containerID, opts)
	require.NoError(t, err)
}
// runManualEngineContainer bypasses RTM and starts an engine container
// directly through the Docker SDK. The container carries every label
// the reconciler reads at adopt time (`com.galaxy.owner`,
// `com.galaxy.kind`, `com.galaxy.game_id`, `com.galaxy.engine_image_ref`,
// `com.galaxy.started_at_ms`) plus the per-game hostname so the
// computed `engine_endpoint` matches what `rtmanager` would have
// written.
func runManualEngineContainer(t *testing.T, env *harness.Env, gameID string) string {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	hostname := "galaxy-game-" + gameID
	// Full label set the reconciler's adopt path reads.
	labels := map[string]string{
		"com.galaxy.owner":            "rtmanager",
		"com.galaxy.kind":             "game-engine",
		"com.galaxy.game_id":          gameID,
		"com.galaxy.engine_image_ref": env.EngineImageRef,
		"com.galaxy.started_at_ms":    strconv.FormatInt(time.Now().UnixMilli(), 10),
	}
	containerCfg := &dockercontainer.Config{
		Image:    env.EngineImageRef,
		Hostname: hostname,
		Labels:   labels,
		Env: []string{
			"GAME_STATE_PATH=/var/lib/galaxy-game",
			"STORAGE_PATH=/var/lib/galaxy-game",
		},
	}
	networking := &network.NetworkingConfig{
		EndpointsConfig: map[string]*network.EndpointSettings{
			env.Network: {Aliases: []string{hostname}},
		},
	}

	name := fmt.Sprintf("galaxy-game-%s-manual", gameID)
	created, err := env.Docker.Client().ContainerCreate(ctx, containerCfg, &dockercontainer.HostConfig{}, networking, nil, name)
	require.NoError(t, err)
	t.Cleanup(func() {
		cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cleanupCancel()
		// Best-effort: the test body may already have removed it.
		_ = env.Docker.Client().ContainerRemove(cleanupCtx, created.ID, dockercontainer.RemoveOptions{Force: true})
	})

	require.NoError(t, env.Docker.Client().ContainerStart(ctx, created.ID, dockercontainer.StartOptions{}))
	return created.ID
}