// Package rtmanagernotification_test exercises the Runtime Manager → // Notification Service boundary against real RTM + real Notification + // real Mail Service + real User Service running on testcontainers // PostgreSQL and Redis, with a real Docker daemon for RTM's readiness // pings. // // The boundary contract under test is: when a start job points at an // unresolvable image, RTM publishes one `runtime.image_pull_failed` // admin-only notification intent on `notification:intents`; the // Notification Service consumes the intent, resolves the admin email // recipient list from configuration, and hands the delivery to Mail // Service in template-mode. The suite asserts the wire shape on // `notification:intents` and the resulting Mail delivery record. // // Game Master is not booted: RTM emits the intent itself; Notification // resolves the audience from `NOTIFICATION_ADMIN_EMAILS_*`; the // scenario needs no user-targeted resolution. package rtmanagernotification_test import ( "bytes" "context" "encoding/json" "errors" "fmt" "io" "net/http" "net/url" "os" "path/filepath" "runtime" "strconv" "strings" "sync/atomic" "testing" "time" "galaxy/integration/internal/harness" "github.com/redis/go-redis/v9" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) const ( intentsStreamPrefix = "notification:intents" startJobsStreamPrefix = "runtime:start_jobs" stopJobsStreamPrefix = "runtime:stop_jobs" jobResultsStreamPrefix = "runtime:job_results" healthEventsStreamPrefix = "runtime:health_events" mailDeliveriesPath = "/api/v1/internal/deliveries" notificationTypeImagePull = "runtime.image_pull_failed" notificationTypeStartFailed = "runtime.container_start_failed" notificationTypeConfigInval = "runtime.start_config_invalid" expectedAdminEmailRecipient = "rtm-admin@example.com" expectedRTMProducer = "runtime_manager" missingImageRef = "galaxy/integration-missing:0.0.0" ) var suiteSeq atomic.Int64 // TestRTMImagePullFailureFlowsThroughNotificationToMail drives Runtime // Manager with a start envelope pointing at an unresolvable image // reference, then asserts: // // 1. RTM publishes one `runtime.image_pull_failed` intent on // `notification:intents` with the frozen admin payload. // 2. The Notification Service consumes it and fans out the matching // mail delivery to the configured admin recipient. // 3. Mail Service records the delivery with the right template id, // idempotency key, and template variables. // // The path covers the full producer → orchestrator → transport // pipeline that `TESTING.md §7` requests as the // `Runtime Manager ↔ Notification` boundary suite. func TestRTMImagePullFailureFlowsThroughNotificationToMail(t *testing.T) { h := newRTMNotificationHarness(t) gameID := uniqueGameID(t) h.publishStartJob(t, gameID, missingImageRef) // Step 1 — RTM publishes the admin notification intent. intent := h.waitForIntent(t, notificationTypeImagePull, gameID, 30*time.Second, ) assert.Equal(t, expectedRTMProducer, intent.Producer) assert.Equal(t, "admin_email", intent.AudienceKind) assert.Equal(t, gameID, intent.PayloadGameID) assert.Equal(t, missingImageRef, intent.PayloadImageRef) assert.Equal(t, "image_pull_failed", intent.PayloadErrorCode) assert.NotEmpty(t, intent.PayloadErrorMessage, "intent payload must carry operator-readable detail") assert.NotZero(t, intent.PayloadAttemptedAtMS) // Step 2 — Notification routes to Mail; Mail sends the delivery. idempotencyKey := "notification:" + intent.RedisEntryID + "/email:email:" + expectedAdminEmailRecipient delivery := h.eventuallyDelivery(t, url.Values{ "source": []string{"notification"}, "status": []string{"sent"}, "recipient": []string{expectedAdminEmailRecipient}, "template_id": []string{notificationTypeImagePull}, "idempotency_key": []string{idempotencyKey}, }) assert.Equal(t, "template", delivery.PayloadMode) assert.Equal(t, notificationTypeImagePull, delivery.TemplateID) assert.Equal(t, []string{expectedAdminEmailRecipient}, delivery.To) detail := h.getDelivery(t, delivery.DeliveryID) assert.Equal(t, "notification", detail.Source) assert.Equal(t, "template", detail.PayloadMode) assert.Equal(t, notificationTypeImagePull, detail.TemplateID) assert.Equal(t, idempotencyKey, detail.IdempotencyKey) assert.Equal(t, []string{expectedAdminEmailRecipient}, detail.To) require.NotNil(t, detail.TemplateVariables, "mail delivery must record template variables for admin triage") assert.Equal(t, gameID, detail.TemplateVariables["game_id"]) assert.Equal(t, missingImageRef, detail.TemplateVariables["image_ref"]) assert.Equal(t, "image_pull_failed", detail.TemplateVariables["error_code"]) } // rtmNotificationHarness owns the per-test infrastructure: shared // Redis, four real binaries (RTM, Notification, Mail, User), and the // per-test Docker network RTM's `/readyz` insists on. One harness per // test keeps each scenario fully isolated. type rtmNotificationHarness struct { redis *redis.Client rtmInternalURL string mailBaseURL string intentsStream string startJobsStream string stopJobsStream string jobResultsStream string healthEvents string rtmProcess *harness.Process notificationProcess *harness.Process mailProcess *harness.Process userServiceProcess *harness.Process } func newRTMNotificationHarness(t *testing.T) *rtmNotificationHarness { t.Helper() // `/readyz` of RTM pings the Docker daemon; skip the suite if no // Docker socket is reachable. harness.RequireDockerDaemon(t) redisRuntime := harness.StartRedisContainer(t) redisClient := redis.NewClient(&redis.Options{ Addr: redisRuntime.Addr, Protocol: 2, DisableIdentity: true, }) t.Cleanup(func() { require.NoError(t, redisClient.Close()) }) dockerNetwork := harness.EnsureDockerNetwork(t) userServiceAddr := harness.FreeTCPAddress(t) mailInternalAddr := harness.FreeTCPAddress(t) notificationInternalAddr := harness.FreeTCPAddress(t) rtmInternalAddr := harness.FreeTCPAddress(t) userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice") mailBinary := harness.BuildBinary(t, "mail", "./mail/cmd/mail") notificationBinary := harness.BuildBinary(t, "notification", "./notification/cmd/notification") rtmBinary := harness.BuildBinary(t, "rtmanager", "./rtmanager/cmd/rtmanager") // User Service: needed by Notification's port even though every // intent in this suite is admin-only. userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info" userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr userServiceEnv["OTEL_TRACES_EXPORTER"] = "none" userServiceEnv["OTEL_METRICS_EXPORTER"] = "none" userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv) waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr) // Per-test stream prefixes. suffix := strconv.FormatInt(suiteSeq.Add(1), 10) intentsStream := intentsStreamPrefix + ":" + suffix startJobsStream := startJobsStreamPrefix + ":" + suffix stopJobsStream := stopJobsStreamPrefix + ":" + suffix jobResultsStream := jobResultsStreamPrefix + ":" + suffix healthEvents := healthEventsStreamPrefix + ":" + suffix // Mail Service. mailEnv := harness.StartMailServicePersistence(t, redisRuntime.Addr).Env mailEnv["MAIL_LOG_LEVEL"] = "info" mailEnv["MAIL_INTERNAL_HTTP_ADDR"] = mailInternalAddr mailEnv["MAIL_TEMPLATE_DIR"] = mailTemplateDir(t) mailEnv["MAIL_SMTP_MODE"] = "stub" mailEnv["MAIL_STREAM_BLOCK_TIMEOUT"] = "100ms" mailEnv["MAIL_OPERATOR_REQUEST_TIMEOUT"] = time.Second.String() mailEnv["MAIL_SHUTDOWN_TIMEOUT"] = "2s" mailEnv["OTEL_TRACES_EXPORTER"] = "none" mailEnv["OTEL_METRICS_EXPORTER"] = "none" mailProcess := harness.StartProcess(t, "mail", mailBinary, mailEnv) waitForMailReady(t, mailProcess, "http://"+mailInternalAddr) // Notification Service. Admin-email envs route every runtime.* // intent to a shared rtm-admin recipient. notificationEnv := harness.StartNotificationServicePersistence(t, redisRuntime.Addr).Env notificationEnv["NOTIFICATION_LOG_LEVEL"] = "info" notificationEnv["NOTIFICATION_INTERNAL_HTTP_ADDR"] = notificationInternalAddr notificationEnv["NOTIFICATION_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr notificationEnv["NOTIFICATION_USER_SERVICE_TIMEOUT"] = time.Second.String() notificationEnv["NOTIFICATION_INTENTS_READ_BLOCK_TIMEOUT"] = "100ms" notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MIN"] = "100ms" notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MAX"] = "100ms" notificationEnv["NOTIFICATION_INTENTS_STREAM"] = intentsStream notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_IMAGE_PULL_FAILED"] = expectedAdminEmailRecipient notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_CONTAINER_START_FAILED"] = expectedAdminEmailRecipient notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_START_CONFIG_INVALID"] = expectedAdminEmailRecipient notificationEnv["OTEL_TRACES_EXPORTER"] = "none" notificationEnv["OTEL_METRICS_EXPORTER"] = "none" notificationProcess := harness.StartProcess(t, "notification", notificationBinary, notificationEnv) harness.WaitForHTTPStatus(t, notificationProcess, "http://"+notificationInternalAddr+"/readyz", http.StatusOK) // Runtime Manager. Lobby base URL points at notification's // ready-probe path so RTM's start-service ancillary GetGame call // resolves to a valid 200/404 surface even though no Lobby is // running. The start service treats the response as best-effort // and never aborts on an unparseable body. rtmEnv := harness.StartRTManagerServicePersistence(t, redisRuntime.Addr).Env rtmEnv["RTMANAGER_LOG_LEVEL"] = "info" rtmEnv["RTMANAGER_INTERNAL_HTTP_ADDR"] = rtmInternalAddr rtmEnv["RTMANAGER_LOBBY_INTERNAL_BASE_URL"] = "http://127.0.0.1:1" rtmEnv["RTMANAGER_LOBBY_INTERNAL_TIMEOUT"] = "200ms" rtmEnv["RTMANAGER_DOCKER_HOST"] = resolveDockerHost() rtmEnv["RTMANAGER_DOCKER_NETWORK"] = dockerNetwork rtmEnv["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir() rtmEnv["RTMANAGER_REDIS_START_JOBS_STREAM"] = startJobsStream rtmEnv["RTMANAGER_REDIS_STOP_JOBS_STREAM"] = stopJobsStream rtmEnv["RTMANAGER_REDIS_JOB_RESULTS_STREAM"] = jobResultsStream rtmEnv["RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"] = healthEvents rtmEnv["RTMANAGER_NOTIFICATION_INTENTS_STREAM"] = intentsStream rtmEnv["RTMANAGER_STREAM_BLOCK_TIMEOUT"] = "200ms" rtmEnv["RTMANAGER_RECONCILE_INTERVAL"] = "5s" rtmEnv["RTMANAGER_CLEANUP_INTERVAL"] = "5s" rtmEnv["RTMANAGER_INSPECT_INTERVAL"] = "5s" rtmEnv["RTMANAGER_PROBE_INTERVAL"] = "5s" rtmEnv["RTMANAGER_PROBE_TIMEOUT"] = "1s" rtmEnv["RTMANAGER_PROBE_FAILURES_THRESHOLD"] = "3" rtmEnv["RTMANAGER_GAME_LEASE_TTL_SECONDS"] = "30" rtmEnv["RTMANAGER_IMAGE_PULL_POLICY"] = "if_missing" rtmEnv["OTEL_TRACES_EXPORTER"] = "none" rtmEnv["OTEL_METRICS_EXPORTER"] = "none" rtmProcess := harness.StartProcess(t, "rtmanager", rtmBinary, rtmEnv) harness.WaitForHTTPStatus(t, rtmProcess, "http://"+rtmInternalAddr+"/readyz", http.StatusOK) return &rtmNotificationHarness{ redis: redisClient, rtmInternalURL: "http://" + rtmInternalAddr, mailBaseURL: "http://" + mailInternalAddr, intentsStream: intentsStream, startJobsStream: startJobsStream, stopJobsStream: stopJobsStream, jobResultsStream: jobResultsStream, healthEvents: healthEvents, rtmProcess: rtmProcess, notificationProcess: notificationProcess, mailProcess: mailProcess, userServiceProcess: userServiceProcess, } } func (h *rtmNotificationHarness) publishStartJob(t *testing.T, gameID, imageRef string) { t.Helper() _, err := h.redis.XAdd(context.Background(), &redis.XAddArgs{ Stream: h.startJobsStream, Values: map[string]any{ "game_id": gameID, "image_ref": imageRef, "requested_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10), }, }).Result() require.NoError(t, err) } // observedIntent stores the decoded fields of one notification intent // entry that the suite cares about. type observedIntent struct { RedisEntryID string NotificationType string Producer string AudienceKind string PayloadGameID string PayloadImageRef string PayloadErrorCode string PayloadErrorMessage string PayloadAttemptedAtMS int64 } func (h *rtmNotificationHarness) waitForIntent( t *testing.T, notificationType, gameID string, timeout time.Duration, ) observedIntent { t.Helper() deadline := time.Now().Add(timeout) for { entries, err := h.redis.XRange(context.Background(), h.intentsStream, "-", "+").Result() require.NoError(t, err) for _, entry := range entries { intent, ok := decodeIntent(entry) if !ok { continue } if intent.NotificationType != notificationType { continue } if intent.PayloadGameID != gameID { continue } return intent } if time.Now().After(deadline) { t.Fatalf("intent %s for game %s not observed on stream %s within %s\n%s", notificationType, gameID, h.intentsStream, timeout, h.rtmProcess.Logs()) } time.Sleep(50 * time.Millisecond) } } func decodeIntent(entry redis.XMessage) (observedIntent, bool) { notificationType, _ := entry.Values["notification_type"].(string) producer, _ := entry.Values["producer"].(string) audienceKind, _ := entry.Values["audience_kind"].(string) payloadJSON, _ := entry.Values["payload_json"].(string) if notificationType == "" { return observedIntent{}, false } out := observedIntent{ RedisEntryID: entry.ID, NotificationType: notificationType, Producer: producer, AudienceKind: audienceKind, } if payloadJSON == "" { return out, true } var payload struct { GameID string `json:"game_id"` ImageRef string `json:"image_ref"` ErrorCode string `json:"error_code"` ErrorMessage string `json:"error_message"` AttemptedAtMS int64 `json:"attempted_at_ms"` } if err := json.Unmarshal([]byte(payloadJSON), &payload); err == nil { out.PayloadGameID = payload.GameID out.PayloadImageRef = payload.ImageRef out.PayloadErrorCode = payload.ErrorCode out.PayloadErrorMessage = payload.ErrorMessage out.PayloadAttemptedAtMS = payload.AttemptedAtMS } return out, true } // mailDeliverySummary mirrors the public list-deliveries response of // Mail Service. type mailDeliverySummary struct { DeliveryID string `json:"delivery_id"` Source string `json:"source"` PayloadMode string `json:"payload_mode"` TemplateID string `json:"template_id"` Locale string `json:"locale"` To []string `json:"to"` Status string `json:"status"` } type mailDeliveryDetail struct { DeliveryID string `json:"delivery_id"` Source string `json:"source"` PayloadMode string `json:"payload_mode"` TemplateID string `json:"template_id"` Locale string `json:"locale"` To []string `json:"to"` IdempotencyKey string `json:"idempotency_key"` Status string `json:"status"` TemplateVariables map[string]any `json:"template_variables,omitempty"` } func (h *rtmNotificationHarness) eventuallyDelivery( t *testing.T, query url.Values, ) mailDeliverySummary { t.Helper() deadline := time.Now().Add(30 * time.Second) for { summary, found := h.findDelivery(t, query) if found { return summary } if time.Now().After(deadline) { t.Fatalf("mail delivery for query %v not observed within 30s\n%s", query, h.notificationProcess.Logs()) } time.Sleep(50 * time.Millisecond) } } func (h *rtmNotificationHarness) findDelivery( t *testing.T, query url.Values, ) (mailDeliverySummary, bool) { t.Helper() listURL := h.mailBaseURL + mailDeliveriesPath + "?" + query.Encode() req, err := http.NewRequest(http.MethodGet, listURL, nil) require.NoError(t, err) resp := doRequest(t, req) if resp.StatusCode != http.StatusOK { return mailDeliverySummary{}, false } var body struct { Items []mailDeliverySummary `json:"items"` } if err := json.Unmarshal([]byte(resp.Body), &body); err != nil { return mailDeliverySummary{}, false } if len(body.Items) == 0 { return mailDeliverySummary{}, false } return body.Items[0], true } func (h *rtmNotificationHarness) getDelivery(t *testing.T, deliveryID string) mailDeliveryDetail { t.Helper() req, err := http.NewRequest(http.MethodGet, h.mailBaseURL+mailDeliveriesPath+"/"+url.PathEscape(deliveryID), nil) require.NoError(t, err) resp := doRequest(t, req) require.Equalf(t, http.StatusOK, resp.StatusCode, "get delivery: %s", resp.Body) // Mail's detail response carries many fields the suite does not // assert on (cc, bcc, reply-to, attempt history, …). Use a // lenient decoder so additive contract changes do not break this // boundary test. var detail mailDeliveryDetail require.NoError(t, json.Unmarshal([]byte(resp.Body), &detail)) return detail } // --- shared helpers (mirror the conventions of integration/notificationmail) --- type httpResponse struct { StatusCode int Body string Header http.Header } func doRequest(t *testing.T, request *http.Request) httpResponse { t.Helper() client := &http.Client{ Timeout: 5 * time.Second, Transport: &http.Transport{DisableKeepAlives: true}, } t.Cleanup(client.CloseIdleConnections) response, err := client.Do(request) require.NoError(t, err) defer response.Body.Close() payload, err := io.ReadAll(response.Body) require.NoError(t, err) return httpResponse{ StatusCode: response.StatusCode, Body: string(payload), Header: response.Header.Clone(), } } func decodeStrictJSON(payload []byte, target any) error { decoder := json.NewDecoder(bytes.NewReader(payload)) decoder.DisallowUnknownFields() if err := decoder.Decode(target); err != nil { return err } if err := decoder.Decode(&struct{}{}); err != io.EOF { if err == nil { return errors.New("unexpected trailing JSON input") } return err } return nil } func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) { t.Helper() client := &http.Client{Timeout: 250 * time.Millisecond} t.Cleanup(client.CloseIdleConnections) deadline := time.Now().Add(10 * time.Second) for time.Now().Before(deadline) { req, err := http.NewRequest(http.MethodGet, baseURL+"/api/v1/internal/users/user-readiness-probe/exists", nil) require.NoError(t, err) response, err := client.Do(req) if err == nil { _, _ = io.Copy(io.Discard, response.Body) response.Body.Close() if response.StatusCode == http.StatusOK { return } } time.Sleep(25 * time.Millisecond) } t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs()) } func waitForMailReady(t *testing.T, process *harness.Process, baseURL string) { t.Helper() client := &http.Client{Timeout: 250 * time.Millisecond} t.Cleanup(client.CloseIdleConnections) deadline := time.Now().Add(10 * time.Second) for time.Now().Before(deadline) { req, err := http.NewRequest(http.MethodGet, baseURL+mailDeliveriesPath, nil) require.NoError(t, err) response, err := client.Do(req) if err == nil { _, _ = io.Copy(io.Discard, response.Body) response.Body.Close() if response.StatusCode == http.StatusOK { return } } time.Sleep(25 * time.Millisecond) } t.Fatalf("wait for mail readiness: timeout\n%s", process.Logs()) } func mailTemplateDir(t *testing.T) string { t.Helper() return filepath.Join(repositoryRoot(t), "mail", "templates") } func repositoryRoot(t *testing.T) string { t.Helper() _, file, _, ok := runtime.Caller(0) if !ok { t.Fatal("resolve repository root: runtime caller is unavailable") } return filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..")) } // uniqueGameID derives a deterministic, per-test, per-invocation game // id usable as the `game_id` field on `runtime:start_jobs` entries // without colliding when `-count` exceeds one. func uniqueGameID(t *testing.T) string { t.Helper() return fmt.Sprintf("game-%s-%d", sanitiseGameName(t.Name()), time.Now().UnixNano()) } func sanitiseGameName(name string) string { allowed := func(r rune) rune { switch { case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9': return r case r == '/' || r == '_' || r == '-': return '-' default: return -1 } } out := make([]rune, 0, len(name)) for _, r := range name { if mapped := allowed(r); mapped != -1 { out = append(out, mapped) } } return string(out) } // resolveDockerHost mirrors `rtmanager/integration/harness.runtime.go`: // honour DOCKER_HOST when the developer machine routes through colima // or a remote daemon, fall back to the standard unix path otherwise. func resolveDockerHost() string { if host := strings.TrimSpace(os.Getenv("DOCKER_HOST")); host != "" { return host } return "unix:///var/run/docker.sock" }