feat: runtime manager

This commit is contained in:
Ilia Denisov
2026-04-28 20:39:18 +02:00
committed by GitHub
parent e0a99b346b
commit a7cee15115
289 changed files with 45660 additions and 2207 deletions
@@ -0,0 +1,602 @@
// Package rtmanagernotification_test exercises the Runtime Manager →
// Notification Service boundary against real RTM + real Notification +
// real Mail Service + real User Service running on testcontainers
// PostgreSQL and Redis, with a real Docker daemon for RTM's readiness
// pings.
//
// The boundary contract under test is: when a start job points at an
// unresolvable image, RTM publishes one `runtime.image_pull_failed`
// admin-only notification intent on `notification:intents`; the
// Notification Service consumes the intent, resolves the admin email
// recipient list from configuration, and hands the delivery to Mail
// Service in template-mode. The suite asserts the wire shape on
// `notification:intents` and the resulting Mail delivery record.
//
// Game Master is not booted: RTM emits the intent itself; Notification
// resolves the audience from `NOTIFICATION_ADMIN_EMAILS_*`; the
// scenario needs no user-targeted resolution.
package rtmanagernotification_test
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync/atomic"
"testing"
"time"
"galaxy/integration/internal/harness"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
const (
	// Stream-name prefixes; newRTMNotificationHarness appends a
	// per-test numeric suffix so concurrent tests never share streams.
	intentsStreamPrefix = "notification:intents"
	startJobsStreamPrefix = "runtime:start_jobs"
	stopJobsStreamPrefix = "runtime:stop_jobs"
	jobResultsStreamPrefix = "runtime:job_results"
	healthEventsStreamPrefix = "runtime:health_events"
	// Mail Service internal HTTP path for listing/getting deliveries.
	mailDeliveriesPath = "/api/v1/internal/deliveries"
	// Notification types RTM can emit for start-job failures; only the
	// image-pull one is driven by this suite, the other two are used
	// to configure matching admin-email routes on Notification.
	notificationTypeImagePull = "runtime.image_pull_failed"
	notificationTypeStartFailed = "runtime.container_start_failed"
	notificationTypeConfigInval = "runtime.start_config_invalid"
	// Admin recipient wired into the NOTIFICATION_ADMIN_EMAILS_* envs.
	expectedAdminEmailRecipient = "rtm-admin@example.com"
	// Producer id asserted on the intent entry.
	expectedRTMProducer = "runtime_manager"
	// Image reference that cannot resolve, forcing the pull failure.
	missingImageRef = "galaxy/integration-missing:0.0.0"
)
var suiteSeq atomic.Int64
// TestRTMImagePullFailureFlowsThroughNotificationToMail drives Runtime
// Manager with a start envelope pointing at an unresolvable image
// reference, then asserts:
//
// 1. RTM publishes one `runtime.image_pull_failed` intent on
// `notification:intents` with the frozen admin payload.
// 2. The Notification Service consumes it and fans out the matching
// mail delivery to the configured admin recipient.
// 3. Mail Service records the delivery with the right template id,
// idempotency key, and template variables.
//
// The path covers the full producer → orchestrator → transport
// pipeline that `TESTING.md §7` requests as the
// `Runtime Manager ↔ Notification` boundary suite.
func TestRTMImagePullFailureFlowsThroughNotificationToMail(t *testing.T) {
	h := newRTMNotificationHarness(t)
	gameID := uniqueGameID(t)
	h.publishStartJob(t, gameID, missingImageRef)
	// Step 1 — RTM publishes the admin notification intent.
	intent := h.waitForIntent(t,
		notificationTypeImagePull,
		gameID,
		30*time.Second,
	)
	assert.Equal(t, expectedRTMProducer, intent.Producer)
	assert.Equal(t, "admin_email", intent.AudienceKind)
	assert.Equal(t, gameID, intent.PayloadGameID)
	assert.Equal(t, missingImageRef, intent.PayloadImageRef)
	assert.Equal(t, "image_pull_failed", intent.PayloadErrorCode)
	assert.NotEmpty(t, intent.PayloadErrorMessage,
		"intent payload must carry operator-readable detail")
	assert.NotZero(t, intent.PayloadAttemptedAtMS)
	// Step 2 — Notification routes to Mail; Mail sends the delivery.
	// NOTE(review): this mirrors the frozen idempotency-key shape
	// ("notification:<entry-id>/email:email:<recipient>") that the
	// Notification Service derives — confirm against its source if
	// the key format ever changes.
	idempotencyKey := "notification:" + intent.RedisEntryID +
		"/email:email:" + expectedAdminEmailRecipient
	delivery := h.eventuallyDelivery(t, url.Values{
		"source": []string{"notification"},
		"status": []string{"sent"},
		"recipient": []string{expectedAdminEmailRecipient},
		"template_id": []string{notificationTypeImagePull},
		"idempotency_key": []string{idempotencyKey},
	})
	assert.Equal(t, "template", delivery.PayloadMode)
	assert.Equal(t, notificationTypeImagePull, delivery.TemplateID)
	assert.Equal(t, []string{expectedAdminEmailRecipient}, delivery.To)
	// Step 3 — fetch the full record to assert fields the list view
	// does not carry (idempotency key, template variables).
	detail := h.getDelivery(t, delivery.DeliveryID)
	assert.Equal(t, "notification", detail.Source)
	assert.Equal(t, "template", detail.PayloadMode)
	assert.Equal(t, notificationTypeImagePull, detail.TemplateID)
	assert.Equal(t, idempotencyKey, detail.IdempotencyKey)
	assert.Equal(t, []string{expectedAdminEmailRecipient}, detail.To)
	require.NotNil(t, detail.TemplateVariables,
		"mail delivery must record template variables for admin triage")
	assert.Equal(t, gameID, detail.TemplateVariables["game_id"])
	assert.Equal(t, missingImageRef, detail.TemplateVariables["image_ref"])
	assert.Equal(t, "image_pull_failed", detail.TemplateVariables["error_code"])
}
// rtmNotificationHarness owns the per-test infrastructure: shared
// Redis, four real binaries (RTM, Notification, Mail, User), and the
// per-test Docker network RTM's `/readyz` insists on. One harness per
// test keeps each scenario fully isolated.
type rtmNotificationHarness struct {
	redis          *redis.Client // shared Redis every service in the suite talks to
	rtmInternalURL string        // base URL of RTM's internal HTTP listener
	mailBaseURL    string        // base URL of Mail's internal HTTP listener
	// Per-test stream names (shared prefix + unique suffix).
	intentsStream    string
	startJobsStream  string
	stopJobsStream   string
	jobResultsStream string
	healthEvents     string
	// Child processes, retained so timeout failures can dump logs.
	rtmProcess          *harness.Process
	notificationProcess *harness.Process
	mailProcess         *harness.Process
	userServiceProcess  *harness.Process
}
// newRTMNotificationHarness boots the full boundary stack for one
// test: testcontainers Redis, then — in dependency order — User
// Service, Mail, Notification, and finally Runtime Manager, each as a
// real binary with per-test stream names. Every resource is cleaned
// up via t.Cleanup by the harness helpers.
func newRTMNotificationHarness(t *testing.T) *rtmNotificationHarness {
	t.Helper()
	// `/readyz` of RTM pings the Docker daemon; skip the suite if no
	// Docker socket is reachable.
	harness.RequireDockerDaemon(t)
	redisRuntime := harness.StartRedisContainer(t)
	redisClient := redis.NewClient(&redis.Options{
		Addr: redisRuntime.Addr,
		Protocol: 2,
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		require.NoError(t, redisClient.Close())
	})
	dockerNetwork := harness.EnsureDockerNetwork(t)
	// Loopback listen addresses for each service's internal HTTP port.
	userServiceAddr := harness.FreeTCPAddress(t)
	mailInternalAddr := harness.FreeTCPAddress(t)
	notificationInternalAddr := harness.FreeTCPAddress(t)
	rtmInternalAddr := harness.FreeTCPAddress(t)
	userServiceBinary := harness.BuildBinary(t, "userservice", "./user/cmd/userservice")
	mailBinary := harness.BuildBinary(t, "mail", "./mail/cmd/mail")
	notificationBinary := harness.BuildBinary(t, "notification", "./notification/cmd/notification")
	rtmBinary := harness.BuildBinary(t, "rtmanager", "./rtmanager/cmd/rtmanager")
	// User Service: needed by Notification's port even though every
	// intent in this suite is admin-only.
	userServiceEnv := harness.StartUserServicePersistence(t, redisRuntime.Addr).Env
	userServiceEnv["USERSERVICE_LOG_LEVEL"] = "info"
	userServiceEnv["USERSERVICE_INTERNAL_HTTP_ADDR"] = userServiceAddr
	userServiceEnv["OTEL_TRACES_EXPORTER"] = "none"
	userServiceEnv["OTEL_METRICS_EXPORTER"] = "none"
	userServiceProcess := harness.StartProcess(t, "userservice", userServiceBinary, userServiceEnv)
	waitForUserServiceReady(t, userServiceProcess, "http://"+userServiceAddr)
	// Per-test stream prefixes.
	suffix := strconv.FormatInt(suiteSeq.Add(1), 10)
	intentsStream := intentsStreamPrefix + ":" + suffix
	startJobsStream := startJobsStreamPrefix + ":" + suffix
	stopJobsStream := stopJobsStreamPrefix + ":" + suffix
	jobResultsStream := jobResultsStreamPrefix + ":" + suffix
	healthEvents := healthEventsStreamPrefix + ":" + suffix
	// Mail Service.
	mailEnv := harness.StartMailServicePersistence(t, redisRuntime.Addr).Env
	mailEnv["MAIL_LOG_LEVEL"] = "info"
	mailEnv["MAIL_INTERNAL_HTTP_ADDR"] = mailInternalAddr
	mailEnv["MAIL_TEMPLATE_DIR"] = mailTemplateDir(t)
	// Stub SMTP keeps deliveries in Mail's records without a real relay.
	mailEnv["MAIL_SMTP_MODE"] = "stub"
	mailEnv["MAIL_STREAM_BLOCK_TIMEOUT"] = "100ms"
	mailEnv["MAIL_OPERATOR_REQUEST_TIMEOUT"] = time.Second.String()
	mailEnv["MAIL_SHUTDOWN_TIMEOUT"] = "2s"
	mailEnv["OTEL_TRACES_EXPORTER"] = "none"
	mailEnv["OTEL_METRICS_EXPORTER"] = "none"
	mailProcess := harness.StartProcess(t, "mail", mailBinary, mailEnv)
	waitForMailReady(t, mailProcess, "http://"+mailInternalAddr)
	// Notification Service. Admin-email envs route every runtime.*
	// intent to a shared rtm-admin recipient.
	notificationEnv := harness.StartNotificationServicePersistence(t, redisRuntime.Addr).Env
	notificationEnv["NOTIFICATION_LOG_LEVEL"] = "info"
	notificationEnv["NOTIFICATION_INTERNAL_HTTP_ADDR"] = notificationInternalAddr
	notificationEnv["NOTIFICATION_USER_SERVICE_BASE_URL"] = "http://" + userServiceAddr
	notificationEnv["NOTIFICATION_USER_SERVICE_TIMEOUT"] = time.Second.String()
	// Tight read/backoff intervals keep end-to-end latency low in tests.
	notificationEnv["NOTIFICATION_INTENTS_READ_BLOCK_TIMEOUT"] = "100ms"
	notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MIN"] = "100ms"
	notificationEnv["NOTIFICATION_ROUTE_BACKOFF_MAX"] = "100ms"
	notificationEnv["NOTIFICATION_INTENTS_STREAM"] = intentsStream
	notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_IMAGE_PULL_FAILED"] = expectedAdminEmailRecipient
	notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_CONTAINER_START_FAILED"] = expectedAdminEmailRecipient
	notificationEnv["NOTIFICATION_ADMIN_EMAILS_RUNTIME_START_CONFIG_INVALID"] = expectedAdminEmailRecipient
	notificationEnv["OTEL_TRACES_EXPORTER"] = "none"
	notificationEnv["OTEL_METRICS_EXPORTER"] = "none"
	notificationProcess := harness.StartProcess(t, "notification", notificationBinary, notificationEnv)
	harness.WaitForHTTPStatus(t, notificationProcess,
		"http://"+notificationInternalAddr+"/readyz", http.StatusOK)
	// Runtime Manager. Lobby base URL points at notification's
	// ready-probe path so RTM's start-service ancillary GetGame call
	// resolves to a valid 200/404 surface even though no Lobby is
	// running. The start service treats the response as best-effort
	// and never aborts on an unparseable body.
	rtmEnv := harness.StartRTManagerServicePersistence(t, redisRuntime.Addr).Env
	rtmEnv["RTMANAGER_LOG_LEVEL"] = "info"
	rtmEnv["RTMANAGER_INTERNAL_HTTP_ADDR"] = rtmInternalAddr
	// Unroutable address (port 1) — the lobby call is best-effort.
	rtmEnv["RTMANAGER_LOBBY_INTERNAL_BASE_URL"] = "http://127.0.0.1:1"
	rtmEnv["RTMANAGER_LOBBY_INTERNAL_TIMEOUT"] = "200ms"
	rtmEnv["RTMANAGER_DOCKER_HOST"] = resolveDockerHost()
	rtmEnv["RTMANAGER_DOCKER_NETWORK"] = dockerNetwork
	rtmEnv["RTMANAGER_GAME_STATE_ROOT"] = t.TempDir()
	rtmEnv["RTMANAGER_REDIS_START_JOBS_STREAM"] = startJobsStream
	rtmEnv["RTMANAGER_REDIS_STOP_JOBS_STREAM"] = stopJobsStream
	rtmEnv["RTMANAGER_REDIS_JOB_RESULTS_STREAM"] = jobResultsStream
	rtmEnv["RTMANAGER_REDIS_HEALTH_EVENTS_STREAM"] = healthEvents
	rtmEnv["RTMANAGER_NOTIFICATION_INTENTS_STREAM"] = intentsStream
	rtmEnv["RTMANAGER_STREAM_BLOCK_TIMEOUT"] = "200ms"
	rtmEnv["RTMANAGER_RECONCILE_INTERVAL"] = "5s"
	rtmEnv["RTMANAGER_CLEANUP_INTERVAL"] = "5s"
	rtmEnv["RTMANAGER_INSPECT_INTERVAL"] = "5s"
	rtmEnv["RTMANAGER_PROBE_INTERVAL"] = "5s"
	rtmEnv["RTMANAGER_PROBE_TIMEOUT"] = "1s"
	rtmEnv["RTMANAGER_PROBE_FAILURES_THRESHOLD"] = "3"
	rtmEnv["RTMANAGER_GAME_LEASE_TTL_SECONDS"] = "30"
	rtmEnv["RTMANAGER_IMAGE_PULL_POLICY"] = "if_missing"
	rtmEnv["OTEL_TRACES_EXPORTER"] = "none"
	rtmEnv["OTEL_METRICS_EXPORTER"] = "none"
	rtmProcess := harness.StartProcess(t, "rtmanager", rtmBinary, rtmEnv)
	harness.WaitForHTTPStatus(t, rtmProcess,
		"http://"+rtmInternalAddr+"/readyz", http.StatusOK)
	return &rtmNotificationHarness{
		redis: redisClient,
		rtmInternalURL: "http://" + rtmInternalAddr,
		mailBaseURL: "http://" + mailInternalAddr,
		intentsStream: intentsStream,
		startJobsStream: startJobsStream,
		stopJobsStream: stopJobsStream,
		jobResultsStream: jobResultsStream,
		healthEvents: healthEvents,
		rtmProcess: rtmProcess,
		notificationProcess: notificationProcess,
		mailProcess: mailProcess,
		userServiceProcess: userServiceProcess,
	}
}
// publishStartJob appends one start envelope — game id, image
// reference, and a millisecond request timestamp — to the per-test
// runtime start-jobs stream for RTM to consume.
func (h *rtmNotificationHarness) publishStartJob(t *testing.T, gameID, imageRef string) {
	t.Helper()
	envelope := &redis.XAddArgs{
		Stream: h.startJobsStream,
		Values: map[string]any{
			"game_id":         gameID,
			"image_ref":       imageRef,
			"requested_at_ms": strconv.FormatInt(time.Now().UnixMilli(), 10),
		},
	}
	require.NoError(t, h.redis.XAdd(context.Background(), envelope).Err())
}
// observedIntent stores the decoded fields of one notification intent
// entry that the suite cares about.
type observedIntent struct {
	RedisEntryID     string // stream entry id; feeds the idempotency-key assertion
	NotificationType string // e.g. "runtime.image_pull_failed"
	Producer         string // service id stamped on the intent
	AudienceKind     string // routing kind, asserted to be "admin_email"
	// Payload* fields are decoded from the entry's payload_json field.
	PayloadGameID        string
	PayloadImageRef      string
	PayloadErrorCode     string
	PayloadErrorMessage  string
	PayloadAttemptedAtMS int64
}
// waitForIntent polls the intents stream until an entry with the
// requested notification type and game id appears, or fails the test
// (dumping RTM's logs) once the timeout elapses.
func (h *rtmNotificationHarness) waitForIntent(
	t *testing.T,
	notificationType, gameID string,
	timeout time.Duration,
) observedIntent {
	t.Helper()
	giveUpAt := time.Now().Add(timeout)
	for {
		messages, err := h.redis.XRange(context.Background(), h.intentsStream, "-", "+").Result()
		require.NoError(t, err)
		for _, message := range messages {
			candidate, decoded := decodeIntent(message)
			if decoded &&
				candidate.NotificationType == notificationType &&
				candidate.PayloadGameID == gameID {
				return candidate
			}
		}
		if time.Now().After(giveUpAt) {
			t.Fatalf("intent %s for game %s not observed on stream %s within %s\n%s",
				notificationType, gameID, h.intentsStream, timeout, h.rtmProcess.Logs())
		}
		time.Sleep(50 * time.Millisecond)
	}
}
// decodeIntent extracts the fields this suite asserts on from a raw
// stream entry. The boolean is false when the entry carries no
// notification_type and therefore is not an intent. A missing or
// unparseable payload_json leaves the Payload* fields zeroed.
func decodeIntent(entry redis.XMessage) (observedIntent, bool) {
	stringField := func(key string) string {
		value, _ := entry.Values[key].(string)
		return value
	}
	kind := stringField("notification_type")
	if kind == "" {
		return observedIntent{}, false
	}
	intent := observedIntent{
		RedisEntryID:     entry.ID,
		NotificationType: kind,
		Producer:         stringField("producer"),
		AudienceKind:     stringField("audience_kind"),
	}
	rawPayload := stringField("payload_json")
	if rawPayload == "" {
		return intent, true
	}
	var payload struct {
		GameID        string `json:"game_id"`
		ImageRef      string `json:"image_ref"`
		ErrorCode     string `json:"error_code"`
		ErrorMessage  string `json:"error_message"`
		AttemptedAtMS int64  `json:"attempted_at_ms"`
	}
	if err := json.Unmarshal([]byte(rawPayload), &payload); err == nil {
		intent.PayloadGameID = payload.GameID
		intent.PayloadImageRef = payload.ImageRef
		intent.PayloadErrorCode = payload.ErrorCode
		intent.PayloadErrorMessage = payload.ErrorMessage
		intent.PayloadAttemptedAtMS = payload.AttemptedAtMS
	}
	return intent, true
}
// mailDeliverySummary mirrors the public list-deliveries response of
// Mail Service.
type mailDeliverySummary struct {
	DeliveryID  string   `json:"delivery_id"`  // key for the follow-up detail fetch
	Source      string   `json:"source"`       // originating service, e.g. "notification"
	PayloadMode string   `json:"payload_mode"` // "template" for intent-driven mail
	TemplateID  string   `json:"template_id"`
	Locale      string   `json:"locale"`
	To          []string `json:"to"`
	Status      string   `json:"status"` // e.g. "sent"
}
// mailDeliveryDetail mirrors the subset of the get-delivery response
// this suite asserts on; additional response fields are ignored by
// the lenient decoding in getDelivery.
type mailDeliveryDetail struct {
	DeliveryID        string         `json:"delivery_id"`
	Source            string         `json:"source"`
	PayloadMode       string         `json:"payload_mode"`
	TemplateID        string         `json:"template_id"`
	Locale            string         `json:"locale"`
	To                []string       `json:"to"`
	IdempotencyKey    string         `json:"idempotency_key"`
	Status            string         `json:"status"`
	TemplateVariables map[string]any `json:"template_variables,omitempty"`
}
// eventuallyDelivery polls Mail's list endpoint with the given query
// until a matching delivery appears, failing the test (with the
// Notification Service's logs) after 30 seconds.
func (h *rtmNotificationHarness) eventuallyDelivery(
	t *testing.T,
	query url.Values,
) mailDeliverySummary {
	t.Helper()
	giveUpAt := time.Now().Add(30 * time.Second)
	for {
		if summary, ok := h.findDelivery(t, query); ok {
			return summary
		}
		if time.Now().After(giveUpAt) {
			t.Fatalf("mail delivery for query %v not observed within 30s\n%s",
				query, h.notificationProcess.Logs())
		}
		time.Sleep(50 * time.Millisecond)
	}
}
// findDelivery performs one list request against Mail, filtered by
// query, and returns the first matching item. Any non-200 status,
// decode failure, or empty result reports "not found" so callers can
// keep polling.
func (h *rtmNotificationHarness) findDelivery(
	t *testing.T,
	query url.Values,
) (mailDeliverySummary, bool) {
	t.Helper()
	endpoint := h.mailBaseURL + mailDeliveriesPath + "?" + query.Encode()
	request, err := http.NewRequest(http.MethodGet, endpoint, nil)
	require.NoError(t, err)
	response := doRequest(t, request)
	if response.StatusCode != http.StatusOK {
		return mailDeliverySummary{}, false
	}
	var parsed struct {
		Items []mailDeliverySummary `json:"items"`
	}
	if err := json.Unmarshal([]byte(response.Body), &parsed); err != nil || len(parsed.Items) == 0 {
		return mailDeliverySummary{}, false
	}
	return parsed.Items[0], true
}
// getDelivery fetches one delivery record by id and requires a 200.
func (h *rtmNotificationHarness) getDelivery(t *testing.T, deliveryID string) mailDeliveryDetail {
	t.Helper()
	endpoint := h.mailBaseURL + mailDeliveriesPath + "/" + url.PathEscape(deliveryID)
	request, err := http.NewRequest(http.MethodGet, endpoint, nil)
	require.NoError(t, err)
	response := doRequest(t, request)
	require.Equalf(t, http.StatusOK, response.StatusCode, "get delivery: %s", response.Body)
	// Mail's detail response carries many fields the suite does not
	// assert on (cc, bcc, reply-to, attempt history, …). Use a
	// lenient decoder so additive contract changes do not break this
	// boundary test.
	var detail mailDeliveryDetail
	require.NoError(t, json.Unmarshal([]byte(response.Body), &detail))
	return detail
}
// --- shared helpers (mirror the conventions of integration/notificationmail) ---
// httpResponse is a fully-buffered snapshot of an HTTP response:
// status code, body text, and a clone of the headers. Buffering lets
// doRequest close the body eagerly.
type httpResponse struct {
	StatusCode int
	Body       string
	Header     http.Header
}
// doRequest executes the request with a 5s timeout and keep-alives
// disabled (every call gets a fresh connection, so a wedged service
// cannot poison later requests), then buffers the whole response.
func doRequest(t *testing.T, request *http.Request) httpResponse {
	t.Helper()
	httpClient := &http.Client{
		Timeout:   5 * time.Second,
		Transport: &http.Transport{DisableKeepAlives: true},
	}
	t.Cleanup(httpClient.CloseIdleConnections)
	resp, err := httpClient.Do(request)
	require.NoError(t, err)
	defer resp.Body.Close()
	raw, err := io.ReadAll(resp.Body)
	require.NoError(t, err)
	return httpResponse{
		StatusCode: resp.StatusCode,
		Body:       string(raw),
		Header:     resp.Header.Clone(),
	}
}
// decodeStrictJSON unmarshals payload into target and rejects any
// deviation from the exact expected shape: unknown object fields are
// an error, and so is any trailing input after the first JSON value.
//
// Fix: the EOF sentinel from the second Decode is now matched with
// errors.Is instead of a direct `!=` comparison, which is the robust
// idiom for sentinel errors (and tolerates wrapped EOFs).
func decodeStrictJSON(payload []byte, target any) error {
	decoder := json.NewDecoder(bytes.NewReader(payload))
	decoder.DisallowUnknownFields()
	if err := decoder.Decode(target); err != nil {
		return err
	}
	// A second decode must hit clean EOF; anything else means the
	// payload carried more than one JSON value.
	switch err := decoder.Decode(&struct{}{}); {
	case err == nil:
		return errors.New("unexpected trailing JSON input")
	case errors.Is(err, io.EOF):
		return nil
	default:
		return err
	}
}
// waitForUserServiceReady polls the user-exists probe endpoint until
// it answers 200, failing with the process logs after 10 seconds.
func waitForUserServiceReady(t *testing.T, process *harness.Process, baseURL string) {
	t.Helper()
	probe := &http.Client{Timeout: 250 * time.Millisecond}
	t.Cleanup(probe.CloseIdleConnections)
	probeURL := baseURL + "/api/v1/internal/users/user-readiness-probe/exists"
	for giveUpAt := time.Now().Add(10 * time.Second); time.Now().Before(giveUpAt); {
		request, err := http.NewRequest(http.MethodGet, probeURL, nil)
		require.NoError(t, err)
		if response, doErr := probe.Do(request); doErr == nil {
			// Drain and close so the client can reuse the connection.
			_, _ = io.Copy(io.Discard, response.Body)
			response.Body.Close()
			if response.StatusCode == http.StatusOK {
				return
			}
		}
		time.Sleep(25 * time.Millisecond)
	}
	t.Fatalf("wait for userservice readiness: timeout\n%s", process.Logs())
}
// waitForMailReady polls Mail's delivery-list endpoint until it
// answers 200, failing with the process logs after 10 seconds.
func waitForMailReady(t *testing.T, process *harness.Process, baseURL string) {
	t.Helper()
	probe := &http.Client{Timeout: 250 * time.Millisecond}
	t.Cleanup(probe.CloseIdleConnections)
	listURL := baseURL + mailDeliveriesPath
	for giveUpAt := time.Now().Add(10 * time.Second); time.Now().Before(giveUpAt); {
		request, err := http.NewRequest(http.MethodGet, listURL, nil)
		require.NoError(t, err)
		if response, doErr := probe.Do(request); doErr == nil {
			// Drain and close so the client can reuse the connection.
			_, _ = io.Copy(io.Discard, response.Body)
			response.Body.Close()
			if response.StatusCode == http.StatusOK {
				return
			}
		}
		time.Sleep(25 * time.Millisecond)
	}
	t.Fatalf("wait for mail readiness: timeout\n%s", process.Logs())
}
// mailTemplateDir resolves the checked-in Mail template directory
// relative to the repository root.
func mailTemplateDir(t *testing.T) string {
	t.Helper()
	root := repositoryRoot(t)
	return filepath.Join(root, "mail", "templates")
}
// repositoryRoot locates the repository checkout by walking two
// directories up from this source file's own path.
func repositoryRoot(t *testing.T) string {
	t.Helper()
	_, sourceFile, _, ok := runtime.Caller(0)
	if !ok {
		t.Fatal("resolve repository root: runtime caller is unavailable")
	}
	suiteDir := filepath.Dir(sourceFile)
	return filepath.Clean(filepath.Join(suiteDir, "..", ".."))
}
// uniqueGameID derives a per-test, per-invocation game id usable as
// the `game_id` field on `runtime:start_jobs` entries without
// colliding when `-count` exceeds one: sanitised test name plus a
// nanosecond timestamp.
func uniqueGameID(t *testing.T) string {
	t.Helper()
	sanitised := sanitiseGameName(t.Name())
	return fmt.Sprintf("game-%s-%d", sanitised, time.Now().UnixNano())
}
// sanitiseGameName maps a Go test name onto the character set used in
// game ids: ASCII letters and digits pass through, subtest slashes,
// underscores, and dashes collapse to '-', everything else is dropped.
//
// Improvement: the hand-rolled rune loop re-implemented strings.Map,
// which already drops runes mapped to a negative value; use the
// stdlib call instead.
func sanitiseGameName(name string) string {
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z',
			r >= 'A' && r <= 'Z',
			r >= '0' && r <= '9':
			return r
		case r == '/' || r == '_' || r == '-':
			return '-'
		default:
			// Negative return tells strings.Map to drop the rune.
			return -1
		}
	}, name)
}
// resolveDockerHost mirrors `rtmanager/integration/harness.runtime.go`:
// honour DOCKER_HOST when the developer machine routes through colima
// or a remote daemon, fall back to the standard unix path otherwise.
func resolveDockerHost() string {
	const defaultSocket = "unix:///var/run/docker.sock"
	host := strings.TrimSpace(os.Getenv("DOCKER_HOST"))
	if host == "" {
		return defaultSocket
	}
	return host
}