chore: sync testing plan with gateway

2026-04-09 12:34:55 +02:00
parent c64c298d06
commit 9065b82fe2
5 changed files with 262 additions and 11 deletions
@@ -126,13 +126,21 @@ The testing plan follows this service order:
  * `GET /healthz`
  * `GET /readyz`
  * mounted public auth routes
  * wrong-method and not-found handling
  * public route-class classification for auth, browser bootstrap, browser asset, and misc traffic
  * isolation of browser/public-auth rate-limit buckets
  * rejection of oversized public request bodies
  * `RemoteAddr`-based public IP derivation that ignores forwarded proxy headers
  * public rate-limit behavior
  * stable projection of upstream public auth errors
  * sensitive-field redaction in public-auth logs
  * public OpenAPI contract validation
  * admin `/metrics` availability only on the private admin listener
 * Authenticated gRPC envelope validation tests:
  * missing required fields
  * unsupported `protocol_version`
  * parsed envelope attachment before delegate execution
  * malformed `payload_hash`
  * mismatched `payload_hash`
  * invalid signature
@@ -145,11 +153,15 @@ The testing plan follows this service order:
  * cache hit
  * cache miss
  * malformed cached record
  * read-through local-cache warming after first fallback lookup
  * local hit skips fallback lookup
  * cache invalidation/update handling
 * Response signing tests:
  * signed unary response generation
  * unary response fails closed when the response signer is unavailable
  * signed bootstrap push event generation
  * bootstrap push fails closed when the response signer is unavailable
  * signed stream event generation
 * Routing tests:
@@ -157,22 +169,35 @@ The testing plan follows this service order:
  * downstream timeout mapping
  * downstream availability mapping
  * authenticated internal command context construction
  * verified trace/span context propagation downstream
  * graceful drain of in-flight unary requests on shutdown
  * sensitive transport material redaction in authenticated logs
 * Push tests:
  * `SubscribeEvents` binds `user_id` and `device_session_id`
  * bootstrap server-time event is emitted
  * user-targeted events fan out to all matching user sessions
  * session-targeted events reach only the addressed session
  * stream queue overflow closes only the affected stream
  * revoked session closes matching streams only
  * revoked-session stream reopen is rejected
  * active streams close with deterministic status on gateway shutdown
 * Anti-abuse tests:
  * IP/session/user/message-class buckets
  * interaction between rate limits and verification order
  * authenticated/public anti-abuse bucket isolation
  * authenticated policy-hook input and reject mapping
 * Redis adapter tests:
  * session cache lookup
  * replay reservation
  * client event stream consumption
  * session event stream consumption
  * subscriber start-from-tail semantics
  * malformed-event drop/evict-and-continue behavior
  * later-event-wins behavior for session snapshots
  * subscriber shutdown interrupts blocking reads
 ### Inter-service integration tests at this stage
@@ -180,7 +205,10 @@ The testing plan follows this service order:
  * session cache compatibility
  * replay reservation semantics
-  * event stream consumption for push
+  * session update warms local cache without repeated fallback lookups
  * revoked snapshot invalidates authenticated requests without fallback lookup
  * client-event stream consumption for push fan-out
  * session-event stream consumption for revoke propagation and push teardown
 * `Gateway <-> stub Auth adapter`
  * public auth passthrough
@@ -194,7 +222,11 @@ The testing plan follows this service order:
 * Authenticated request verification pipeline remains stable.
 * Public auth routes remain mounted and deterministic.
 * Public route classes and anti-abuse buckets remain isolated.
 * Admin metrics stay off the public ingress.
 * Push bootstrap event remains signed and schema-compatible.
 * Push revoke and shutdown close streams with stable status mapping.
 * Gateway logs remain free of sensitive request/auth material.
 ---
@@ -207,6 +239,9 @@ The testing plan follows this service order:
  * challenge creation
  * TTL expiration
  * resend throttling
  * `delivery_throttled` challenge creation without `UserDirectory` or `MailSender` calls
  * `delivery_suppressed` behavior for blocked subjects
  * expiry grace-window transition from `challenge_expired` to `challenge_not_found`
  * delivery state transitions
  * invalid confirm attempt limits
  * success-shaped `send-email-code` behavior
@@ -218,7 +253,9 @@ The testing plan follows this service order:
  * existing user
  * creatable user
  * short-window idempotent confirm retry
  * projection repair on repeated confirm after prior publish failure
  * same challenge plus different public key failure
  * confirm-race cleanup of superseded sessions
  * session-limit exceeded
 * Session lifecycle tests:
@@ -226,13 +263,14 @@ The testing plan follows this service order:
  * revoke one session
  * revoke all sessions
  * block user/email and revoke implied sessions
-  * already-revoked and already-blocked idempotent results
+  * `already_revoked`, `no_active_sessions`, and `already_blocked` acknowledgement semantics
 * Projection tests:
  * source-of-truth session write
  * gateway KV snapshot write
  * gateway session stream event publish
  * repeated publish idempotency
  * stored session reread before publish to avoid stale active projection
 * Public API tests:
  * JSON decoding and unknown field rejection
@@ -269,6 +307,7 @@ The testing plan follows this service order:
  * login creates session
  * session projection becomes visible to gateway
  * repeated confirm repairs a previously failed projection publish
  * revoked session invalidates gateway authentication path
  * revoked session closes gateway push stream
 * `Auth / Session <-> stub Mail`
@@ -282,6 +321,8 @@ The testing plan follows this service order:
 * `confirm-email-code` always returns a ready `device_session_id`.
 * Gateway continues authenticating from cache rather than synchronous auth lookups.
 * Confirm idempotency window behavior remains stable.
 * Projection repair-on-retry remains safe after source-of-truth commits.
 * Confirm-race cleanup does not leave multiple active winner sessions.
 * Session projection remains compatible with gateway expectations.
 ---
@@ -295,6 +336,8 @@ The testing plan follows this service order:
  * create user
  * find by email
  * normalized email uniqueness
  * generated default `race_name` for new users
  * `race_name` uniqueness and confusable-substitution policy
  * role assignment
  * tariff/entitlement fields
 * Profile tests:
@@ -302,22 +345,30 @@ The testing plan follows this service order:
  * allowed profile reads
  * allowed profile edits
  * forbidden profile edits
  * self-service rejection for e-mail and `declared_country` mutations
  * `profile_update_block` sanction gating for profile/settings writes
  * settings reads/writes
  * BCP 47 and IANA validation for settings values
 * Restriction/sanction tests:
  * block flags
  * user limits
  * override fields
  * declared current sanctions view
  * effective sanction/limit snapshot shaping for downstream consumers
 * Entitlement tests:
  * free user
  * paid placeholder states
  * default simultaneous-game limit and per-user overrides
  * entitlement, sanction, and limit interaction rules
 * Internal/admin-oriented tests:
  * resolve existing/creatable/blocked decision for auth
  * `ensure-by-email` create-only `registration_context` semantics
  * current `declared_country` read/write path
  * exact lookup by `user_id`, normalized `email`, and `race_name`
  * paginated filtered listing with deterministic ordering
 * Storage and API contract tests:
  * public/trusted endpoints
@@ -346,6 +397,8 @@ The testing plan follows this service order:
 * User resolution outcomes remain stable for auth flow.
 * User-facing profile APIs do not bypass auth/session rules.
 * `registration_context` stays create-only and does not overwrite existing users.
 * `race_name` uniqueness policy remains stable for self-service and auth-created users.
 * User limit and sanction data stay compatible with downstream consumers.
 ---
@@ -733,35 +786,44 @@ The testing plan follows this service order:
  * enqueue authenticated observation
  * ingest validation
  * malformed FlatBuffers payload rejection
  * required-scalar-field validation
  * non-blocking acceptance
 * Worker pipeline tests:
  * geo lookup
  * geo lookup miss handling
  * country aggregation
  * `usual_connection_country` derivation
  * suspicious multi-country detection
  * review recommendation calculation
  * queue retry-safe processing
 * State tests:
  * durable `country_review_recommended`
  * declared-country version history
  * declared-country version lifecycle: `recorded`, `applied`, `sync_failed`
  * session block action history
 * Admin/query API tests:
  * list review candidates
  * stable ordering and pagination for candidate queries
  * read user geo profile
  * grouping by `device_session_id` in review/read responses
  * apply approved declared-country change
 * Queue and lag tests:
  * backlog observability
  * duplicate observation safety
  * delayed processing behavior
  * retry and failure observability
 ### Inter-service integration tests with already implemented components
 * `Gateway <-> Geo`
  * async observation publish from authenticated request context
  * fail-open edge behavior when geo ingest is unavailable
 * `Geo <-> Auth / Session`
  * suspicious session block request
@@ -779,6 +841,7 @@ The testing plan follows this service order:
 ### Regression tests to keep from this stage onward
 * Geo processing never blocks the current gameplay request.
 * Review-recommended state remains queryable even when event/mail side effects fail.
 * Session suspicion affects only later requests via auth/session.
 * Geo owns history, while user service owns current effective declared country.
@@ -234,16 +234,27 @@ func (g runningAuthenticatedGateway) stop(t *testing.T) {
 // dialGatewayClient returns a gRPC client connection to addr for use in tests.
 // After this change it no longer performs a single blocking dial with a
 // one-second budget: it retries through require.Eventually, polling every
 // 10ms for up to 2s, and each attempt is a blocking insecure dial bounded by
 // its own 100ms context. A failed attempt closes any returned candidate
 // connection before reporting false. If no attempt succeeds in time the test
 // fails with "gateway did not accept gRPC connections".
 func dialGatewayClient(t *testing.T, addr string) *grpc.ClientConn {
 	t.Helper()
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	var conn *grpc.ClientConn
-	defer cancel()
+	require.Eventually(t, func() bool {
 		ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
 		defer cancel()
-	conn, err := grpc.DialContext(
+		candidate, err := grpc.DialContext(
-		ctx,
+			ctx,
-		addr,
+			addr,
-		grpc.WithTransportCredentials(insecure.NewCredentials()),
+			grpc.WithTransportCredentials(insecure.NewCredentials()),
-		grpc.WithBlock(),
+			grpc.WithBlock(),
-	)
+		)
-	require.NoError(t, err)
+		if err != nil {
 			if candidate != nil {
 				_ = candidate.Close()
 			}
 			return false
 		}
 		conn = candidate
 		return true
 	}, 2*time.Second, 10*time.Millisecond, "gateway did not accept gRPC connections")
 	return conn
 }
@@ -8,6 +8,7 @@ import (
 	"time"
 	"galaxy/gateway/internal/authn"
 	"galaxy/gateway/internal/config"
 	"galaxy/gateway/internal/downstream"
 	"galaxy/gateway/internal/testutil"
 	gatewayv1 "galaxy/gateway/proto/galaxy/gateway/v1"
@@ -143,6 +144,78 @@ func TestExecuteCommandMapsDownstreamUnavailableToUnavailable(t *testing.T) {
 	assert.Equal(t, 1, failingClient.executeCalls)
 }
 func TestExecuteCommandMapsDownstreamTimeoutToUnavailable(t *testing.T) {
 	t.Parallel()
 	stallingClient := &recordingDownstreamClient{
 		executeFunc: func(ctx context.Context, _ downstream.AuthenticatedCommand) (downstream.UnaryResult, error) {
 			<-ctx.Done()
 			return downstream.UnaryResult{}, ctx.Err()
 		},
 	}
 	server, runGateway := newTestGatewayWithGRPCConfig(t, newAuthenticatedGRPCConfigForTest(func(cfg *config.AuthenticatedGRPCConfig) {
 		cfg.DownstreamTimeout = 50 * time.Millisecond
 	}), ServerDependencies{
 		Router: downstream.NewStaticRouter(map[string]downstream.Client{
 			"fleet.move": stallingClient,
 		}),
 		SessionCache:   userMappedSessionCache(map[string]string{"device-session-123": "user-123"}),
 		ReplayStore:    staticReplayStore{},
 		ResponseSigner: newTestResponseSigner(),
 	})
 	defer runGateway.stop(t)
 	addr := waitForListenAddr(t, server)
 	conn := dialGatewayClient(t, addr)
 	defer func() {
 		require.NoError(t, conn.Close())
 	}()
 	client := gatewayv1.NewEdgeGatewayClient(conn)
 	_, err := client.ExecuteCommand(context.Background(), newValidExecuteCommandRequest())
 	require.Error(t, err)
 	assert.Equal(t, codes.Unavailable, status.Code(err))
 	assert.Equal(t, "downstream service is unavailable", status.Convert(err).Message())
 	assert.Equal(t, 1, stallingClient.executeCalls)
 }
 func TestExecuteCommandFailsClosedWhenResponseSignerUnavailable(t *testing.T) {
 	t.Parallel()
 	successClient := &recordingDownstreamClient{
 		executeFunc: func(context.Context, downstream.AuthenticatedCommand) (downstream.UnaryResult, error) {
 			return downstream.UnaryResult{
 				ResultCode:   "accepted",
 				PayloadBytes: []byte("downstream-response"),
 			}, nil
 		},
 	}
 	server, runGateway := newTestGateway(t, ServerDependencies{
 		Router: downstream.NewStaticRouter(map[string]downstream.Client{
 			"fleet.move": successClient,
 		}),
 		ResponseSigner: unavailableResponseSigner{},
 		SessionCache:   userMappedSessionCache(map[string]string{"device-session-123": "user-123"}),
 		ReplayStore:    staticReplayStore{},
 	})
 	defer runGateway.stop(t)
 	addr := waitForListenAddr(t, server)
 	conn := dialGatewayClient(t, addr)
 	defer func() {
 		require.NoError(t, conn.Close())
 	}()
 	client := gatewayv1.NewEdgeGatewayClient(conn)
 	_, err := client.ExecuteCommand(context.Background(), newValidExecuteCommandRequest())
 	require.Error(t, err)
 	assert.Equal(t, codes.Unavailable, status.Code(err))
 	assert.Equal(t, "response signer is unavailable", status.Convert(err).Message())
 	assert.Equal(t, 1, successClient.executeCalls)
 }
 func TestExecuteCommandPropagatesOTelSpanContextToDownstream(t *testing.T) {
 	t.Parallel()
@@ -216,6 +216,33 @@ func TestSubscribeEventsMissingReplayStoreFailsClosed(t *testing.T) {
 	assert.Equal(t, "replay store is unavailable", status.Convert(err).Message())
 }
 func TestSubscribeEventsFailsClosedWhenResponseSignerUnavailable(t *testing.T) {
 	t.Parallel()
 	server, runGateway := newTestGateway(t, ServerDependencies{
 		ResponseSigner: unavailableResponseSigner{},
 		SessionCache: staticSessionCache{
 			lookupFunc: func(context.Context, string) (session.Record, error) {
 				return newActiveSessionRecord(), nil
 			},
 		},
 		ReplayStore: staticReplayStore{},
 	})
 	defer runGateway.stop(t)
 	addr := waitForListenAddr(t, server)
 	conn := dialGatewayClient(t, addr)
 	defer func() {
 		require.NoError(t, conn.Close())
 	}()
 	client := gatewayv1.NewEdgeGatewayClient(conn)
 	err := subscribeEventsError(t, context.Background(), client, newValidSubscribeEventsRequest())
 	require.Error(t, err)
 	assert.Equal(t, codes.Unavailable, status.Code(err))
 	assert.Equal(t, "response signer is unavailable", status.Convert(err).Message())
 }
 func TestServerLifecycle(t *testing.T) {
 	t.Parallel()
@@ -216,6 +216,83 @@ func TestPublicAntiAbuseBrowserClassBucketsStayIsolatedFromPublicAuth(t *testing
 	}
 }
 // TestPublicAntiAbuseUsesRemoteAddrInsteadOfForwardedHeaders sends two
 // send-email-code requests through a public handler whose public-auth rate
 // limit allows a single request per hour. Each case varies RemoteAddr and one
 // forwarding header between the two requests and asserts the status of the
 // second one: throttled when RemoteAddr repeats, accepted when it differs.
 func TestPublicAntiAbuseUsesRemoteAddrInsteadOfForwardedHeaders(t *testing.T) {
 	t.Parallel()
 	type remoteAddrCase struct {
 		name           string
 		headerKey      string
 		firstHeader    string
 		secondHeader   string
 		firstRemote    string
 		secondRemote   string
 		wantSecondCode int
 	}
 	cases := []remoteAddrCase{
 		{
 			name:           "same remote addr ignores x-forwarded-for changes",
 			headerKey:      "X-Forwarded-For",
 			firstHeader:    "198.51.100.10",
 			secondHeader:   "198.51.100.11",
 			firstRemote:    "192.0.2.10:1234",
 			secondRemote:   "192.0.2.10:1234",
 			wantSecondCode: http.StatusTooManyRequests,
 		},
 		{
 			name:           "different remote addr wins over shared forwarded header",
 			headerKey:      "Forwarded",
 			firstHeader:    "for=198.51.100.10",
 			secondHeader:   "for=198.51.100.10",
 			firstRemote:    "192.0.2.10:1234",
 			secondRemote:   "192.0.2.11:1234",
 			wantSecondCode: http.StatusOK,
 		},
 	}
 	for _, tc := range cases {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 			t.Parallel()
 			cfg := config.DefaultPublicHTTPConfig()
 			// One public-auth request per hour, so a second request in the same bucket is throttled.
 			cfg.AntiAbuse.PublicAuth.RateLimit = config.PublicRateLimitConfig{Requests: 1, Window: time.Hour, Burst: 1}
 			// Give the per-identity limit a generous budget.
 			cfg.AntiAbuse.SendEmailCodeIdentity.RateLimit = config.PublicRateLimitConfig{Requests: 100, Window: time.Hour, Burst: 100}
 			authStub := &recordingAuthServiceClient{
 				sendEmailCodeResult: SendEmailCodeResult{ChallengeID: "challenge-123"},
 			}
 			handler := newPublicHandlerWithConfig(cfg, ServerDependencies{AuthService: authStub})
 			reqOne := sendEmailCodeRequest(`{"email":"pilot-one@example.com"}`)
 			reqOne.RemoteAddr = tc.firstRemote
 			reqOne.Header.Set(tc.headerKey, tc.firstHeader)
 			reqTwo := sendEmailCodeRequest(`{"email":"pilot-two@example.com"}`)
 			reqTwo.RemoteAddr = tc.secondRemote
 			reqTwo.Header.Set(tc.headerKey, tc.secondHeader)
 			recOne := httptest.NewRecorder()
 			handler.ServeHTTP(recOne, reqOne)
 			recTwo := httptest.NewRecorder()
 			handler.ServeHTTP(recTwo, reqTwo)
 			assert.Equal(t, http.StatusOK, recOne.Code)
 			assert.Equal(t, tc.wantSecondCode, recTwo.Code)
 		})
 	}
 }
 func TestPublicAntiAbuseSendEmailIdentityThrottle(t *testing.T) {
 	t.Parallel()