chore: sync testing plan with gateway

2026-04-09 12:34:55 +02:00
parent c64c298d06
commit 9065b82fe2
5 changed files with 262 additions and 11 deletions
@@ -126,13 +126,21 @@ The testing plan follows this service order:
  * `GET /healthz`
  * `GET /readyz`
  * mounted public auth routes
+  * wrong-method and not-found handling
+  * public route-class classification for auth, browser bootstrap, browser asset, and misc traffic
+  * isolation of browser/public-auth rate-limit buckets
  * rejection of oversized public request bodies
+  * `RemoteAddr`-based public IP derivation that ignores forwarded proxy headers
  * public rate-limit behavior
  * stable projection of upstream public auth errors
+  * sensitive-field redaction in public-auth logs
+  * public OpenAPI contract validation
+  * admin `/metrics` availability only on the private admin listener
 * Authenticated gRPC envelope validation tests:

  * missing required fields
  * unsupported `protocol_version`
+  * parsed envelope attachment before delegate execution
  * malformed `payload_hash`
  * mismatched `payload_hash`
  * invalid signature
@@ -145,11 +153,15 @@ The testing plan follows this service order:
  * cache hit
  * cache miss
  * malformed cached record
+  * read-through local-cache warming after first fallback lookup
+  * local hit skips fallback lookup
  * cache invalidation/update handling
 * Response signing tests:

  * signed unary response generation
+  * unary response fails closed when the response signer is unavailable
  * signed bootstrap push event generation
+  * bootstrap push fails closed when the response signer is unavailable
  * signed stream event generation
 * Routing tests:

@@ -157,22 +169,35 @@ The testing plan follows this service order:
  * downstream timeout mapping
  * downstream availability mapping
  * authenticated internal command context construction
+  * verified trace/span context propagation downstream
+  * graceful drain of in-flight unary requests on shutdown
+  * sensitive transport material redaction in authenticated logs
 * Push tests:

  * `SubscribeEvents` binds `user_id` and `device_session_id`
  * bootstrap server-time event is emitted
+  * user-targeted events fan out to all matching user sessions
+  * session-targeted events reach only the addressed session
  * stream queue overflow closes only the affected stream
  * revoked session closes matching streams only
+  * revoked-session stream reopen is rejected
+  * active streams close with deterministic status on gateway shutdown
 * Anti-abuse tests:

  * IP/session/user/message-class buckets
  * interaction between rate limits and verification order
+  * authenticated/public anti-abuse bucket isolation
+  * authenticated policy-hook input and reject mapping
 * Redis adapter tests:

  * session cache lookup
  * replay reservation
  * client event stream consumption
  * session event stream consumption
+  * subscriber start-from-tail semantics
+  * malformed-event drop/evict-and-continue behavior
+  * later-event-wins behavior for session snapshots
+  * subscriber shutdown interrupts blocking reads

 ### Inter-service integration tests at this stage

@@ -180,7 +205,10 @@ The testing plan follows this service order:

  * session cache compatibility
  * replay reservation semantics
-  * event stream consumption for push
+  * session update warms local cache without repeated fallback lookups
+  * revoked snapshot invalidates authenticated requests without fallback lookup
+  * client-event stream consumption for push fan-out
+  * session-event stream consumption for revoke propagation and push teardown
 * `Gateway <-> stub Auth adapter`

  * public auth passthrough
@@ -194,7 +222,11 @@ The testing plan follows this service order:

 * Authenticated request verification pipeline remains stable.
 * Public auth routes remain mounted and deterministic.
+* Public route classes and anti-abuse buckets remain isolated.
+* Admin metrics stay off the public ingress.
 * Push bootstrap event remains signed and schema-compatible.
+* Push revoke and shutdown close streams with stable status mapping.
+* Gateway logs remain free of sensitive request/auth material.

 ---

@@ -207,6 +239,9 @@ The testing plan follows this service order:
  * challenge creation
  * TTL expiration
  * resend throttling
+  * `delivery_throttled` challenge creation without `UserDirectory` or `MailSender` calls
+  * `delivery_suppressed` behavior for blocked subjects
+  * expiry grace-window transition from `challenge_expired` to `challenge_not_found`
  * delivery state transitions
  * invalid confirm attempt limits
  * success-shaped `send-email-code` behavior
@@ -218,7 +253,9 @@ The testing plan follows this service order:
  * existing user
  * creatable user
  * short-window idempotent confirm retry
+  * projection repair on repeated confirm after prior publish failure
  * same challenge plus different public key failure
+  * confirm-race cleanup of superseded sessions
  * session-limit exceeded
 * Session lifecycle tests:

@@ -226,13 +263,14 @@ The testing plan follows this service order:
  * revoke one session
  * revoke all sessions
  * block user/email and revoke implied sessions
-  * already-revoked and already-blocked idempotent results
+  * `already_revoked`, `no_active_sessions`, and `already_blocked` acknowledgement semantics
 * Projection tests:

  * source-of-truth session write
  * gateway KV snapshot write
  * gateway session stream event publish
  * repeated publish idempotency
+  * stored session reread before publish to avoid stale active projection
 * Public API tests:

  * JSON decoding and unknown field rejection
@@ -269,6 +307,7 @@ The testing plan follows this service order:

  * login creates session
  * session projection becomes visible to gateway
+  * repeated confirm repairs a previously failed projection publish
  * revoked session invalidates gateway authentication path
  * revoked session closes gateway push stream
 * `Auth / Session <-> stub Mail`
@@ -282,6 +321,8 @@ The testing plan follows this service order:
 * `confirm-email-code` always returns a ready `device_session_id`.
 * Gateway continues authenticating from cache rather than synchronous auth lookups.
 * Confirm idempotency window behavior remains stable.
+* Projection repair-on-retry remains safe after source-of-truth commits.
+* Confirm-race cleanup does not leave multiple active winner sessions.
 * Session projection remains compatible with gateway expectations.

 ---
@@ -295,6 +336,8 @@ The testing plan follows this service order:
  * create user
  * find by email
  * normalized email uniqueness
+  * generated default `race_name` for new users
+  * `race_name` uniqueness and confusable-substitution policy
  * role assignment
  * tariff/entitlement fields
 * Profile tests:
@@ -302,22 +345,30 @@ The testing plan follows this service order:
  * allowed profile reads
  * allowed profile edits
  * forbidden profile edits
+  * self-service rejection for email and `declared_country` mutations
+  * `profile_update_block` sanction gating for profile/settings writes
  * settings reads/writes
+  * BCP 47 and IANA validation for settings values
 * Restriction/sanction tests:

  * block flags
  * user limits
  * override fields
  * declared current sanctions view
+  * effective sanction/limit snapshot shaping for downstream consumers
 * Entitlement tests:

  * free user
  * paid placeholder states
  * default simultaneous-game limit and per-user overrides
+  * entitlement, sanction, and limit interaction rules
 * Internal/admin-oriented tests:

  * resolve existing/creatable/blocked decision for auth
+  * `ensure-by-email` create-only `registration_context` semantics
  * current `declared_country` read/write path
+  * exact lookup by `user_id`, normalized `email`, and `race_name`
+  * paginated filtered listing with deterministic ordering
 * Storage and API contract tests:

  * public/trusted endpoints
@@ -346,6 +397,8 @@ The testing plan follows this service order:

 * User resolution outcomes remain stable for auth flow.
 * User-facing profile APIs do not bypass auth/session rules.
+* `registration_context` stays create-only and does not overwrite existing users.
+* `race_name` uniqueness policy remains stable for self-service and auth-created users.
 * User limit and sanction data stay compatible with downstream consumers.

 ---
@@ -733,35 +786,44 @@ The testing plan follows this service order:

  * enqueue authenticated observation
  * ingest validation
+  * malformed FlatBuffers payload rejection
+  * required-scalar-field validation
  * non-blocking acceptance
 * Worker pipeline tests:

  * geo lookup
+  * geo lookup miss handling
  * country aggregation
  * `usual_connection_country` derivation
  * suspicious multi-country detection
  * review recommendation calculation
+  * queue retry-safe processing
 * State tests:

  * durable `country_review_recommended`
  * declared-country version history
+  * declared-country version lifecycle: `recorded`, `applied`, `sync_failed`
  * session block action history
 * Admin/query API tests:

  * list review candidates
+  * stable ordering and pagination for candidate queries
  * read user geo profile
+  * grouping by `device_session_id` in review/read responses
  * apply approved declared-country change
 * Queue and lag tests:

  * backlog observability
  * duplicate observation safety
  * delayed processing behavior
+  * retry and failure observability

 ### Inter-service integration tests with already implemented components

 * `Gateway <-> Geo`

  * async observation publish from authenticated request context
+  * fail-open edge behavior when geo ingest is unavailable
 * `Geo <-> Auth / Session`

  * suspicious session block request
@@ -779,6 +841,7 @@ The testing plan follows this service order:
 ### Regression tests to keep from this stage onward

 * Geo processing never blocks the current gameplay request.
+* Review-recommended state remains queryable even when event/mail side effects fail.
 * Session suspicion affects only later requests via auth/session.
 * Geo owns history, while user service owns current effective declared country.

@@ -234,16 +234,27 @@ func (g runningAuthenticatedGateway) stop(t *testing.T) {
 func dialGatewayClient(t *testing.T, addr string) *grpc.ClientConn { // test helper: returns an insecure gRPC client connection to addr, failing the test if none is established in time
 	t.Helper()

-	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	var conn *grpc.ClientConn
+	require.Eventually(t, func() bool { // retry the dial every 10ms for up to 2s instead of relying on one attempt
+		ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) // each attempt gets its own short deadline
 		defer cancel()

-	conn, err := grpc.DialContext(
+		candidate, err := grpc.DialContext(
 			ctx,
 			addr,
 			grpc.WithTransportCredentials(insecure.NewCredentials()),
 			grpc.WithBlock(), // block until the connection is up or ctx expires, so a nil error means the dial really connected
 		)
-	require.NoError(t, err)
+		if err != nil {
+			if candidate != nil {
+				_ = candidate.Close() // best-effort cleanup of a failed attempt; the close error does not affect the retry
+			}
+			return false // signal require.Eventually to try again
+		}
+
+		conn = candidate // publish the successful connection to the enclosing function
+		return true
+	}, 2*time.Second, 10*time.Millisecond, "gateway did not accept gRPC connections")

 	return conn
 }
@@ -8,6 +8,7 @@ import (
 	"time"

 	"galaxy/gateway/internal/authn"
+	"galaxy/gateway/internal/config"
 	"galaxy/gateway/internal/downstream"
 	"galaxy/gateway/internal/testutil"
 	gatewayv1 "galaxy/gateway/proto/galaxy/gateway/v1"
@@ -143,6 +144,78 @@ func TestExecuteCommandMapsDownstreamUnavailableToUnavailable(t *testing.T) {
 	assert.Equal(t, 1, failingClient.executeCalls)
 }

+func TestExecuteCommandMapsDownstreamTimeoutToUnavailable(t *testing.T) { // a downstream call that only returns once its context ends must reach the client as codes.Unavailable
+	t.Parallel()
+
+	stallingClient := &recordingDownstreamClient{
+		executeFunc: func(ctx context.Context, _ downstream.AuthenticatedCommand) (downstream.UnaryResult, error) {
+			<-ctx.Done() // stall until the call context is done; presumably ended by the 50ms DownstreamTimeout set below — confirm in the gateway
+			return downstream.UnaryResult{}, ctx.Err()
+		},
+	}
+
+	server, runGateway := newTestGatewayWithGRPCConfig(t, newAuthenticatedGRPCConfigForTest(func(cfg *config.AuthenticatedGRPCConfig) {
+		cfg.DownstreamTimeout = 50 * time.Millisecond // short timeout keeps the test fast
+	}), ServerDependencies{
+		Router: downstream.NewStaticRouter(map[string]downstream.Client{
+			"fleet.move": stallingClient,
+		}),
+		SessionCache:   userMappedSessionCache(map[string]string{"device-session-123": "user-123"}),
+		ReplayStore:    staticReplayStore{},
+		ResponseSigner: newTestResponseSigner(),
+	})
+	defer runGateway.stop(t)
+
+	addr := waitForListenAddr(t, server)
+	conn := dialGatewayClient(t, addr)
+	defer func() {
+		require.NoError(t, conn.Close())
+	}()
+
+	client := gatewayv1.NewEdgeGatewayClient(conn)
+	_, err := client.ExecuteCommand(context.Background(), newValidExecuteCommandRequest())
+	require.Error(t, err)
+	assert.Equal(t, codes.Unavailable, status.Code(err)) // the timeout is reported with the same code and message as other downstream unavailability
+	assert.Equal(t, "downstream service is unavailable", status.Convert(err).Message())
+	assert.Equal(t, 1, stallingClient.executeCalls) // expects exactly one recorded downstream call
+}
+
+func TestExecuteCommandFailsClosedWhenResponseSignerUnavailable(t *testing.T) { // with unavailableResponseSigner wired in, ExecuteCommand must return codes.Unavailable even though downstream succeeds
+	t.Parallel()
+
+	successClient := &recordingDownstreamClient{
+		executeFunc: func(context.Context, downstream.AuthenticatedCommand) (downstream.UnaryResult, error) {
+			return downstream.UnaryResult{ // downstream itself reports success, so any failure comes from the gateway side
+				ResultCode:   "accepted",
+				PayloadBytes: []byte("downstream-response"),
+			}, nil
+		},
+	}
+
+	server, runGateway := newTestGateway(t, ServerDependencies{
+		Router: downstream.NewStaticRouter(map[string]downstream.Client{
+			"fleet.move": successClient,
+		}),
+		ResponseSigner: unavailableResponseSigner{}, // test double defined elsewhere; presumably always fails to sign — confirm
+		SessionCache:   userMappedSessionCache(map[string]string{"device-session-123": "user-123"}),
+		ReplayStore:    staticReplayStore{},
+	})
+	defer runGateway.stop(t)
+
+	addr := waitForListenAddr(t, server)
+	conn := dialGatewayClient(t, addr)
+	defer func() {
+		require.NoError(t, conn.Close())
+	}()
+
+	client := gatewayv1.NewEdgeGatewayClient(conn)
+	_, err := client.ExecuteCommand(context.Background(), newValidExecuteCommandRequest())
+	require.Error(t, err)
+	assert.Equal(t, codes.Unavailable, status.Code(err))
+	assert.Equal(t, "response signer is unavailable", status.Convert(err).Message())
+	assert.Equal(t, 1, successClient.executeCalls) // expects the downstream client to have been called once even though the RPC fails
+}
+
 func TestExecuteCommandPropagatesOTelSpanContextToDownstream(t *testing.T) {
 	t.Parallel()

@@ -216,6 +216,33 @@ func TestSubscribeEventsMissingReplayStoreFailsClosed(t *testing.T) {
 	assert.Equal(t, "replay store is unavailable", status.Convert(err).Message())
 }

+func TestSubscribeEventsFailsClosedWhenResponseSignerUnavailable(t *testing.T) { // with unavailableResponseSigner wired in, SubscribeEvents must fail with codes.Unavailable
+	t.Parallel()
+
+	server, runGateway := newTestGateway(t, ServerDependencies{
+		ResponseSigner: unavailableResponseSigner{}, // test double defined elsewhere; presumably always fails to sign — confirm
+		SessionCache: staticSessionCache{
+			lookupFunc: func(context.Context, string) (session.Record, error) {
+				return newActiveSessionRecord(), nil // every lookup succeeds, so the session cache cannot be the cause of the failure
+			},
+		},
+		ReplayStore: staticReplayStore{},
+	})
+	defer runGateway.stop(t)
+
+	addr := waitForListenAddr(t, server)
+	conn := dialGatewayClient(t, addr)
+	defer func() {
+		require.NoError(t, conn.Close())
+	}()
+
+	client := gatewayv1.NewEdgeGatewayClient(conn)
+	err := subscribeEventsError(t, context.Background(), client, newValidSubscribeEventsRequest()) // helper defined elsewhere; presumably opens the stream and returns the error it yields — confirm
+	require.Error(t, err)
+	assert.Equal(t, codes.Unavailable, status.Code(err))
+	assert.Equal(t, "response signer is unavailable", status.Convert(err).Message())
+}
+
 func TestServerLifecycle(t *testing.T) {
 	t.Parallel()

@@ -216,6 +216,83 @@ func TestPublicAntiAbuseBrowserClassBucketsStayIsolatedFromPublicAuth(t *testing
 	}
 }

+func TestPublicAntiAbuseUsesRemoteAddrInsteadOfForwardedHeaders(t *testing.T) { // the public-auth rate limit must follow Request.RemoteAddr and ignore X-Forwarded-For / Forwarded headers
+	t.Parallel()
+
+	tests := []struct {
+		name           string
+		headerKey      string // proxy header set on both requests
+		firstHeader    string // header value on the first request
+		secondHeader   string // header value on the second request
+		firstRemote    string // RemoteAddr of the first request
+		secondRemote   string // RemoteAddr of the second request
+		wantSecondCode int    // expected HTTP status of the second request
+	}{
+		{
+			name:           "same remote addr ignores x-forwarded-for changes",
+			headerKey:      "X-Forwarded-For",
+			firstHeader:    "198.51.100.10",
+			secondHeader:   "198.51.100.11",
+			firstRemote:    "192.0.2.10:1234",
+			secondRemote:   "192.0.2.10:1234",
+			wantSecondCode: http.StatusTooManyRequests, // changing the header must not escape the limit
+		},
+		{
+			name:           "different remote addr wins over shared forwarded header",
+			headerKey:      "Forwarded",
+			firstHeader:    "for=198.51.100.10",
+			secondHeader:   "for=198.51.100.10",
+			firstRemote:    "192.0.2.10:1234",
+			secondRemote:   "192.0.2.11:1234",
+			wantSecondCode: http.StatusOK, // a shared header value must not merge two distinct peers
+		},
+	}
+
+	for _, tt := range tests {
+		tt := tt // capture the range variable for the parallel subtest (needed before Go 1.22)
+
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			cfg := config.DefaultPublicHTTPConfig()
+			cfg.AntiAbuse.PublicAuth.RateLimit = config.PublicRateLimitConfig{ // one request per hour: a second request under the same limit is rejected
+				Requests: 1,
+				Window:   time.Hour,
+				Burst:    1,
+			}
+			cfg.AntiAbuse.SendEmailCodeIdentity.RateLimit = config.PublicRateLimitConfig{ // generous limit so this throttle cannot trip on the two requests sent here
+				Requests: 100,
+				Window:   time.Hour,
+				Burst:    100,
+			}
+
+			authService := &recordingAuthServiceClient{
+				sendEmailCodeResult: SendEmailCodeResult{
+					ChallengeID: "challenge-123",
+				},
+			}
+			handler := newPublicHandlerWithConfig(cfg, ServerDependencies{AuthService: authService})
+
+			first := sendEmailCodeRequest(`{"email":"pilot-one@example.com"}`)
+			first.RemoteAddr = tt.firstRemote
+			first.Header.Set(tt.headerKey, tt.firstHeader)
+
+			second := sendEmailCodeRequest(`{"email":"pilot-two@example.com"}`) // different email from the first request, so the two do not share an identity
+			second.RemoteAddr = tt.secondRemote
+			second.Header.Set(tt.headerKey, tt.secondHeader)
+
+			firstResp := httptest.NewRecorder()
+			handler.ServeHTTP(firstResp, first)
+
+			secondResp := httptest.NewRecorder()
+			handler.ServeHTTP(secondResp, second)
+
+			assert.Equal(t, http.StatusOK, firstResp.Code) // the first request is always allowed
+			assert.Equal(t, tt.wantSecondCode, secondResp.Code)
+		})
+	}
+}
+
 func TestPublicAntiAbuseSendEmailIdentityThrottle(t *testing.T) {
 	t.Parallel()