feat: backend service

Ilia Denisov
2026-05-06 10:14:55 +03:00
committed by GitHub
parent 3e2622757e
commit f446c6a2ac
1486 changed files with 49720 additions and 266401 deletions
+48
@@ -0,0 +1,48 @@
package push
import (
"fmt"
"strconv"
"sync/atomic"
)
// cursorWidth is the zero-padded decimal width applied to every cursor.
// 20 digits accommodate the full uint64 range so lexicographic order
// matches numeric order across the entire process lifetime.
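// (The largest uint64 value, 18446744073709551615, is exactly 20
// decimal digits, so "%020d" never overflows the fixed width.)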
const cursorWidth = 20
// cursorGenerator hands out monotonically increasing uint64 sequence
// numbers. Cursors restart from 0 on process boot; the ring buffer's
// freshness-window TTL bounds how long a cursor remains valid, so a
// fresh process intentionally invalidates every previously-issued
// cursor.
type cursorGenerator struct {
seq atomic.Uint64
}
// next returns the next sequence number. The first call returns 1.
func (g *cursorGenerator) next() uint64 {
return g.seq.Add(1)
}
// formatCursor renders n in the canonical zero-padded form so cursor
// strings sort identically to their numeric counterparts.
func formatCursor(n uint64) string {
return fmt.Sprintf("%0*d", cursorWidth, n)
}
// parseCursor decodes a cursor string back to its numeric value. An
// empty string maps to 0 ("subscribe from now"); malformed input also
// maps to 0 with ok=false so callers can log without rejecting the
// subscription — gateway is trusted, but a reconnect can race against a
// process restart that reset the in-memory sequence.
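// For example, parseCursor("00000000000000000042") returns (42, true),
// parseCursor("") returns (0, true), and parseCursor("x") returns (0, false).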
func parseCursor(s string) (uint64, bool) {
if s == "" {
return 0, true
}
n, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return 0, false
}
return n, true
}
+79
@@ -0,0 +1,79 @@
package push
import (
"sync"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestCursorGeneratorMonotonicAndConcurrent(t *testing.T) {
t.Parallel()
var g cursorGenerator
const goroutines = 64
const perGoroutine = 1000
results := make(chan uint64, goroutines*perGoroutine)
var wg sync.WaitGroup
wg.Add(goroutines)
for range goroutines {
go func() {
defer wg.Done()
for range perGoroutine {
results <- g.next()
}
}()
}
wg.Wait()
close(results)
seen := make(map[uint64]struct{}, goroutines*perGoroutine)
var max uint64
for n := range results {
_, dup := seen[n]
require.Falsef(t, dup, "duplicate cursor %d", n)
seen[n] = struct{}{}
if n > max {
max = n
}
}
assert.EqualValues(t, goroutines*perGoroutine, max)
}
func TestFormatAndParseCursor(t *testing.T) {
t.Parallel()
cases := []struct {
in uint64
out string
}{
{0, "00000000000000000000"},
{1, "00000000000000000001"},
{1234567890, "00000000001234567890"},
}
for _, tc := range cases {
s := formatCursor(tc.in)
assert.Equal(t, tc.out, s)
assert.Len(t, s, cursorWidth)
n, ok := parseCursor(s)
require.True(t, ok)
assert.Equal(t, tc.in, n)
}
n, ok := parseCursor("")
assert.True(t, ok)
assert.Zero(t, n)
n, ok = parseCursor("not-a-number")
assert.False(t, ok)
assert.Zero(t, n)
}
func TestFormatCursorLexicographicOrder(t *testing.T) {
t.Parallel()
a := formatCursor(9)
b := formatCursor(10)
assert.Less(t, a, b, "lexicographic order must match numeric order")
}
+161
@@ -0,0 +1,161 @@
package push
import (
"context"
"encoding/json"
"testing"
"time"
pushv1 "galaxy/backend/proto/push/v1"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func newTestService(t *testing.T) *Service {
t.Helper()
svc, err := NewService(ServiceConfig{
FreshnessWindow: time.Minute,
RingCapacity: 16,
PerConnBuffer: 8,
}, nil, nil)
require.NoError(t, err)
return svc
}
func TestPublishClientEventStampsCursorAndPayload(t *testing.T) {
t.Parallel()
svc := newTestService(t)
t.Cleanup(svc.Close)
userID := uuid.New()
devID := uuid.New()
payload := map[string]any{"game_id": "g1", "n": 7.0}
require.NoError(t, svc.PublishClientEvent(context.Background(), userID, &devID, "lobby.invite.received", payload, "route-1", "req-1", "trace-1"))
events, stale := svc.ring.since(0, time.Now())
require.False(t, stale)
require.Len(t, events, 1)
ev := events[0]
assert.Equal(t, formatCursor(1), ev.Cursor)
ce := ev.GetClientEvent()
require.NotNil(t, ce)
assert.Equal(t, userID.String(), ce.UserId)
assert.Equal(t, devID.String(), ce.DeviceSessionId)
assert.Equal(t, "lobby.invite.received", ce.Kind)
assert.Equal(t, "route-1", ce.EventId)
assert.Equal(t, "req-1", ce.RequestId)
assert.Equal(t, "trace-1", ce.TraceId)
var got map[string]any
require.NoError(t, json.Unmarshal(ce.Payload, &got))
assert.Equal(t, "g1", got["game_id"])
assert.EqualValues(t, 7.0, got["n"])
}
func TestPublishClientEventOmitsDeviceSessionWhenNil(t *testing.T) {
t.Parallel()
svc := newTestService(t)
t.Cleanup(svc.Close)
userID := uuid.New()
require.NoError(t, svc.PublishClientEvent(context.Background(), userID, nil, "x", nil, "", "", ""))
events, _ := svc.ring.since(0, time.Now())
require.Len(t, events, 1)
assert.Empty(t, events[0].GetClientEvent().DeviceSessionId)
}
func TestPublishClientEventRequiresUserAndKind(t *testing.T) {
t.Parallel()
svc := newTestService(t)
t.Cleanup(svc.Close)
require.Error(t, svc.PublishClientEvent(context.Background(), uuid.Nil, nil, "k", nil, "", "", ""))
require.Error(t, svc.PublishClientEvent(context.Background(), uuid.New(), nil, " ", nil, "", "", ""))
}
func TestPublishSessionInvalidationStampsCursor(t *testing.T) {
t.Parallel()
svc := newTestService(t)
t.Cleanup(svc.Close)
userID := uuid.New()
devID := uuid.New()
svc.PublishSessionInvalidation(context.Background(), devID, userID, "auth.revoke_session")
events, _ := svc.ring.since(0, time.Now())
require.Len(t, events, 1)
si := events[0].GetSessionInvalidation()
require.NotNil(t, si)
assert.Equal(t, userID.String(), si.UserId)
assert.Equal(t, devID.String(), si.DeviceSessionId)
assert.Equal(t, "auth.revoke_session", si.Reason)
}
func TestPublishSessionInvalidationFanOutOmitsDeviceSession(t *testing.T) {
t.Parallel()
svc := newTestService(t)
t.Cleanup(svc.Close)
userID := uuid.New()
svc.PublishSessionInvalidation(context.Background(), uuid.Nil, userID, "auth.revoke_all_for_user")
events, _ := svc.ring.since(0, time.Now())
require.Len(t, events, 1)
si := events[0].GetSessionInvalidation()
assert.Empty(t, si.DeviceSessionId)
assert.Equal(t, userID.String(), si.UserId)
}
func TestPublishCursorMonotonic(t *testing.T) {
t.Parallel()
svc := newTestService(t)
t.Cleanup(svc.Close)
userID := uuid.New()
for range 5 {
require.NoError(t, svc.PublishClientEvent(context.Background(), userID, nil, "k", nil, "", "", ""))
}
events, _ := svc.ring.since(0, time.Now())
require.Len(t, events, 5)
for i, ev := range events {
assert.Equal(t, formatCursor(uint64(i+1)), ev.Cursor)
}
}
func TestPublishOnClosedServiceIsNoop(t *testing.T) {
t.Parallel()
svc := newTestService(t)
svc.Close()
require.NoError(t, svc.PublishClientEvent(context.Background(), uuid.New(), nil, "k", nil, "", "", ""))
events, _ := svc.ring.since(0, time.Now())
assert.Empty(t, events)
}
// Compile-time interface checks: Service must satisfy the publisher
// contracts that internal/auth and internal/notification import.
var (
_ pushClientEventPublisher = (*Service)(nil)
_ pushSessionInvalidationEmitter = (*Service)(nil)
)
type pushClientEventPublisher interface {
PublishClientEvent(ctx context.Context, userID uuid.UUID, deviceSessionID *uuid.UUID, kind string, payload map[string]any, eventID, requestID, traceID string) error
}
type pushSessionInvalidationEmitter interface {
PublishSessionInvalidation(ctx context.Context, deviceSessionID, userID uuid.UUID, reason string)
}
// Make sure the publisher satisfies pushv1.PushServer at the type level.
var _ pushv1.PushServer = (*Service)(nil)
+108
@@ -0,0 +1,108 @@
package push
import (
"time"
pushv1 "galaxy/backend/proto/push/v1"
)
// ringEntry is one event stored in the in-memory replay buffer. The
// cursor is duplicated here for O(1) comparison without re-parsing
// event.Cursor.
type ringEntry struct {
cursor uint64
addedAt time.Time
event *pushv1.PushEvent
}
// ring is the in-memory replay buffer. Entries are evicted by either
// freshness-window TTL or capacity, whichever triggers first. The ring
// is not safe for concurrent use; the owning Service serialises access
// under its mutex.
type ring struct {
capacity int
ttl time.Duration
entries []ringEntry
lastEvicted uint64 // largest cursor evicted from the buffer
hasLastEvicted bool
}
func newRing(capacity int, ttl time.Duration) *ring {
return &ring{
capacity: capacity,
ttl: ttl,
entries: make([]ringEntry, 0, capacity),
}
}
// append records ev with its cursor and evicts entries past TTL or
// capacity. The caller is responsible for setting ev.Cursor to
// formatCursor(cursor) before calling.
func (r *ring) append(cursor uint64, ev *pushv1.PushEvent, now time.Time) {
r.evictExpired(now)
for len(r.entries) >= r.capacity {
r.evictHead()
}
r.entries = append(r.entries, ringEntry{cursor: cursor, addedAt: now, event: ev})
}
// since returns the events with cursor strictly greater than fromCursor
// in ascending cursor order. The boolean is true when the requested
// cursor is "stale" — either older than the oldest retained event or
// older than the last evicted cursor — meaning the caller missed at
// least one event that the ring no longer holds. Stale callers receive
// no replay and must resume from the live tail.
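// For example, with capacity 3 and cursors 1..5 appended, the ring
// retains 3..5: since(1) is stale (cursor 2 was evicted), since(2)
// replays 3..5, and since(5) replays nothing.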
func (r *ring) since(fromCursor uint64, now time.Time) ([]*pushv1.PushEvent, bool) {
r.evictExpired(now)
if len(r.entries) == 0 {
// An empty ring is never reported stale: there is nothing to replay
// either way, whether gateway is fully caught up, there has been no
// traffic, or every retained entry has aged out.
return nil, false
}
if r.hasLastEvicted && fromCursor < r.lastEvicted {
return nil, true
}
first := r.entries[0].cursor
if fromCursor+1 < first {
return nil, true
}
out := make([]*pushv1.PushEvent, 0)
for i := range r.entries {
if r.entries[i].cursor > fromCursor {
out = append(out, r.entries[i].event)
}
}
return out, false
}
// len reports the current number of retained entries; intended for
// tests and metrics.
func (r *ring) len() int {
return len(r.entries)
}
func (r *ring) evictExpired(now time.Time) {
if r.ttl <= 0 {
return
}
cutoff := now.Add(-r.ttl)
drop := 0
for drop < len(r.entries) && r.entries[drop].addedAt.Before(cutoff) {
drop++
}
if drop == 0 {
return
}
r.lastEvicted = r.entries[drop-1].cursor
r.hasLastEvicted = true
r.entries = append(r.entries[:0], r.entries[drop:]...)
}
func (r *ring) evictHead() {
if len(r.entries) == 0 {
return
}
r.lastEvicted = r.entries[0].cursor
r.hasLastEvicted = true
r.entries = append(r.entries[:0], r.entries[1:]...)
}
+105
@@ -0,0 +1,105 @@
package push
import (
"testing"
"time"
pushv1 "galaxy/backend/proto/push/v1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func mkEvent(cursor uint64, label string) *pushv1.PushEvent {
return &pushv1.PushEvent{
Cursor: formatCursor(cursor),
Kind: &pushv1.PushEvent_ClientEvent{
ClientEvent: &pushv1.ClientEvent{
Kind: label,
Payload: []byte(label),
},
},
}
}
func TestRingAppendAndSinceReturnsTail(t *testing.T) {
t.Parallel()
now := time.Unix(1_700_000_000, 0)
r := newRing(8, time.Minute)
for i := uint64(1); i <= 5; i++ {
r.append(i, mkEvent(i, "e"), now)
}
got, stale := r.since(2, now)
require.False(t, stale)
require.Len(t, got, 3)
assert.Equal(t, formatCursor(3), got[0].Cursor)
assert.Equal(t, formatCursor(4), got[1].Cursor)
assert.Equal(t, formatCursor(5), got[2].Cursor)
}
func TestRingSinceReturnsEmptyWhenCaughtUp(t *testing.T) {
t.Parallel()
now := time.Unix(1_700_000_000, 0)
r := newRing(8, time.Minute)
for i := uint64(1); i <= 3; i++ {
r.append(i, mkEvent(i, "e"), now)
}
got, stale := r.since(3, now)
require.False(t, stale)
assert.Empty(t, got)
got, stale = r.since(99, now)
require.False(t, stale)
assert.Empty(t, got)
}
func TestRingSinceFlagsStaleCursorBelowEvictedRange(t *testing.T) {
t.Parallel()
now := time.Unix(1_700_000_000, 0)
r := newRing(3, time.Minute)
for i := uint64(1); i <= 5; i++ {
r.append(i, mkEvent(i, "e"), now)
}
// Capacity=3 means cursors 1 and 2 were evicted.
require.Equal(t, 3, r.len())
got, stale := r.since(1, now)
assert.True(t, stale)
assert.Empty(t, got)
got, stale = r.since(2, now)
assert.False(t, stale)
require.Len(t, got, 3)
assert.Equal(t, formatCursor(3), got[0].Cursor)
}
func TestRingEvictsExpiredEntries(t *testing.T) {
t.Parallel()
t0 := time.Unix(1_700_000_000, 0)
r := newRing(8, 10*time.Second)
r.append(1, mkEvent(1, "e"), t0)
r.append(2, mkEvent(2, "e"), t0.Add(2*time.Second))
r.append(3, mkEvent(3, "e"), t0.Add(15*time.Second))
// The first two entries exceeded their 10s TTL and were already
// evicted when the third was appended at t0+15s; only the third
// remains, so a from-zero cursor is reported stale.
got, stale := r.since(0, t0.Add(13*time.Second))
assert.True(t, stale)
assert.Empty(t, got)
assert.Equal(t, 1, r.len())
}
func TestRingEmptyIsNeverStale(t *testing.T) {
t.Parallel()
r := newRing(4, time.Minute)
got, stale := r.since(42, time.Now())
assert.False(t, stale)
assert.Empty(t, got)
}
+145
@@ -0,0 +1,145 @@
// Package push hosts the backend gRPC listener used by gateway.
//
// Server owns the TCP listener and gRPC machinery. Service implements
// the PushServer interface and is registered against the gRPC server
// before Serve begins. On shutdown the server signals the service to
// drop its subscriptions, then performs the usual GracefulStop /
// forced-stop sequence.
package push
import (
"context"
"errors"
"fmt"
"net"
"sync"
"galaxy/backend/internal/config"
"galaxy/backend/internal/telemetry"
pushv1 "galaxy/backend/proto/push/v1"
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"go.uber.org/zap"
"google.golang.org/grpc"
)
// Server owns the gRPC push listener.
type Server struct {
cfg config.GRPCPushConfig
svc *Service
logger *zap.Logger
runtime *telemetry.Runtime
stateMu sync.RWMutex
server *grpc.Server
listener net.Listener
}
// NewServer constructs a gRPC push server bound to cfg. svc must not be
// nil; it is registered as the pushv1.PushServer implementation when
// Run starts.
func NewServer(cfg config.GRPCPushConfig, svc *Service, logger *zap.Logger, runtime *telemetry.Runtime) *Server {
if logger == nil {
logger = zap.NewNop()
}
return &Server{
cfg: cfg,
svc: svc,
logger: logger.Named("grpc_push"),
runtime: runtime,
}
}
// Run binds the listener and serves the gRPC surface until Shutdown closes
// the server.
func (s *Server) Run(ctx context.Context) error {
if ctx == nil {
return errors.New("run backend gRPC push server: nil context")
}
if err := ctx.Err(); err != nil {
return err
}
if s.svc == nil {
return errors.New("run backend gRPC push server: nil service")
}
listener, err := net.Listen("tcp", s.cfg.Addr)
if err != nil {
return fmt.Errorf("run backend gRPC push server: listen on %q: %w", s.cfg.Addr, err)
}
grpcServer := grpc.NewServer(
grpc.StatsHandler(otelgrpc.NewServerHandler()),
)
pushv1.RegisterPushServer(grpcServer, s.svc)
s.stateMu.Lock()
s.server = grpcServer
s.listener = listener
s.stateMu.Unlock()
s.logger.Info("backend gRPC push server started", zap.String("addr", listener.Addr().String()))
defer func() {
s.stateMu.Lock()
s.server = nil
s.listener = nil
s.stateMu.Unlock()
}()
err = grpcServer.Serve(listener)
switch {
case err == nil:
return nil
case errors.Is(err, grpc.ErrServerStopped):
s.logger.Info("backend gRPC push server stopped")
return nil
default:
return fmt.Errorf("run backend gRPC push server: serve on %q: %w", s.cfg.Addr, err)
}
}
// Shutdown attempts a graceful stop within ctx, falling back to a forced stop
// when ctx expires before GracefulStop returns. The configured per-listener
// timeout further bounds the wait. Active SubscribePush streams are closed
// first so GracefulStop is not blocked by long-lived server-streaming RPCs.
func (s *Server) Shutdown(ctx context.Context) error {
if ctx == nil {
return errors.New("shutdown backend gRPC push server: nil context")
}
s.stateMu.RLock()
server := s.server
s.stateMu.RUnlock()
if server == nil {
return nil
}
if s.svc != nil {
s.svc.Close()
}
shutdownCtx, cancel := context.WithCancel(ctx)
defer cancel()
if s.cfg.ShutdownTimeout > 0 {
shutdownCtx, cancel = context.WithTimeout(ctx, s.cfg.ShutdownTimeout)
defer cancel()
}
stopped := make(chan struct{})
go func() {
server.GracefulStop()
close(stopped)
}()
select {
case <-stopped:
return nil
case <-shutdownCtx.Done():
server.Stop()
<-stopped
return fmt.Errorf("shutdown backend gRPC push server: %w", shutdownCtx.Err())
}
}
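The Server/Service split described in the package comment implies a two-step wiring at boot: construct the Service, hand it to NewServer, run, then Shutdown on termination. A minimal sketch of a hypothetical main.go fragment (the address, timeouts and window below are illustrative assumptions, not values from this commit):
// Hypothetical wiring; all literal values are illustrative.
svc, err := push.NewService(push.ServiceConfig{
	FreshnessWindow: time.Minute, // BACKEND_FRESHNESS_WINDOW
}, logger, runtime)
if err != nil {
	return err
}
srv := push.NewServer(config.GRPCPushConfig{
	Addr:            ":50061",
	ShutdownTimeout: 10 * time.Second,
}, svc, logger, runtime)

go func() {
	if err := srv.Run(ctx); err != nil {
		logger.Error("push server exited", zap.Error(err))
	}
}()

// On SIGTERM: Shutdown closes active SubscribePush streams (svc.Close)
// before GracefulStop, and falls back to a forced Stop on timeout.
stopCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
_ = srv.Shutdown(stopCtx)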
+327
@@ -0,0 +1,327 @@
// Package push hosts the backend gRPC SubscribePush server and the
// publisher API consumed by other backend domains.
//
// Service implements pushv1.PushServer. It maintains:
//
// - a connection registry keyed by GatewaySubscribeRequest.gateway_client_id;
// - an in-memory ring buffer of recent PushEvent values with TTL equal
// to BACKEND_FRESHNESS_WINDOW;
// - a monotonic cursor generator stamped on every published event.
//
// Publisher methods (PublishClientEvent, PublishSessionInvalidation)
// satisfy the SessionInvalidator interface in internal/auth and the
// PushPublisher interface in internal/notification — main.go injects
// a single *Service into both wiring sites.
//
// See `backend/README.md` §7 and `backend/docs/flows.md` for cursor,
// ring buffer, and backpressure semantics.
package push
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"sync"
"time"
"galaxy/backend/internal/telemetry"
pushv1 "galaxy/backend/proto/push/v1"
"github.com/google/uuid"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
"go.uber.org/zap"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
// Default sizing for the ring buffer and per-connection delivery queue.
// The values are intentionally hard-coded: ring TTL is the operational
// dial (BACKEND_FRESHNESS_WINDOW) and the buffer sizes are chosen to
// comfortably absorb a freshness window of traffic at MVP rates.
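// (For instance, at an assumed 60-second freshness window, 1024 ring
// slots cover roughly 17 events per second of sustained traffic.)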
const (
defaultRingCapacity = 1024
defaultPerConnBuffer = 256
)
// ServiceConfig configures a Service. FreshnessWindow is required and
// fixes the ring buffer's per-event TTL. RingCapacity and PerConnBuffer
// fall back to the package defaults when zero. Now overrides time.Now
// for deterministic tests.
type ServiceConfig struct {
FreshnessWindow time.Duration
RingCapacity int
PerConnBuffer int
Now func() time.Time
}
// Service implements pushv1.PushServer and exposes the publisher API.
// One Service is shared by every backend domain that needs to push;
// it is safe for concurrent use.
type Service struct {
pushv1.UnimplementedPushServer
logger *zap.Logger
now func() time.Time
perConnBuffer int
mu sync.Mutex
closed bool
subs map[string]*subscription
ring *ring
cursorGen cursorGenerator
eventsTotal metric.Int64Counter
droppedTotal metric.Int64Counter
}
// NewService constructs a Service. A nil logger falls back to
// zap.NewNop. A nil runtime disables metric emission so tests can
// instantiate the service without the OpenTelemetry runtime.
func NewService(cfg ServiceConfig, logger *zap.Logger, runtime *telemetry.Runtime) (*Service, error) {
if cfg.FreshnessWindow <= 0 {
return nil, errors.New("push.NewService: FreshnessWindow must be positive")
}
if logger == nil {
logger = zap.NewNop()
}
if cfg.Now == nil {
cfg.Now = time.Now
}
if cfg.RingCapacity <= 0 {
cfg.RingCapacity = defaultRingCapacity
}
if cfg.PerConnBuffer <= 0 {
cfg.PerConnBuffer = defaultPerConnBuffer
}
s := &Service{
logger: logger.Named("push"),
now: cfg.Now,
perConnBuffer: cfg.PerConnBuffer,
subs: make(map[string]*subscription),
ring: newRing(cfg.RingCapacity, cfg.FreshnessWindow),
}
if runtime != nil {
if err := s.registerMetrics(runtime); err != nil {
return nil, fmt.Errorf("push.NewService: register metrics: %w", err)
}
}
return s, nil
}
// Close drops every active subscription and refuses new ones. It is
// safe to call multiple times. The owning Server must call Close before
// initiating GracefulStop so streaming handlers exit promptly.
func (s *Service) Close() {
s.mu.Lock()
defer s.mu.Unlock()
if s.closed {
return
}
s.closed = true
for clientID, sub := range s.subs {
close(sub.done)
delete(s.subs, clientID)
}
}
// PublishClientEvent enqueues a ClientEvent for delivery. payload is
// marshalled to JSON; deviceSessionID is optional. eventID, requestID
// and traceID are correlation identifiers that gateway forwards
// verbatim into the signed client envelope (typically the producing
// route id, the originating client request id, and the trace id of the
// span that produced the event); empty strings are forwarded
// unchanged. The method satisfies notification.PushPublisher.
func (s *Service) PublishClientEvent(_ context.Context, userID uuid.UUID, deviceSessionID *uuid.UUID, kind string, payload map[string]any, eventID, requestID, traceID string) error {
if userID == uuid.Nil {
return errors.New("push.PublishClientEvent: userID is required")
}
if strings.TrimSpace(kind) == "" {
return errors.New("push.PublishClientEvent: kind is required")
}
encoded, err := json.Marshal(payload)
if err != nil {
return fmt.Errorf("push.PublishClientEvent: marshal payload: %w", err)
}
ev := &pushv1.PushEvent{
Kind: &pushv1.PushEvent_ClientEvent{
ClientEvent: &pushv1.ClientEvent{
UserId: userID.String(),
Kind: kind,
Payload: encoded,
EventId: eventID,
RequestId: requestID,
TraceId: traceID,
},
},
}
if deviceSessionID != nil {
ev.GetClientEvent().DeviceSessionId = deviceSessionID.String()
}
s.publish(ev, "client_event")
return nil
}
// PublishSessionInvalidation enqueues a SessionInvalidation event. It
// satisfies auth.SessionInvalidator. deviceSessionID may be uuid.Nil to
// invalidate every session of userID.
func (s *Service) PublishSessionInvalidation(_ context.Context, deviceSessionID, userID uuid.UUID, reason string) {
if userID == uuid.Nil {
s.logger.Warn("push session invalidation skipped: userID is required",
zap.String("device_session_id", deviceSessionID.String()),
zap.String("reason", reason),
)
return
}
ev := &pushv1.PushEvent{
Kind: &pushv1.PushEvent_SessionInvalidation{
SessionInvalidation: &pushv1.SessionInvalidation{
UserId: userID.String(),
Reason: reason,
},
},
}
if deviceSessionID != uuid.Nil {
ev.GetSessionInvalidation().DeviceSessionId = deviceSessionID.String()
}
s.publish(ev, "session_invalidation")
}
func (s *Service) publish(ev *pushv1.PushEvent, kindLabel string) {
s.mu.Lock()
defer s.mu.Unlock()
if s.closed {
return
}
cursor := s.cursorGen.next()
ev.Cursor = formatCursor(cursor)
s.ring.append(cursor, ev, s.now())
if s.eventsTotal != nil {
s.eventsTotal.Add(context.Background(), 1, metric.WithAttributes(attribute.String("kind", kindLabel)))
}
for clientID, sub := range s.subs {
if dropped := sub.deliver(ev); dropped {
if s.droppedTotal != nil {
s.droppedTotal.Add(context.Background(), 1, metric.WithAttributes(attribute.String("gateway_client_id", clientID)))
}
s.logger.Warn("push subscription dropped event",
zap.String("gateway_client_id", clientID),
zap.String("cursor", ev.Cursor),
zap.String("event_kind", kindLabel),
)
}
}
}
// register installs a new subscription for clientID and returns the
// replay slice the caller must send before draining the live channel.
// An existing subscription for the same clientID is closed first so
// the previous reader goroutine exits.
func (s *Service) register(clientID, cursor string) (*subscription, []*pushv1.PushEvent, error) {
s.mu.Lock()
defer s.mu.Unlock()
if s.closed {
return nil, nil, status.Error(codes.Unavailable, "push service stopped")
}
if existing, ok := s.subs[clientID]; ok {
close(existing.done)
delete(s.subs, clientID)
s.logger.Info("push subscription replaced",
zap.String("gateway_client_id", clientID),
)
}
sub := &subscription{
clientID: clientID,
ch: make(chan *pushv1.PushEvent, s.perConnBuffer),
done: make(chan struct{}),
}
s.subs[clientID] = sub
from, ok := parseCursor(cursor)
if !ok {
s.logger.Warn("push subscribe with malformed cursor; resuming from live tail",
zap.String("gateway_client_id", clientID),
zap.String("cursor", cursor),
)
}
replay, stale := s.ring.since(from, s.now())
if stale {
s.logger.Info("push subscribe cursor stale; replay skipped",
zap.String("gateway_client_id", clientID),
zap.String("cursor", cursor),
)
} else if len(replay) > 0 {
s.logger.Info("push subscribe replay",
zap.String("gateway_client_id", clientID),
zap.String("cursor", cursor),
zap.Int("events", len(replay)),
)
}
return sub, replay, nil
}
// unregister removes sub from the registry when the reader goroutine
// exits. It is a no-op when sub has already been replaced — the
// replacement subscription owns the entry under the same clientID.
func (s *Service) unregister(sub *subscription) {
s.mu.Lock()
defer s.mu.Unlock()
if cur, ok := s.subs[sub.clientID]; ok && cur == sub {
delete(s.subs, sub.clientID)
}
}
// SubscriberCount reports the number of active subscriptions; used by
// metrics callbacks and tests.
func (s *Service) SubscriberCount() int {
s.mu.Lock()
defer s.mu.Unlock()
return len(s.subs)
}
func (s *Service) registerMetrics(runtime *telemetry.Runtime) error {
meter := runtime.MeterProvider().Meter("galaxy.backend/push")
subscribers, err := meter.Int64ObservableGauge(
"grpc_push_subscribers",
metric.WithDescription("Number of gateway clients currently subscribed to the backend push stream."),
metric.WithUnit("1"),
)
if err != nil {
return err
}
if _, err := meter.RegisterCallback(func(_ context.Context, o metric.Observer) error {
o.ObserveInt64(subscribers, int64(s.SubscriberCount()))
return nil
}, subscribers); err != nil {
return err
}
eventsTotal, err := meter.Int64Counter(
"grpc_push_events_total",
metric.WithDescription("Number of push events published, partitioned by event kind."),
metric.WithUnit("1"),
)
if err != nil {
return err
}
s.eventsTotal = eventsTotal
droppedTotal, err := meter.Int64Counter(
"grpc_push_dropped_total",
metric.WithDescription("Number of push events dropped because a subscriber buffer was full, partitioned by gateway client id."),
metric.WithUnit("1"),
)
if err != nil {
return err
}
s.droppedTotal = droppedTotal
return nil
}
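On the producing side, domains receive the shared *Service behind their own narrow interfaces (notification.PushPublisher, auth.SessionInvalidator, as the package comment notes). A hedged sketch of a calling site; the surrounding handler, logger and identifier variables are assumptions:
// Hypothetical producer call site.
payload := map[string]any{"game_id": gameID, "inviter_id": inviterID}
if err := pusher.PublishClientEvent(ctx, userID, &deviceSessionID,
	"lobby.invite.received", payload, eventID, requestID, traceID); err != nil {
	logger.Warn("push publish failed", zap.Error(err))
}

// uuid.Nil as deviceSessionID fans the invalidation out to every
// session of the user.
pusher.PublishSessionInvalidation(ctx, uuid.Nil, userID, "auth.revoke_all_for_user")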
+240
@@ -0,0 +1,240 @@
package push
import (
"context"
"net"
"testing"
"time"
pushv1 "galaxy/backend/proto/push/v1"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/status"
"google.golang.org/grpc/test/bufconn"
)
const bufconnBufferSize = 1024 * 1024
// startBufconnServer wires svc into an in-process gRPC server reachable
// through a bufconn dialer. The returned cleanup function stops the
// server and closes the listener.
func startBufconnServer(t *testing.T, svc *Service) (pushv1.PushClient, func()) {
t.Helper()
lis := bufconn.Listen(bufconnBufferSize)
server := grpc.NewServer()
pushv1.RegisterPushServer(server, svc)
go func() {
_ = server.Serve(lis)
}()
conn, err := grpc.NewClient(
"passthrough://bufnet",
grpc.WithContextDialer(func(_ context.Context, _ string) (net.Conn, error) {
return lis.DialContext(context.Background())
}),
grpc.WithTransportCredentials(insecure.NewCredentials()),
)
require.NoError(t, err)
cleanup := func() {
_ = conn.Close()
server.Stop()
_ = lis.Close()
}
return pushv1.NewPushClient(conn), cleanup
}
func recvOne(t *testing.T, stream pushv1.Push_SubscribePushClient, timeout time.Duration) (*pushv1.PushEvent, error) {
t.Helper()
type result struct {
ev *pushv1.PushEvent
err error
}
ch := make(chan result, 1)
go func() {
ev, err := stream.Recv()
ch <- result{ev, err}
}()
select {
case r := <-ch:
return r.ev, r.err
case <-time.After(timeout):
t.Fatalf("timed out waiting for push event after %s", timeout)
return nil, nil
}
}
func TestSubscribePushDeliversLiveEvents(t *testing.T) {
t.Parallel()
svc, err := NewService(ServiceConfig{FreshnessWindow: time.Minute, RingCapacity: 16, PerConnBuffer: 8}, nil, nil)
require.NoError(t, err)
t.Cleanup(svc.Close)
client, cleanup := startBufconnServer(t, svc)
defer cleanup()
stream, err := client.SubscribePush(t.Context(), &pushv1.GatewaySubscribeRequest{GatewayClientId: "gw-1"})
require.NoError(t, err)
require.Eventually(t, func() bool { return svc.SubscriberCount() == 1 }, time.Second, 5*time.Millisecond)
userID := uuid.New()
require.NoError(t, svc.PublishClientEvent(context.Background(), userID, nil, "k", nil, "", "", ""))
ev, err := recvOne(t, stream, time.Second)
require.NoError(t, err)
assert.Equal(t, formatCursor(1), ev.Cursor)
assert.Equal(t, userID.String(), ev.GetClientEvent().UserId)
}
func TestSubscribePushReplaysPastEventsOnReconnect(t *testing.T) {
t.Parallel()
svc, err := NewService(ServiceConfig{FreshnessWindow: time.Minute, RingCapacity: 16, PerConnBuffer: 8}, nil, nil)
require.NoError(t, err)
t.Cleanup(svc.Close)
userID := uuid.New()
for range 3 {
require.NoError(t, svc.PublishClientEvent(context.Background(), userID, nil, "k", nil, "", "", ""))
}
client, cleanup := startBufconnServer(t, svc)
defer cleanup()
stream, err := client.SubscribePush(t.Context(), &pushv1.GatewaySubscribeRequest{GatewayClientId: "gw-1", Cursor: formatCursor(1)})
require.NoError(t, err)
for i := uint64(2); i <= 3; i++ {
ev, err := recvOne(t, stream, time.Second)
require.NoError(t, err)
assert.Equal(t, formatCursor(i), ev.Cursor)
}
}
func TestSubscribePushSkipsReplayWhenCursorStale(t *testing.T) {
t.Parallel()
svc, err := NewService(ServiceConfig{FreshnessWindow: time.Minute, RingCapacity: 2, PerConnBuffer: 8}, nil, nil)
require.NoError(t, err)
t.Cleanup(svc.Close)
userID := uuid.New()
for range 4 {
require.NoError(t, svc.PublishClientEvent(context.Background(), userID, nil, "k", nil, "", "", ""))
}
// Ring capacity 2 means cursors 1 and 2 are evicted.
client, cleanup := startBufconnServer(t, svc)
defer cleanup()
stream, err := client.SubscribePush(t.Context(), &pushv1.GatewaySubscribeRequest{GatewayClientId: "gw-1", Cursor: formatCursor(1)})
require.NoError(t, err)
require.Eventually(t, func() bool { return svc.SubscriberCount() == 1 }, time.Second, 5*time.Millisecond)
// Stale cursor → no replay; live publish must arrive.
require.NoError(t, svc.PublishClientEvent(context.Background(), userID, nil, "k", nil, "", "", ""))
ev, err := recvOne(t, stream, time.Second)
require.NoError(t, err)
assert.Equal(t, formatCursor(5), ev.Cursor)
}
func TestSubscribePushReplacesExistingClientID(t *testing.T) {
t.Parallel()
svc, err := NewService(ServiceConfig{FreshnessWindow: time.Minute, RingCapacity: 8, PerConnBuffer: 8}, nil, nil)
require.NoError(t, err)
t.Cleanup(svc.Close)
client, cleanup := startBufconnServer(t, svc)
defer cleanup()
stream1, err := client.SubscribePush(t.Context(), &pushv1.GatewaySubscribeRequest{GatewayClientId: "gw-1"})
require.NoError(t, err)
require.Eventually(t, func() bool { return svc.SubscriberCount() == 1 }, time.Second, 5*time.Millisecond)
stream2, err := client.SubscribePush(t.Context(), &pushv1.GatewaySubscribeRequest{GatewayClientId: "gw-1"})
require.NoError(t, err)
// First stream must terminate with Aborted.
_, err = recvOne(t, stream1, time.Second)
require.Error(t, err)
assert.Equal(t, codes.Aborted, status.Code(err))
// Subscriber count returns to one (the replacement).
require.Eventually(t, func() bool { return svc.SubscriberCount() == 1 }, time.Second, 5*time.Millisecond)
// Live publish reaches the replacement.
require.NoError(t, svc.PublishClientEvent(context.Background(), uuid.New(), nil, "k", nil, "", "", ""))
ev, err := recvOne(t, stream2, time.Second)
require.NoError(t, err)
assert.NotEmpty(t, ev.Cursor)
}
func TestSubscribePushRejectsEmptyClientID(t *testing.T) {
t.Parallel()
svc, err := NewService(ServiceConfig{FreshnessWindow: time.Minute, RingCapacity: 4, PerConnBuffer: 4}, nil, nil)
require.NoError(t, err)
t.Cleanup(svc.Close)
client, cleanup := startBufconnServer(t, svc)
defer cleanup()
stream, err := client.SubscribePush(t.Context(), &pushv1.GatewaySubscribeRequest{})
require.NoError(t, err)
_, err = stream.Recv()
require.Error(t, err)
assert.Equal(t, codes.InvalidArgument, status.Code(err))
}
func TestSubscriptionDeliverDropsOldestOnOverflow(t *testing.T) {
t.Parallel()
sub := &subscription{
clientID: "gw-1",
ch: make(chan *pushv1.PushEvent, 2),
done: make(chan struct{}),
}
first := mkEvent(1, "a")
second := mkEvent(2, "b")
third := mkEvent(3, "c")
assert.False(t, sub.deliver(first))
assert.False(t, sub.deliver(second))
assert.True(t, sub.deliver(third), "third deliver must report a drop")
got1 := <-sub.ch
got2 := <-sub.ch
assert.Equal(t, second, got1, "oldest event (first) was dropped")
assert.Equal(t, third, got2)
}
func TestServiceCloseTerminatesActiveStream(t *testing.T) {
t.Parallel()
svc, err := NewService(ServiceConfig{FreshnessWindow: time.Minute, RingCapacity: 4, PerConnBuffer: 4}, nil, nil)
require.NoError(t, err)
client, cleanup := startBufconnServer(t, svc)
defer cleanup()
stream, err := client.SubscribePush(t.Context(), &pushv1.GatewaySubscribeRequest{GatewayClientId: "gw-1"})
require.NoError(t, err)
require.Eventually(t, func() bool { return svc.SubscriberCount() == 1 }, time.Second, 5*time.Millisecond)
svc.Close()
_, err = recvOne(t, stream, time.Second)
require.Error(t, err)
assert.Equal(t, codes.Aborted, status.Code(err))
}
+48
@@ -0,0 +1,48 @@
package push
import (
"strings"
pushv1 "galaxy/backend/proto/push/v1"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
// SubscribePush is the gRPC server handler. It registers the connection
// in the subscription registry, replays any in-buffer events newer than
// the requested cursor, and then streams live events until the client
// cancels, the subscription is replaced by a newer connection from the
// same gateway client id, or the Service is shut down.
func (s *Service) SubscribePush(req *pushv1.GatewaySubscribeRequest, stream grpc.ServerStreamingServer[pushv1.PushEvent]) error {
if req == nil || strings.TrimSpace(req.GetGatewayClientId()) == "" {
return status.Error(codes.InvalidArgument, "gateway_client_id is required")
}
sub, replay, err := s.register(req.GetGatewayClientId(), req.GetCursor())
if err != nil {
return err
}
defer s.unregister(sub)
for _, ev := range replay {
if err := stream.Send(ev); err != nil {
return err
}
}
ctx := stream.Context()
for {
select {
case <-ctx.Done():
return nil
case <-sub.done:
return status.Error(codes.Aborted, "push subscription replaced or service stopped")
case ev := <-sub.ch:
if err := stream.Send(ev); err != nil {
return err
}
}
}
}
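The gateway side treats the cursor as an opaque string and echoes the last received value when it reconnects. A minimal sketch of a hypothetical gateway consumer loop (reconnect backoff and fan-out to connected clients omitted):
// Hypothetical gateway consumer; only the pushv1 types are taken as given.
func consumePush(ctx context.Context, client pushv1.PushClient, clientID string) {
	lastCursor := "" // empty cursor means "subscribe from now"
	for ctx.Err() == nil {
		stream, err := client.SubscribePush(ctx, &pushv1.GatewaySubscribeRequest{
			GatewayClientId: clientID,
			Cursor:          lastCursor,
		})
		if err != nil {
			continue // real code would back off here
		}
		for {
			ev, err := stream.Recv()
			if err != nil {
				break // resubscribe with the last received cursor
			}
			lastCursor = ev.Cursor
			// ... forward ev to the connected end-user client ...
		}
	}
}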
+43
@@ -0,0 +1,43 @@
package push
import (
pushv1 "galaxy/backend/proto/push/v1"
)
// subscription is the per-gateway-instance delivery queue. Each
// subscription owns a buffered channel; the publisher writes into it
// without blocking by dropping the oldest queued event when the buffer
// is full. The done channel is closed by the Service when the
// subscription is replaced (a new connection arrived for the same
// gateway_client_id) or when the Service is shutting down.
type subscription struct {
clientID string
ch chan *pushv1.PushEvent
done chan struct{}
dropped uint64
}
// deliver enqueues ev into the subscription's buffer. When the buffer
// is full, the oldest queued event is dropped to make room and the
// dropped counter increments. The bool reports whether a drop occurred,
// so the publisher can update its drop metric.
//
// The Service holds its mutex while calling deliver, which means at
// most one publisher writes to ch at a time. The reader goroutine runs
// independently and only consumes from ch, so the second send below is
// guaranteed not to block: after evicting the head, the channel has at
// least one free slot which no other publisher can fill.
func (s *subscription) deliver(ev *pushv1.PushEvent) bool {
select {
case s.ch <- ev:
return false
default:
}
select {
case <-s.ch:
default:
}
s.ch <- ev
s.dropped++
return true
}