// scrabble-game/gateway/cmd/gateway/main.go

// Command gateway is the Scrabble platform's only public ingress. It terminates
// the client's Connect-RPC/FlatBuffers traffic over h2c, validates platform /
// email / guest credentials and mints opaque sessions, rate-limits, injects
// X-User-ID when forwarding to the backend over REST, and bridges the backend's
// gRPC push stream to each client's in-app live channel. It also serves the
// backend's admin console at /_gm on the public listener behind HTTP Basic-Auth.
package main

import (
	"context"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os/signal"
	"syscall"
	"time"

	"go.uber.org/zap"

	"scrabble/gateway/internal/admin"
	"scrabble/gateway/internal/backendclient"
	"scrabble/gateway/internal/config"
	"scrabble/gateway/internal/connector"
	"scrabble/gateway/internal/connectsrv"
	"scrabble/gateway/internal/push"
	"scrabble/gateway/internal/ratelimit"
	"scrabble/gateway/internal/session"
	"scrabble/gateway/internal/transcode"
	pkgtel "scrabble/pkg/telemetry"
)

// Process-wide tuning constants for the gateway binary.
const (
	// shutdownTimeout bounds the graceful HTTP shutdown of all listeners; it is
	// the deadline of the context handed to http.Server.Shutdown.
	shutdownTimeout = 10 * time.Second
	// telemetryShutdownTimeout bounds the OpenTelemetry flush during process exit.
	telemetryShutdownTimeout = 5 * time.Second
	// pushReconnectDelay is the pause before re-subscribing to the backend push
	// stream, both after a failed subscribe and after an open stream ends.
	pushReconnectDelay = 2 * time.Second
	// gatewayID identifies this gateway instance to the backend push channel.
	// NOTE(review): the value is a fixed literal, so every gateway process
	// subscribes under the same id — confirm the backend does not need it to be
	// unique when more than one gateway runs.
	gatewayID = "gateway"
	// readHeaderTimeout bounds reading one request's headers on the public
	// listener (a slowloris guard). Bodies and long-lived streams are governed by
	// the h2c settings in connectsrv — Read/WriteTimeout stay unset on purpose,
	// they would kill the Subscribe stream (R3).
	readHeaderTimeout = 10 * time.Second
	// throttleReportInterval is the cadence of the rate-limiter rejection
	// summary: the Warn log per throttled key and the report to the backend (R3).
	// It is also reported as the window length, in whole seconds.
	throttleReportInterval = 30 * time.Second
)

// main loads the configuration, builds the logger and runs the gateway until
// SIGINT or SIGTERM is received. Failures that happen before the structured
// logger exists are reported through the standard library logger; an error
// returned by run is reported through the zap logger at Fatal level.
func main() {
	cfg, loadErr := config.Load()
	if loadErr != nil {
		log.Fatalf("gateway: load config: %v", loadErr)
	}

	logger, buildErr := newLogger(cfg.LogLevel)
	if buildErr != nil {
		log.Fatalf("gateway: build logger: %v", buildErr)
	}
	// Best-effort flush of buffered log entries on a clean return; the Sync
	// error is deliberately ignored.
	defer func() { _ = logger.Sync() }()

	// sigCtx is cancelled by the first SIGINT/SIGTERM, which starts the
	// graceful shutdown inside run.
	sigCtx, release := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer release()

	runErr := run(sigCtx, cfg, logger)
	if runErr != nil {
		logger.Fatal("gateway: terminated", zap.Error(runErr))
	}
}

// run wires the gateway dependencies and serves the public listener (which also
// fronts the admin console at /_gm) until the context is cancelled or the
// listener stops.
//
// Construction failures (telemetry, backend client, connector client, admin
// proxy) and a listener failure are returned wrapped with the step that failed,
// so the "gateway: terminated" log line emitted by main says what went wrong.
// The wrapping uses %w, so errors.Is / errors.As still see the cause. A clean
// shutdown returns nil.
//
// Deferred cleanups run in reverse order of registration: the connector and
// backend clients are closed, telemetry is flushed (bounded by
// telemetryShutdownTimeout), then the derived context is cancelled.
func run(ctx context.Context, cfg config.Config, logger *zap.Logger) error {
	// The derived context lets runServers stop the background goroutines as
	// soon as the listener goes down, not only on a signal.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	tel, err := pkgtel.New(ctx, cfg.Telemetry)
	if err != nil {
		return fmt.Errorf("init telemetry: %w", err)
	}
	defer func() {
		// ctx is already cancelled by the time this runs, so the flush gets its
		// own bounded context.
		shutdownCtx, sc := context.WithTimeout(context.Background(), telemetryShutdownTimeout)
		defer sc()
		if err := tel.Shutdown(shutdownCtx); err != nil {
			logger.Warn("telemetry shutdown", zap.Error(err))
		}
	}()
	// Runtime metrics are optional: a failure is logged and the gateway keeps going.
	if err := tel.StartRuntimeMetrics(); err != nil {
		logger.Warn("telemetry: start runtime metrics", zap.Error(err))
	}

	backend, err := backendclient.New(cfg.BackendHTTPURL, cfg.BackendGRPCAddr, cfg.BackendTimeout)
	if err != nil {
		return fmt.Errorf("create backend client: %w", err)
	}
	defer func() { _ = backend.Close() }()

	sessions := session.NewCache(backend, cfg.SessionTTL, cfg.SessionCacheMax)
	limiter := ratelimit.New()
	tracker := ratelimit.NewTracker()
	// NOTE(review): 0 presumably selects the hub's default buffer size — confirm
	// against push.NewHub.
	hub := push.NewHub(0)

	// The connector is optional. validator is only assigned when the client
	// exists, so the interface value stays truly nil (not a typed nil pointer)
	// when Telegram is disabled.
	var conn *connector.Client
	var validator transcode.TelegramValidator
	if cfg.ConnectorAddr != "" {
		conn, err = connector.New(cfg.ConnectorAddr)
		if err != nil {
			return fmt.Errorf("create connector client: %w", err)
		}
		defer func() { _ = conn.Close() }()
		validator = conn
	} else {
		logger.Warn("telegram disabled (GATEWAY_CONNECTOR_ADDR unset)")
	}

	// The admin console (backend /_gm) is fronted on the public listener behind
	// Basic-Auth, enabled when both credentials are set; it is mounted on the edge
	// mux so the Connect h2c handler stays the top-level handler.
	var adminProxy http.Handler
	if cfg.AdminEnabled() {
		adminProxy, err = admin.NewProxy(cfg.BackendHTTPURL, cfg.AdminUser, cfg.AdminPassword, logger)
		if err != nil {
			return fmt.Errorf("create admin proxy: %w", err)
		}
	} else {
		logger.Info("admin console disabled (set GATEWAY_ADMIN_USER and GATEWAY_ADMIN_PASSWORD)")
	}

	registry := transcode.NewRegistry(backend, validator, cfg.DefaultSupportedLanguages...)
	edge := connectsrv.NewServer(connectsrv.Deps{
		Registry:     registry,
		Sessions:     sessions,
		Limiter:      limiter,
		Tracker:      tracker,
		Hub:          hub,
		RateLimit:    cfg.RateLimit,
		Heartbeat:    cfg.PushHeartbeatInterval,
		Logger:       logger,
		AdminProxy:   adminProxy,
		Meter:        tel.MeterProvider().Meter("scrabble/gateway/edge"),
		MaxBodyBytes: cfg.MaxBodyBytes,
	})

	// Bridge the backend push stream into the fan-out hub (and the out-of-app
	// channel via the connector). Both background goroutines stop when ctx is
	// cancelled; they are not waited for.
	go runPushPump(ctx, backend, hub, conn, logger)
	// Periodically summarise rate-limiter rejections (Warn log + backend report).
	go runThrottleReporter(ctx, tracker, backend, logger)

	public := &http.Server{Addr: cfg.HTTPAddr, Handler: edge.HTTPHandler(), ReadHeaderTimeout: readHeaderTimeout}
	servers := []*namedServer{{name: "public", srv: public}}

	logger.Info("gateway starting",
		zap.String("http_addr", cfg.HTTPAddr),
		zap.String("backend_http", cfg.BackendHTTPURL),
		zap.String("backend_grpc", cfg.BackendGRPCAddr))
	if err := runServers(ctx, cancel, servers, logger); err != nil {
		return fmt.Errorf("serve: %w", err)
	}
	return nil
}

// namedServer pairs an HTTP server with a label for diagnostics.
type namedServer struct {
	// name is the label attached to this listener's log lines.
	name string
	// srv is the server that is started with ListenAndServe and later stopped
	// with Shutdown.
	srv  *http.Server
}

// runServers starts each listener in its own goroutine, then blocks until
// either the context is cancelled or one listener returns. It then calls
// cancel and gracefully shuts every server down under a shared
// shutdownTimeout deadline. The return value is the error of the first
// listener that stopped (nil when the context ended first, or when that
// listener stopped with http.ErrServerClosed); shutdown errors are only logged.
func runServers(ctx context.Context, cancel context.CancelFunc, servers []*namedServer, logger *zap.Logger) error {
	// One buffer slot per listener, so a serving goroutine never blocks on its
	// send even though only the first result is read.
	results := make(chan error, len(servers))

	serve := func(ns *namedServer) {
		logger.Info("listener starting", zap.String("server", ns.name), zap.String("addr", ns.srv.Addr))
		if err := ns.srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
			results <- err
			return
		}
		// A closed server is the normal outcome of Shutdown, not a failure.
		results <- nil
	}
	for i := range servers {
		go serve(servers[i])
	}

	var firstErr error
	select {
	case firstErr = <-results:
	case <-ctx.Done():
	}
	// Stop everything that hangs off the shared context before draining.
	cancel()

	deadline, release := context.WithTimeout(context.Background(), shutdownTimeout)
	defer release()
	for i := range servers {
		ns := servers[i]
		err := ns.srv.Shutdown(deadline)
		if err == nil {
			continue
		}
		logger.Warn("listener shutdown", zap.String("server", ns.name), zap.Error(err))
	}
	return firstErr
}

// runThrottleReporter wakes up every throttleReportInterval, drains the
// rate-limiter rejection tracker, logs one Warn line per drained entry and
// sends the batch to the backend together with the window length in seconds.
// An empty drain is skipped silently. A failed report is logged and dropped;
// nothing is retried. The loop ends when the context is done.
func runThrottleReporter(ctx context.Context, tracker *ratelimit.Tracker, backend *backendclient.Client, logger *zap.Logger) {
	tick := time.NewTicker(throttleReportInterval)
	defer tick.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
			rejected := tracker.Drain()
			if len(rejected) == 0 {
				continue
			}
			for i := range rejected {
				logger.Warn("rate limited",
					zap.String("class", rejected[i].Class),
					zap.String("key", rejected[i].Key),
					zap.Int("rejected", rejected[i].Rejected),
					zap.Duration("window", throttleReportInterval))
			}
			windowSeconds := int(throttleReportInterval.Seconds())
			err := backend.ReportRateLimited(ctx, windowSeconds, rejected)
			if err != nil {
				logger.Warn("rate-limit report failed", zap.Error(err))
			}
		}
	}
}

// runPushPump maintains the backend push subscription for as long as the
// context is live. Each received event is published to the hub; an event of an
// out-of-app kind whose recipient has no subscriber on the hub is additionally
// handed to deliverOutOfApp (skipped entirely when conn is nil). After a failed
// subscribe or an ended stream the pump waits pushReconnectDelay and tries
// again; cancellation during that wait ends the pump.
func runPushPump(ctx context.Context, backend *backendclient.Client, hub *push.Hub, conn *connector.Client, logger *zap.Logger) {
	for {
		if ctx.Err() != nil {
			return
		}

		stream, subErr := backend.SubscribePush(ctx, gatewayID)
		if subErr != nil {
			logger.Warn("push subscribe failed", zap.Error(subErr))
		} else {
			for {
				ev, recvErr := stream.Recv()
				if recvErr != nil {
					// A receive error caused by our own shutdown is expected
					// and not worth a warning.
					if ctx.Err() == nil {
						logger.Warn("push stream ended", zap.Error(recvErr))
					}
					break
				}

				userID, kind := ev.GetUserId(), ev.GetKind()
				hub.Publish(push.Event{
					UserID:  userID,
					Kind:    kind,
					Payload: ev.GetPayload(),
					EventID: ev.GetEventId(),
				})

				// Out-of-app fallback applies only with a connector, for the
				// out-of-app kinds, and when nobody is listening in-app.
				if conn == nil || !connector.OutOfAppKind(kind) || hub.HasSubscribers(userID) {
					continue
				}
				// Delivered on its own goroutine so a slow connector never
				// stalls the in-app firehose.
				go deliverOutOfApp(ctx, backend, conn, userID, kind, ev.GetPayload(), ev.GetLanguage(), logger)
			}
		}

		// Both the failed-subscribe and the ended-stream path back off before
		// the next attempt.
		if !sleep(ctx, pushReconnectDelay) {
			return
		}
	}
}

// deliverOutOfApp looks up the recipient's push target on the backend and, if
// connector.DeliverToTarget accepts the target's external id and in-app-only
// flag, asks the connector to notify that external id. The notification
// language is gameLang when it is non-empty, otherwise the target's own
// language. Delivery is best-effort: a lookup or notify failure is logged at
// Warn level and dropped, and nothing is returned to the caller.
func deliverOutOfApp(ctx context.Context, backend *backendclient.Client, conn *connector.Client, userID, kind string, payload []byte, gameLang string, logger *zap.Logger) {
	target, lookupErr := backend.PushTarget(ctx, userID)
	if lookupErr != nil {
		logger.Warn("push target lookup failed", zap.String("user_id", userID), zap.Error(lookupErr))
		return
	}

	if eligible := connector.DeliverToTarget(target.ExternalID, target.NotificationsInAppOnly); !eligible {
		return
	}

	// A game event carries its own language, so the push comes from the game's
	// bot rather than the recipient's last-login bot (Stage 17); events without
	// one use the language stored on the target.
	lang := gameLang
	if lang == "" {
		lang = target.Language
	}

	_, notifyErr := conn.Notify(ctx, target.ExternalID, kind, payload, lang)
	if notifyErr == nil {
		return
	}
	logger.Warn("out-of-app notify failed", zap.String("kind", kind), zap.Error(notifyErr))
}

// sleep blocks until d has elapsed or ctx is done, whichever happens first.
// It returns true when the full duration elapsed and false when the context
// ended the wait early. The timer is always stopped before returning.
func sleep(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()

	elapsed := false
	select {
	case <-timer.C:
		elapsed = true
	case <-ctx.Done():
	}
	return elapsed
}

// newLogger returns a zap logger built from zap's production configuration
// with its level replaced by the one parsed from level. A level string that
// zap cannot parse yields a nil logger and the parse error.
func newLogger(level string) (*zap.Logger, error) {
	parsed := zap.NewAtomicLevel()
	if err := parsed.UnmarshalText([]byte(level)); err != nil {
		return nil, err
	}

	prod := zap.NewProductionConfig()
	prod.Level = parsed
	return prod.Build()
}