feat: edge gateway service

This commit is contained in:
Ilia Denisov
2026-04-02 19:18:42 +02:00
committed by GitHub
parent 8cde99936c
commit 436c97a38b
95 changed files with 20504 additions and 57 deletions
+102
View File
@@ -0,0 +1,102 @@
// Package telemetry provides shared edge observability helpers used by the
// gateway transports and internal event consumers.
package telemetry
import (
"net/http"
"strings"
"google.golang.org/grpc/codes"
)
// EdgeOutcome is the stable low-cardinality outcome vocabulary shared by REST,
// gRPC, push shutdown, and observability backends.
//
// The underlying string of each value is the form returned by RejectReason
// for every outcome other than EdgeOutcomeSuccess.
type EdgeOutcome string
// Known edge outcomes. The notes on each value list the inputs that
// OutcomeFromPublicErrorCode and OutcomeFromGRPCStatus translate into it.
const (
// EdgeOutcomeSuccess: gRPC OK, or an empty public error code with a status
// below 400.
EdgeOutcomeSuccess EdgeOutcome = "success"
// EdgeOutcomeMalformedRequest: gRPC InvalidArgument, the public codes
// "invalid_request", "method_not_allowed" and "not_found", and any
// unrecognized public code with a status below 500.
EdgeOutcomeMalformedRequest EdgeOutcome = "malformed_request"
// EdgeOutcomeRequestTooLarge: public code "request_too_large".
EdgeOutcomeRequestTooLarge EdgeOutcome = "request_too_large"
// EdgeOutcomeUnsupportedProtocol: gRPC FailedPrecondition whose message
// mentions "unsupported protocol_version".
EdgeOutcomeUnsupportedProtocol EdgeOutcome = "unsupported_protocol"
// EdgeOutcomeUnknownSession: gRPC Unauthenticated, "unknown device session".
EdgeOutcomeUnknownSession EdgeOutcome = "unknown_session"
// EdgeOutcomeRevokedSession: gRPC FailedPrecondition, "device session is revoked".
EdgeOutcomeRevokedSession EdgeOutcome = "revoked_session"
// EdgeOutcomeInvalidSignature: gRPC Unauthenticated, "invalid request signature".
EdgeOutcomeInvalidSignature EdgeOutcome = "invalid_signature"
// EdgeOutcomeStaleRequest: gRPC FailedPrecondition, request timestamp outside
// the freshness window.
EdgeOutcomeStaleRequest EdgeOutcome = "stale_request"
// EdgeOutcomeReplayDetected: gRPC FailedPrecondition, "request replay detected".
EdgeOutcomeReplayDetected EdgeOutcome = "replay_detected"
// EdgeOutcomeRateLimited: public code "rate_limited", or gRPC
// ResourceExhausted with the authenticated rate-limit message.
EdgeOutcomeRateLimited EdgeOutcome = "rate_limited"
// EdgeOutcomePolicyDenied: gRPC PermissionDenied with the edge-policy message.
EdgeOutcomePolicyDenied EdgeOutcome = "policy_denied"
// EdgeOutcomeDownstreamUnavailable: gRPC Unavailable, "downstream service is
// unavailable".
EdgeOutcomeDownstreamUnavailable EdgeOutcome = "downstream_unavailable"
// EdgeOutcomeBackendUnavailable: public code "service_unavailable", or any
// gRPC Unavailable status without a more specific message.
EdgeOutcomeBackendUnavailable EdgeOutcome = "backend_unavailable"
// EdgeOutcomeInternalError: the fallback for everything the mappers do not
// recognize, including an empty public code with a status of 400 or above.
EdgeOutcomeInternalError EdgeOutcome = "internal_error"
// EdgeOutcomeGatewayShuttingDown: gRPC Unavailable, "gateway is shutting down".
EdgeOutcomeGatewayShuttingDown EdgeOutcome = "gateway_shutting_down"
)
// RejectReason converts outcome into its stable reject-reason string.
//
// EdgeOutcomeSuccess yields the empty string because a successful request has
// nothing to reject; every other outcome yields its underlying string value
// unchanged.
func RejectReason(outcome EdgeOutcome) string {
	switch outcome {
	case EdgeOutcomeSuccess:
		return ""
	default:
		return string(outcome)
	}
}
// OutcomeFromPublicErrorCode translates a public REST response, identified by
// its HTTP status and the error code from the public error envelope, into the
// shared edge-outcome vocabulary.
//
// Surrounding whitespace in code is ignored. An empty code is a success when
// statusCode is below 400 and an internal error otherwise. Codes that are not
// part of the known set fall back on statusCode alone: 5xx becomes an internal
// error, anything lower a malformed request.
func OutcomeFromPublicErrorCode(statusCode int, code string) EdgeOutcome {
	normalized := strings.TrimSpace(code)

	// No error code at all: the status alone decides between success and failure.
	if normalized == "" {
		if statusCode >= http.StatusBadRequest {
			return EdgeOutcomeInternalError
		}
		return EdgeOutcomeSuccess
	}

	switch normalized {
	case "invalid_request", "method_not_allowed", "not_found":
		return EdgeOutcomeMalformedRequest
	case "request_too_large":
		return EdgeOutcomeRequestTooLarge
	case "rate_limited":
		return EdgeOutcomeRateLimited
	case "service_unavailable":
		return EdgeOutcomeBackendUnavailable
	}

	// Unknown code: classify by status class only.
	if statusCode < http.StatusInternalServerError {
		return EdgeOutcomeMalformedRequest
	}
	return EdgeOutcomeInternalError
}
// OutcomeFromGRPCStatus translates a gRPC status code and message from the
// authenticated reject contract into the shared edge-outcome vocabulary.
//
// OK and InvalidArgument are classified by code alone. Every other recognized
// code is refined by its message: an unsupported protocol version is detected
// by substring, all remaining messages must match exactly. An Unavailable
// status with an unrecognized message is a backend outage; any other
// unrecognized combination is an internal error.
func OutcomeFromGRPCStatus(code codes.Code, message string) EdgeOutcome {
	switch code {
	case codes.OK:
		return EdgeOutcomeSuccess

	case codes.InvalidArgument:
		return EdgeOutcomeMalformedRequest

	case codes.FailedPrecondition:
		// The protocol-version message embeds variable detail, so it is the
		// only one matched by substring.
		if strings.Contains(message, "unsupported protocol_version") {
			return EdgeOutcomeUnsupportedProtocol
		}
		switch message {
		case "device session is revoked":
			return EdgeOutcomeRevokedSession
		case "request timestamp is outside the freshness window":
			return EdgeOutcomeStaleRequest
		case "request replay detected":
			return EdgeOutcomeReplayDetected
		}

	case codes.Unauthenticated:
		switch message {
		case "unknown device session":
			return EdgeOutcomeUnknownSession
		case "invalid request signature":
			return EdgeOutcomeInvalidSignature
		}

	case codes.ResourceExhausted:
		if message == "authenticated request rate limit exceeded" {
			return EdgeOutcomeRateLimited
		}

	case codes.PermissionDenied:
		if message == "authenticated request rejected by edge policy" {
			return EdgeOutcomePolicyDenied
		}

	case codes.Unavailable:
		switch message {
		case "downstream service is unavailable":
			return EdgeOutcomeDownstreamUnavailable
		case "gateway is shutting down":
			return EdgeOutcomeGatewayShuttingDown
		default:
			return EdgeOutcomeBackendUnavailable
		}
	}

	// Unknown code, or a known code carrying a message outside the contract.
	return EdgeOutcomeInternalError
}
+48
View File
@@ -0,0 +1,48 @@
package telemetry
import (
"context"
"errors"
"galaxy/gateway/internal/push"
"go.opentelemetry.io/otel/attribute"
)
// PushObserver adapts Runtime to the push.Observer interface.
//
// Its methods are no-ops when the receiver or the wrapped runtime is nil.
type PushObserver struct {
// runtime receives the active-stream deltas and closure reasons; may be nil.
runtime *Runtime
}
// NewPushObserver returns a push stream observer that forwards stream
// lifecycle events to runtime. runtime is stored as given, including nil.
func NewPushObserver(runtime *Runtime) *PushObserver {
	observer := new(PushObserver)
	observer.runtime = runtime
	return observer
}
// Registered increments the active push stream count by one. The binding is
// ignored. Nothing is recorded when the observer or its runtime is nil.
func (o *PushObserver) Registered(_ push.StreamBinding) {
	if o != nil && o.runtime != nil {
		o.runtime.AddActivePushStream(context.Background(), 1)
	}
}
// Unregistered decrements the active push stream count by one and, when err
// identifies a hub-enforced closure, records the closure under a "reason"
// attribute: "overflow", "revoked", or "shutdown". Any other err, including
// nil, only decrements the active count. Nothing is recorded when the observer
// or its runtime is nil.
func (o *PushObserver) Unregistered(_ push.StreamBinding, err error) {
	if o == nil || o.runtime == nil {
		return
	}

	ctx := context.Background()
	o.runtime.AddActivePushStream(ctx, -1)

	var reason string
	switch {
	case errors.Is(err, push.ErrSubscriptionOverflow):
		reason = "overflow"
	case errors.Is(err, push.ErrSubscriptionRevoked):
		reason = "revoked"
	case errors.Is(err, push.ErrHubShuttingDown):
		reason = "shutdown"
	default:
		// Not a hub-enforced closure: no closure reason is recorded.
		return
	}
	o.runtime.RecordPushStreamClosure(ctx, attribute.String("reason", reason))
}
+254
View File
@@ -0,0 +1,254 @@
package telemetry
import (
"context"
"errors"
"net/http"
"os"
"strings"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
otelprom "go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
"go.uber.org/zap"
)
// defaultServiceName is the service.name resource attribute used by New when
// OTEL_SERVICE_NAME is unset or blank.
const defaultServiceName = "galaxy-edge-gateway"
// Runtime owns the shared OpenTelemetry providers, the Prometheus metrics
// handler, and the custom low-cardinality edge instruments.
//
// Methods guard against a nil *Runtime. The Record/Add methods do not guard
// against nil instruments, so values should be obtained from New.
type Runtime struct {
// logger is the logger passed to New, or a no-op logger when nil was given.
logger *zap.Logger
// tracerProvider and meterProvider are shut down by Shutdown.
tracerProvider *sdktrace.TracerProvider
meterProvider *sdkmetric.MeterProvider
// promHandler serves the Prometheus registry backing meterProvider.
promHandler http.Handler
// Public REST instruments.
// Created as gateway.public_http.requests and gateway.public_http.duration (ms).
publicRequests metric.Int64Counter
publicDuration metric.Float64Histogram
// Authenticated gRPC instruments.
// Created as gateway.authenticated_grpc.requests and
// gateway.authenticated_grpc.duration (ms).
grpcRequests metric.Int64Counter
grpcDuration metric.Float64Histogram
// Push instruments.
// Created as gateway.push.active_streams and gateway.push.stream_closures.
pushActiveStreams metric.Int64UpDownCounter
pushStreamClosers metric.Int64Counter
// Internal event consumer instruments.
// Created as gateway.internal_event_drops.
internalEventDrops metric.Int64Counter
}
// New constructs the gateway telemetry runtime and installs its providers as
// the global OpenTelemetry tracer provider, meter provider, and propagator
// (W3C trace context plus baggage).
//
// The service.name resource attribute comes from OTEL_SERVICE_NAME and falls
// back to defaultServiceName when that variable is unset or blank. Metrics are
// exported through a private Prometheus registry whose handler is available
// from the returned Runtime's Handler method. A nil logger is replaced by a
// no-op logger.
//
// On failure New returns a nil Runtime and a non-nil error. Providers that
// were already built are shut down before returning so that exporters do not
// leak, and the global OpenTelemetry state is left untouched; any shutdown
// error is joined onto the original cause.
func New(ctx context.Context, logger *zap.Logger) (*Runtime, error) {
	if logger == nil {
		logger = zap.NewNop()
	}

	serviceName := strings.TrimSpace(os.Getenv("OTEL_SERVICE_NAME"))
	if serviceName == "" {
		serviceName = defaultServiceName
	}

	res, err := resource.New(
		ctx,
		resource.WithAttributes(attribute.String("service.name", serviceName)),
	)
	if err != nil {
		return nil, err
	}

	tracerProvider, err := newTracerProvider(ctx, res)
	if err != nil {
		return nil, err
	}

	registry := prometheus.NewRegistry()
	exporter, err := otelprom.New(otelprom.WithRegisterer(registry))
	if err != nil {
		// The tracer provider may already own a batching OTLP exporter.
		return nil, errors.Join(err, tracerProvider.Shutdown(ctx))
	}

	meterProvider := sdkmetric.NewMeterProvider(
		sdkmetric.WithResource(res),
		sdkmetric.WithReader(exporter),
	)

	// abort releases both providers when instrument creation fails.
	abort := func(cause error) (*Runtime, error) {
		return nil, errors.Join(
			cause,
			meterProvider.Shutdown(ctx),
			tracerProvider.Shutdown(ctx),
		)
	}

	meter := meterProvider.Meter("galaxy/gateway")

	publicRequests, err := meter.Int64Counter("gateway.public_http.requests")
	if err != nil {
		return abort(err)
	}
	publicDuration, err := meter.Float64Histogram("gateway.public_http.duration", metric.WithUnit("ms"))
	if err != nil {
		return abort(err)
	}
	grpcRequests, err := meter.Int64Counter("gateway.authenticated_grpc.requests")
	if err != nil {
		return abort(err)
	}
	grpcDuration, err := meter.Float64Histogram("gateway.authenticated_grpc.duration", metric.WithUnit("ms"))
	if err != nil {
		return abort(err)
	}
	pushActiveStreams, err := meter.Int64UpDownCounter("gateway.push.active_streams")
	if err != nil {
		return abort(err)
	}
	pushStreamClosers, err := meter.Int64Counter("gateway.push.stream_closures")
	if err != nil {
		return abort(err)
	}
	internalEventDrops, err := meter.Int64Counter("gateway.internal_event_drops")
	if err != nil {
		return abort(err)
	}

	// Publish globally only once nothing can fail anymore, so a failed New never
	// leaves the process-wide state pointing at providers the caller never got.
	otel.SetTracerProvider(tracerProvider)
	otel.SetMeterProvider(meterProvider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	return &Runtime{
		logger:             logger,
		tracerProvider:     tracerProvider,
		meterProvider:      meterProvider,
		promHandler:        promhttp.HandlerFor(registry, promhttp.HandlerOpts{}),
		publicRequests:     publicRequests,
		publicDuration:     publicDuration,
		grpcRequests:       grpcRequests,
		grpcDuration:       grpcDuration,
		pushActiveStreams:  pushActiveStreams,
		pushStreamClosers:  pushStreamClosers,
		internalEventDrops: internalEventDrops,
	}, nil
}
// Handler returns the Prometheus scrape handler intended for the admin
// listener. A nil runtime, or one without a handler, yields a handler that
// answers every request with 404 so callers can mount the result
// unconditionally.
func (r *Runtime) Handler() http.Handler {
	if r != nil && r.promHandler != nil {
		return r.promHandler
	}
	return http.NotFoundHandler()
}
// Shutdown shuts down the meter provider and then the tracer provider, using
// ctx for both calls. Both shutdowns are always attempted; their errors are
// joined into the returned error. A nil runtime, or one with no providers,
// returns nil.
func (r *Runtime) Shutdown(ctx context.Context) error {
	if r == nil {
		return nil
	}

	var combined error
	if provider := r.meterProvider; provider != nil {
		combined = errors.Join(combined, provider.Shutdown(ctx))
	}
	if provider := r.tracerProvider; provider != nil {
		combined = errors.Join(combined, provider.Shutdown(ctx))
	}
	return combined
}
// RecordPublicRequest counts one public REST request and records its duration
// in milliseconds, both tagged with attrs. It is a no-op on a nil runtime.
func (r *Runtime) RecordPublicRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if r == nil {
		return
	}

	// The histogram is declared with unit "ms", hence the conversion.
	elapsedMillis := duration.Seconds() * 1000
	withAttrs := metric.WithAttributes(attrs...)

	r.publicRequests.Add(ctx, 1, withAttrs)
	r.publicDuration.Record(ctx, elapsedMillis, withAttrs)
}
// RecordAuthenticatedGRPC counts one authenticated gRPC request or stream and
// records its duration in milliseconds, both tagged with attrs. It is a no-op
// on a nil runtime.
func (r *Runtime) RecordAuthenticatedGRPC(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
	if r == nil {
		return
	}

	// The histogram is declared with unit "ms", hence the conversion.
	elapsedMillis := duration.Seconds() * 1000
	withAttrs := metric.WithAttributes(attrs...)

	r.grpcRequests.Add(ctx, 1, withAttrs)
	r.grpcDuration.Record(ctx, elapsedMillis, withAttrs)
}
// AddActivePushStream applies delta to the active push stream up/down counter,
// tagged with attrs. It is a no-op on a nil runtime.
func (r *Runtime) AddActivePushStream(ctx context.Context, delta int64, attrs ...attribute.KeyValue) {
	if r != nil {
		withAttrs := metric.WithAttributes(attrs...)
		r.pushActiveStreams.Add(ctx, delta, withAttrs)
	}
}
// RecordPushStreamClosure counts one push stream closure, tagged with attrs
// (callers in this package pass the closure reason). It is a no-op on a nil
// runtime.
func (r *Runtime) RecordPushStreamClosure(ctx context.Context, attrs ...attribute.KeyValue) {
	if r != nil {
		withAttrs := metric.WithAttributes(attrs...)
		r.pushStreamClosers.Add(ctx, 1, withAttrs)
	}
}
// RecordInternalEventDrop counts one malformed or rejected internal event,
// tagged with attrs. It is a no-op on a nil runtime.
func (r *Runtime) RecordInternalEventDrop(ctx context.Context, attrs ...attribute.KeyValue) {
	if r != nil {
		withAttrs := metric.WithAttributes(attrs...)
		r.internalEventDrops.Add(ctx, 1, withAttrs)
	}
}
// newTracerProvider builds the tracer provider for res from the environment.
//
// OTEL_TRACES_EXPORTER selects the exporter: unset, blank, or "none" yields a
// provider without any exporter; "otlp" yields a batching OTLP exporter; any
// other value is an error. For "otlp" the transport is taken from
// OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, falling back to
// OTEL_EXPORTER_OTLP_PROTOCOL: unset or "http/protobuf" selects OTLP over
// HTTP, "grpc" selects OTLP over gRPC, and any other value is an error.
func newTracerProvider(ctx context.Context, res *resource.Resource) (*sdktrace.TracerProvider, error) {
	switch strings.TrimSpace(os.Getenv("OTEL_TRACES_EXPORTER")) {
	case "", "none":
		return sdktrace.NewTracerProvider(sdktrace.WithResource(res)), nil
	case "otlp":
		// Handled below.
	default:
		return nil, errors.New("unsupported OTEL_TRACES_EXPORTER value")
	}

	// The traces-specific protocol variable wins over the generic one.
	protocol := strings.TrimSpace(os.Getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"))
	if protocol == "" {
		protocol = strings.TrimSpace(os.Getenv("OTEL_EXPORTER_OTLP_PROTOCOL"))
	}

	var (
		spanExporter sdktrace.SpanExporter
		buildErr     error
	)
	switch protocol {
	case "grpc":
		spanExporter, buildErr = otlptracegrpc.New(ctx)
	case "", "http/protobuf":
		spanExporter, buildErr = otlptracehttp.New(ctx)
	default:
		return nil, errors.New("unsupported OTEL exporter protocol")
	}
	if buildErr != nil {
		return nil, buildErr
	}

	provider := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(spanExporter),
		sdktrace.WithResource(res),
	)
	return provider, nil
}