feat: edge gateway service
This commit is contained in:
@@ -0,0 +1,102 @@
|
||||
// Package telemetry provides shared edge observability helpers used by the
|
||||
// gateway transports and internal event consumers.
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"google.golang.org/grpc/codes"
|
||||
)
|
||||
|
||||
// EdgeOutcome names one entry of the stable, low-cardinality outcome
// vocabulary shared by REST, gRPC, push shutdown, and observability backends.
// The string value of each constant is the label value emitted for it.
type EdgeOutcome string

const (
	// Successful completion.
	EdgeOutcomeSuccess EdgeOutcome = "success"

	// Rejections of the request itself.
	EdgeOutcomeMalformedRequest    EdgeOutcome = "malformed_request"
	EdgeOutcomeRequestTooLarge     EdgeOutcome = "request_too_large"
	EdgeOutcomeUnsupportedProtocol EdgeOutcome = "unsupported_protocol"

	// Session, signature, freshness, and replay rejections.
	EdgeOutcomeUnknownSession   EdgeOutcome = "unknown_session"
	EdgeOutcomeRevokedSession   EdgeOutcome = "revoked_session"
	EdgeOutcomeInvalidSignature EdgeOutcome = "invalid_signature"
	EdgeOutcomeStaleRequest     EdgeOutcome = "stale_request"
	EdgeOutcomeReplayDetected   EdgeOutcome = "replay_detected"

	// Rate-limit and policy rejections.
	EdgeOutcomeRateLimited  EdgeOutcome = "rate_limited"
	EdgeOutcomePolicyDenied EdgeOutcome = "policy_denied"

	// Availability and internal failures.
	EdgeOutcomeDownstreamUnavailable EdgeOutcome = "downstream_unavailable"
	EdgeOutcomeBackendUnavailable    EdgeOutcome = "backend_unavailable"
	EdgeOutcomeInternalError         EdgeOutcome = "internal_error"
	EdgeOutcomeGatewayShuttingDown   EdgeOutcome = "gateway_shutting_down"
)
|
||||
|
||||
// RejectReason returns the stable reject reason for outcome. Success does not
|
||||
// produce a reject reason.
|
||||
func RejectReason(outcome EdgeOutcome) string {
|
||||
if outcome == EdgeOutcomeSuccess {
|
||||
return ""
|
||||
}
|
||||
|
||||
return string(outcome)
|
||||
}
|
||||
|
||||
// OutcomeFromPublicErrorCode maps the stable public REST error envelope into
|
||||
// the shared edge-outcome vocabulary.
|
||||
func OutcomeFromPublicErrorCode(statusCode int, code string) EdgeOutcome {
|
||||
switch strings.TrimSpace(code) {
|
||||
case "":
|
||||
if statusCode < http.StatusBadRequest {
|
||||
return EdgeOutcomeSuccess
|
||||
}
|
||||
return EdgeOutcomeInternalError
|
||||
case "invalid_request", "method_not_allowed", "not_found":
|
||||
return EdgeOutcomeMalformedRequest
|
||||
case "request_too_large":
|
||||
return EdgeOutcomeRequestTooLarge
|
||||
case "rate_limited":
|
||||
return EdgeOutcomeRateLimited
|
||||
case "service_unavailable":
|
||||
return EdgeOutcomeBackendUnavailable
|
||||
default:
|
||||
if statusCode >= http.StatusInternalServerError {
|
||||
return EdgeOutcomeInternalError
|
||||
}
|
||||
return EdgeOutcomeMalformedRequest
|
||||
}
|
||||
}
|
||||
|
||||
// OutcomeFromGRPCStatus maps the stable authenticated gRPC reject contract
|
||||
// into the shared edge-outcome vocabulary.
|
||||
func OutcomeFromGRPCStatus(code codes.Code, message string) EdgeOutcome {
|
||||
switch {
|
||||
case code == codes.OK:
|
||||
return EdgeOutcomeSuccess
|
||||
case code == codes.InvalidArgument:
|
||||
return EdgeOutcomeMalformedRequest
|
||||
case code == codes.FailedPrecondition && strings.Contains(message, "unsupported protocol_version"):
|
||||
return EdgeOutcomeUnsupportedProtocol
|
||||
case code == codes.Unauthenticated && message == "unknown device session":
|
||||
return EdgeOutcomeUnknownSession
|
||||
case code == codes.FailedPrecondition && message == "device session is revoked":
|
||||
return EdgeOutcomeRevokedSession
|
||||
case code == codes.Unauthenticated && message == "invalid request signature":
|
||||
return EdgeOutcomeInvalidSignature
|
||||
case code == codes.FailedPrecondition && message == "request timestamp is outside the freshness window":
|
||||
return EdgeOutcomeStaleRequest
|
||||
case code == codes.FailedPrecondition && message == "request replay detected":
|
||||
return EdgeOutcomeReplayDetected
|
||||
case code == codes.ResourceExhausted && message == "authenticated request rate limit exceeded":
|
||||
return EdgeOutcomeRateLimited
|
||||
case code == codes.PermissionDenied && message == "authenticated request rejected by edge policy":
|
||||
return EdgeOutcomePolicyDenied
|
||||
case code == codes.Unavailable && message == "downstream service is unavailable":
|
||||
return EdgeOutcomeDownstreamUnavailable
|
||||
case code == codes.Unavailable && message == "gateway is shutting down":
|
||||
return EdgeOutcomeGatewayShuttingDown
|
||||
case code == codes.Unavailable:
|
||||
return EdgeOutcomeBackendUnavailable
|
||||
default:
|
||||
return EdgeOutcomeInternalError
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
"galaxy/gateway/internal/push"
|
||||
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
// PushObserver adapts Runtime to the push.Observer interface.
|
||||
type PushObserver struct {
|
||||
runtime *Runtime
|
||||
}
|
||||
|
||||
// NewPushObserver constructs a push stream observer backed by runtime.
|
||||
func NewPushObserver(runtime *Runtime) *PushObserver {
|
||||
return &PushObserver{runtime: runtime}
|
||||
}
|
||||
|
||||
// Registered records one active push stream.
|
||||
func (o *PushObserver) Registered(_ push.StreamBinding) {
|
||||
if o == nil || o.runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
o.runtime.AddActivePushStream(context.Background(), 1)
|
||||
}
|
||||
|
||||
// Unregistered records one active-stream decrement and one closure reason for
|
||||
// hub-enforced shutdown, overflow, or revocation.
|
||||
func (o *PushObserver) Unregistered(_ push.StreamBinding, err error) {
|
||||
if o == nil || o.runtime == nil {
|
||||
return
|
||||
}
|
||||
|
||||
o.runtime.AddActivePushStream(context.Background(), -1)
|
||||
|
||||
switch {
|
||||
case errors.Is(err, push.ErrSubscriptionOverflow):
|
||||
o.runtime.RecordPushStreamClosure(context.Background(), attribute.String("reason", "overflow"))
|
||||
case errors.Is(err, push.ErrSubscriptionRevoked):
|
||||
o.runtime.RecordPushStreamClosure(context.Background(), attribute.String("reason", "revoked"))
|
||||
case errors.Is(err, push.ErrHubShuttingDown):
|
||||
o.runtime.RecordPushStreamClosure(context.Background(), attribute.String("reason", "shutdown"))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,254 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
|
||||
otelprom "go.opentelemetry.io/otel/exporters/prometheus"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// defaultServiceName is the service.name resource attribute used when
// OTEL_SERVICE_NAME is unset or blank.
const defaultServiceName = "galaxy-edge-gateway"
|
||||
|
||||
// Runtime owns the shared OpenTelemetry providers, the Prometheus metrics
|
||||
// handler, and the custom low-cardinality edge instruments.
|
||||
type Runtime struct {
|
||||
logger *zap.Logger
|
||||
|
||||
tracerProvider *sdktrace.TracerProvider
|
||||
meterProvider *sdkmetric.MeterProvider
|
||||
promHandler http.Handler
|
||||
|
||||
// Public REST instruments.
|
||||
publicRequests metric.Int64Counter
|
||||
publicDuration metric.Float64Histogram
|
||||
|
||||
// Authenticated gRPC instruments.
|
||||
grpcRequests metric.Int64Counter
|
||||
grpcDuration metric.Float64Histogram
|
||||
|
||||
// Push instruments.
|
||||
pushActiveStreams metric.Int64UpDownCounter
|
||||
pushStreamClosers metric.Int64Counter
|
||||
|
||||
// Internal event consumer instruments.
|
||||
internalEventDrops metric.Int64Counter
|
||||
}
|
||||
|
||||
// New constructs the gateway telemetry runtime, registers global providers,
|
||||
// and returns the Prometheus handler used by the admin listener.
|
||||
func New(ctx context.Context, logger *zap.Logger) (*Runtime, error) {
|
||||
if logger == nil {
|
||||
logger = zap.NewNop()
|
||||
}
|
||||
|
||||
serviceName := strings.TrimSpace(os.Getenv("OTEL_SERVICE_NAME"))
|
||||
if serviceName == "" {
|
||||
serviceName = defaultServiceName
|
||||
}
|
||||
|
||||
res, err := resource.New(
|
||||
ctx,
|
||||
resource.WithAttributes(attribute.String("service.name", serviceName)),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tracerProvider, err := newTracerProvider(ctx, res)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
registry := prometheus.NewRegistry()
|
||||
exporter, err := otelprom.New(otelprom.WithRegisterer(registry))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
meterProvider := sdkmetric.NewMeterProvider(
|
||||
sdkmetric.WithResource(res),
|
||||
sdkmetric.WithReader(exporter),
|
||||
)
|
||||
|
||||
otel.SetTracerProvider(tracerProvider)
|
||||
otel.SetMeterProvider(meterProvider)
|
||||
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
|
||||
propagation.TraceContext{},
|
||||
propagation.Baggage{},
|
||||
))
|
||||
|
||||
meter := meterProvider.Meter("galaxy/gateway")
|
||||
|
||||
publicRequests, err := meter.Int64Counter("gateway.public_http.requests")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
publicDuration, err := meter.Float64Histogram("gateway.public_http.duration", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
grpcRequests, err := meter.Int64Counter("gateway.authenticated_grpc.requests")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
grpcDuration, err := meter.Float64Histogram("gateway.authenticated_grpc.duration", metric.WithUnit("ms"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pushActiveStreams, err := meter.Int64UpDownCounter("gateway.push.active_streams")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pushStreamClosers, err := meter.Int64Counter("gateway.push.stream_closures")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
internalEventDrops, err := meter.Int64Counter("gateway.internal_event_drops")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &Runtime{
|
||||
logger: logger,
|
||||
tracerProvider: tracerProvider,
|
||||
meterProvider: meterProvider,
|
||||
promHandler: promhttp.HandlerFor(registry, promhttp.HandlerOpts{}),
|
||||
publicRequests: publicRequests,
|
||||
publicDuration: publicDuration,
|
||||
grpcRequests: grpcRequests,
|
||||
grpcDuration: grpcDuration,
|
||||
pushActiveStreams: pushActiveStreams,
|
||||
pushStreamClosers: pushStreamClosers,
|
||||
internalEventDrops: internalEventDrops,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handler returns the Prometheus handler that should be mounted on the admin
|
||||
// listener.
|
||||
func (r *Runtime) Handler() http.Handler {
|
||||
if r == nil || r.promHandler == nil {
|
||||
return http.NotFoundHandler()
|
||||
}
|
||||
|
||||
return r.promHandler
|
||||
}
|
||||
|
||||
// Shutdown flushes the configured telemetry providers.
|
||||
func (r *Runtime) Shutdown(ctx context.Context) error {
|
||||
if r == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var shutdownErr error
|
||||
if r.meterProvider != nil {
|
||||
shutdownErr = errors.Join(shutdownErr, r.meterProvider.Shutdown(ctx))
|
||||
}
|
||||
if r.tracerProvider != nil {
|
||||
shutdownErr = errors.Join(shutdownErr, r.tracerProvider.Shutdown(ctx))
|
||||
}
|
||||
|
||||
return shutdownErr
|
||||
}
|
||||
|
||||
// RecordPublicRequest records one public REST request outcome.
|
||||
func (r *Runtime) RecordPublicRequest(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
|
||||
options := metric.WithAttributes(attrs...)
|
||||
r.publicRequests.Add(ctx, 1, options)
|
||||
r.publicDuration.Record(ctx, duration.Seconds()*1000, options)
|
||||
}
|
||||
|
||||
// RecordAuthenticatedGRPC records one authenticated gRPC request or stream
|
||||
// outcome.
|
||||
func (r *Runtime) RecordAuthenticatedGRPC(ctx context.Context, attrs []attribute.KeyValue, duration time.Duration) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
|
||||
options := metric.WithAttributes(attrs...)
|
||||
r.grpcRequests.Add(ctx, 1, options)
|
||||
r.grpcDuration.Record(ctx, duration.Seconds()*1000, options)
|
||||
}
|
||||
|
||||
// AddActivePushStream records one active-stream delta.
|
||||
func (r *Runtime) AddActivePushStream(ctx context.Context, delta int64, attrs ...attribute.KeyValue) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
|
||||
r.pushActiveStreams.Add(ctx, delta, metric.WithAttributes(attrs...))
|
||||
}
|
||||
|
||||
// RecordPushStreamClosure records one push-stream closure reason.
|
||||
func (r *Runtime) RecordPushStreamClosure(ctx context.Context, attrs ...attribute.KeyValue) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
|
||||
r.pushStreamClosers.Add(ctx, 1, metric.WithAttributes(attrs...))
|
||||
}
|
||||
|
||||
// RecordInternalEventDrop records one malformed or rejected internal event.
|
||||
func (r *Runtime) RecordInternalEventDrop(ctx context.Context, attrs ...attribute.KeyValue) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
|
||||
r.internalEventDrops.Add(ctx, 1, metric.WithAttributes(attrs...))
|
||||
}
|
||||
|
||||
func newTracerProvider(ctx context.Context, res *resource.Resource) (*sdktrace.TracerProvider, error) {
|
||||
exporterName := strings.TrimSpace(os.Getenv("OTEL_TRACES_EXPORTER"))
|
||||
if exporterName == "" || exporterName == "none" {
|
||||
return sdktrace.NewTracerProvider(sdktrace.WithResource(res)), nil
|
||||
}
|
||||
|
||||
if exporterName != "otlp" {
|
||||
return nil, errors.New("unsupported OTEL_TRACES_EXPORTER value")
|
||||
}
|
||||
|
||||
protocol := strings.TrimSpace(os.Getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"))
|
||||
if protocol == "" {
|
||||
protocol = strings.TrimSpace(os.Getenv("OTEL_EXPORTER_OTLP_PROTOCOL"))
|
||||
}
|
||||
|
||||
var (
|
||||
exporter sdktrace.SpanExporter
|
||||
err error
|
||||
)
|
||||
switch protocol {
|
||||
case "", "http/protobuf":
|
||||
exporter, err = otlptracehttp.New(ctx)
|
||||
case "grpc":
|
||||
exporter, err = otlptracegrpc.New(ctx)
|
||||
default:
|
||||
return nil, errors.New("unsupported OTEL exporter protocol")
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return sdktrace.NewTracerProvider(
|
||||
sdktrace.WithBatcher(exporter),
|
||||
sdktrace.WithResource(res),
|
||||
), nil
|
||||
}
|
||||
Reference in New Issue
Block a user