package worker

import (
	"context"
	"crypto/rand"
	"encoding/hex"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"galaxy/notification/internal/adapters/redisstate"
	"galaxy/notification/internal/api/intentstream"
	"galaxy/notification/internal/logging"
	"galaxy/notification/internal/service/acceptintent"
	"galaxy/notification/internal/service/publishpush"
)

const (
	// defaultPushPublisherPollInterval is used when PushPublisherConfig.PollInterval
	// is not positive.
	defaultPushPublisherPollInterval = 100 * time.Millisecond
	// defaultPushPublisherBatchSize is used when PushPublisherConfig.BatchSize is
	// not positive.
	defaultPushPublisherBatchSize = 64

	// pushFailureClassificationPayloadEncoding labels failures returned by the
	// push event encoder.
	pushFailureClassificationPayloadEncoding = "payload_encoding_failed"
	// pushFailureClassificationGatewayStreamWrite labels failures returned while
	// recording the publication of one route.
	pushFailureClassificationGatewayStreamWrite = "gateway_stream_publish_failed"
)

// PushRouteStateStore describes the durable route-state operations required by
// PushPublisher.
type PushRouteStateStore interface {
	// ListDueRoutes loads due scheduled routes.
	ListDueRoutes(context.Context, time.Time, int64) ([]redisstate.ScheduledRoute, error)
	// TryAcquireRouteLease attempts to acquire one temporary route lease.
	TryAcquireRouteLease(context.Context, string, string, string, time.Duration) (bool, error)
	// ReleaseRouteLease best-effort releases one temporary route lease.
	ReleaseRouteLease(context.Context, string, string, string) error
	// GetNotification loads one accepted notification.
	GetNotification(context.Context, string) (acceptintent.NotificationRecord, bool, error)
	// GetRoute loads one accepted notification route.
	GetRoute(context.Context, string, string) (acceptintent.NotificationRoute, bool, error)
	// CompleteRoutePublished records one successful publication.
	CompleteRoutePublished(context.Context, redisstate.CompleteRoutePublishedInput) error
	// CompleteRouteFailed records one retryable publication failure.
	CompleteRouteFailed(context.Context, redisstate.CompleteRouteFailedInput) error
	// CompleteRouteDeadLetter records one exhausted publication failure.
	CompleteRouteDeadLetter(context.Context, redisstate.CompleteRouteDeadLetterInput) error
}

// PushEventEncoder encodes one push-capable notification route into a
// Gateway-compatible client event.
type PushEventEncoder interface {
	// Encode converts notification plus route to one outbound event.
	Encode(acceptintent.NotificationRecord, acceptintent.NotificationRoute) (publishpush.Event, error)
}

// RoutePublisherTelemetry records low-cardinality route publication outcomes.
type RoutePublisherTelemetry interface {
	// RecordRoutePublishAttempt records one route publication attempt outcome.
	RecordRoutePublishAttempt(context.Context, string, string, string, string)
	// RecordRouteRetry records one route retry scheduling event.
	RecordRouteRetry(context.Context, string, string)
	// RecordRouteDeadLetter records one route transition to dead_letter.
	RecordRouteDeadLetter(context.Context, string, string, string)
}

// PushPublisherConfig stores the dependencies and policies used by
// PushPublisher.
type PushPublisherConfig struct {
	// Store owns the durable route-state transitions. It is required.
	Store PushRouteStateStore

	// GatewayStream stores the outbound Gateway client-events stream name. It
	// must not be blank.
	GatewayStream string

	// GatewayStreamMaxLen bounds GatewayStream with approximate trimming. It
	// must be positive.
	GatewayStreamMaxLen int64

	// RouteLeaseTTL stores the temporary route-lease lifetime. It must be
	// positive.
	RouteLeaseTTL time.Duration

	// RouteBackoffMin stores the minimum retry backoff. It must be positive and
	// must not exceed RouteBackoffMax.
	RouteBackoffMin time.Duration

	// RouteBackoffMax stores the maximum retry backoff. It must be positive.
	RouteBackoffMax time.Duration

	// PollInterval stores how long the worker waits before the next due-route
	// scan when no progress was made. Non-positive values select the default.
	PollInterval time.Duration

	// BatchSize stores the maximum number of due schedule members loaded per
	// scan. Non-positive values select the default.
	BatchSize int64

	// Encoder stores the push payload encoder. A nil value selects
	// publishpush.Encoder{}.
	Encoder PushEventEncoder

	// Telemetry records route publication counters. It may be nil, in which
	// case no counters are recorded.
	Telemetry RoutePublisherTelemetry

	// Clock provides wall-clock timestamps. A nil value selects systemClock{}.
	Clock Clock
}

// PushPublisher publishes due push routes into the Gateway client-events
// stream with retry and dead-letter handling.
type PushPublisher struct {
	store               PushRouteStateStore
	gatewayStream       string
	gatewayStreamMaxLen int64
	routeLeaseTTL       time.Duration
	routeBackoffMin     time.Duration
	routeBackoffMax     time.Duration
	pollInterval        time.Duration
	batchSize           int64
	encoder             PushEventEncoder
	telemetry           RoutePublisherTelemetry
	clock               Clock
	workerToken         string
	logger              *slog.Logger
}

// NewPushPublisher constructs the push publication worker.
//
// It validates the required configuration, fills defaults for the optional
// fields (poll interval, batch size, clock, encoder, logger), and generates a
// random worker token that is used as the lease token for every route handled
// by the returned publisher.
func NewPushPublisher(cfg PushPublisherConfig, logger *slog.Logger) (*PushPublisher, error) {
	switch {
	case cfg.Store == nil:
		return nil, errors.New("new push publisher: nil store")
	case strings.TrimSpace(cfg.GatewayStream) == "":
		return nil, errors.New("new push publisher: gateway stream must not be empty")
	case cfg.GatewayStreamMaxLen <= 0:
		return nil, errors.New("new push publisher: gateway stream max len must be positive")
	case cfg.RouteLeaseTTL <= 0:
		return nil, errors.New("new push publisher: route lease ttl must be positive")
	case cfg.RouteBackoffMin <= 0:
		return nil, errors.New("new push publisher: route backoff min must be positive")
	case cfg.RouteBackoffMax <= 0:
		return nil, errors.New("new push publisher: route backoff max must be positive")
	case cfg.RouteBackoffMin > cfg.RouteBackoffMax:
		return nil, errors.New("new push publisher: route backoff min must not exceed route backoff max")
	}

	if cfg.PollInterval <= 0 {
		cfg.PollInterval = defaultPushPublisherPollInterval
	}
	if cfg.BatchSize <= 0 {
		cfg.BatchSize = defaultPushPublisherBatchSize
	}
	if cfg.Clock == nil {
		cfg.Clock = systemClock{}
	}
	if cfg.Encoder == nil {
		cfg.Encoder = publishpush.Encoder{}
	}
	if logger == nil {
		logger = slog.Default()
	}

	workerToken, err := newWorkerToken()
	if err != nil {
		return nil, fmt.Errorf("new push publisher: %w", err)
	}

	return &PushPublisher{
		store:               cfg.Store,
		gatewayStream:       cfg.GatewayStream,
		gatewayStreamMaxLen: cfg.GatewayStreamMaxLen,
		routeLeaseTTL:       cfg.RouteLeaseTTL,
		routeBackoffMin:     cfg.RouteBackoffMin,
		routeBackoffMax:     cfg.RouteBackoffMax,
		pollInterval:        cfg.PollInterval,
		batchSize:           cfg.BatchSize,
		encoder:             cfg.Encoder,
		telemetry:           cfg.Telemetry,
		clock:               cfg.Clock,
		workerToken:         workerToken,
		logger:              logger.With("component", "push_publisher", "stream", cfg.GatewayStream),
	}, nil
}

// Run starts the push publication loop and blocks until ctx is canceled or an
// unexpected publication error occurs.
//
// Each iteration scans for due routes. When a scan made progress the next scan
// starts immediately; otherwise the loop waits for the poll interval. A context
// error observed while ctx is done is reported as ctx.Err(); any other error is
// wrapped and returned.
func (publisher *PushPublisher) Run(ctx context.Context) error {
	if ctx == nil {
		return errors.New("run push publisher: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	if publisher == nil {
		return errors.New("run push publisher: nil publisher")
	}

	publisher.logger.Info("push publisher started",
		"poll_interval", publisher.pollInterval.String(),
		"batch_size", publisher.batchSize,
	)

	for {
		progress, err := publisher.publishDueRoutes(ctx)
		switch {
		case err == nil && progress:
			// More routes may already be due; scan again without waiting.
			continue
		case err == nil:
			if waitErr := waitWithContext(ctx, publisher.pollInterval); waitErr != nil {
				publisher.logger.Info("push publisher stopped")
				return waitErr
			}
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)):
			publisher.logger.Info("push publisher stopped")
			return ctx.Err()
		default:
			return fmt.Errorf("run push publisher: %w", err)
		}
	}
}

// Shutdown stops the push publisher within ctx. The worker relies on context
// cancellation and a bounded polling interval, so it has no dedicated
// resources to release here. A nil publisher is accepted.
func (publisher *PushPublisher) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown push publisher: nil context")
	}

	return nil
}

// publishDueRoutes loads one batch of due routes and attempts to publish every
// route whose RouteID starts with "push:". It reports whether at least one
// route changed state and stops at the first error.
func (publisher *PushPublisher) publishDueRoutes(ctx context.Context) (bool, error) {
	now := publisher.now()

	dueRoutes, err := publisher.store.ListDueRoutes(ctx, now, publisher.batchSize)
	if err != nil {
		return false, err
	}

	progress := false
	for _, dueRoute := range dueRoutes {
		// NOTE(review): non-push routes are presumably handled by another
		// worker; confirm against the other publishers.
		if !strings.HasPrefix(dueRoute.RouteID, "push:") {
			continue
		}

		processed, err := publisher.publishRoute(ctx, now, dueRoute)
		if err != nil {
			return progress, err
		}
		progress = progress || processed
	}

	return progress, nil
}

// publishRoute attempts to publish one due route under a temporary lease.
//
// It returns true when the route changed state (published, rescheduled, or
// dead-lettered) and false when the route was skipped: the lease is held
// elsewhere, the route is not a pending/failed push route, it is not due yet,
// or the store reported a conflict. The lease is released on return using a
// fresh context so that release still runs after ctx is canceled.
func (publisher *PushPublisher) publishRoute(ctx context.Context, now time.Time, dueRoute redisstate.ScheduledRoute) (bool, error) {
	acquired, err := publisher.store.TryAcquireRouteLease(ctx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken, publisher.routeLeaseTTL)
	if err != nil {
		return false, fmt.Errorf("acquire route lease %q: %w", dueRoute.RouteID, err)
	}
	if !acquired {
		return false, nil
	}
	defer func() {
		releaseCtx, cancel := context.WithTimeout(context.Background(), publisher.routeLeaseTTL)
		defer cancel()
		// Release is best-effort: the lease carries a TTL, so a failed release
		// is not reported to the caller.
		_ = publisher.store.ReleaseRouteLease(releaseCtx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken)
	}()

	notification, found, err := publisher.store.GetNotification(ctx, dueRoute.NotificationID)
	if err != nil {
		return false, fmt.Errorf("load notification %q: %w", dueRoute.NotificationID, err)
	}
	if !found {
		return false, fmt.Errorf("notification %q is missing for route %q", dueRoute.NotificationID, dueRoute.RouteID)
	}

	route, found, err := publisher.store.GetRoute(ctx, dueRoute.NotificationID, dueRoute.RouteID)
	if err != nil {
		return false, fmt.Errorf("load route %q: %w", dueRoute.RouteID, err)
	}
	if !found {
		return false, fmt.Errorf("route %q is missing for notification %q", dueRoute.RouteID, dueRoute.NotificationID)
	}

	if route.Channel != intentstream.ChannelPush {
		return false, nil
	}
	switch route.Status {
	case acceptintent.RouteStatusPending, acceptintent.RouteStatusFailed:
		// Only pending and previously failed routes are eligible.
	default:
		return false, nil
	}
	if route.NextAttemptAt.After(now) {
		return false, nil
	}

	event, err := publisher.encoder.Encode(notification, route)
	if err != nil {
		return publisher.recordFailure(ctx, notification, route, pushFailureClassificationPayloadEncoding, err.Error())
	}

	err = publisher.store.CompleteRoutePublished(ctx, redisstate.CompleteRoutePublishedInput{
		ExpectedRoute: route,
		LeaseToken:    publisher.workerToken,
		PublishedAt:   publisher.now(),
		Stream:        publisher.gatewayStream,
		StreamMaxLen:  publisher.gatewayStreamMaxLen,
		StreamValues:  eventValues(event),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "published", "")

		logArgs := logging.RouteAttrs(
			notification.NotificationID,
			notification.NotificationType,
			notification.Producer,
			notification.AudienceKind,
			notification.IdempotencyKey,
			notification.RequestID,
			notification.TraceID,
			route.RouteID,
			route.Channel,
		)
		logArgs = append(logArgs,
			"event_id", event.EventID,
			"user_id", event.UserID,
		)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		publisher.logger.Info("push route published", logArgs...)

		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		return false, nil
	default:
		// A canceled or expired ctx means the worker is stopping, not that the
		// publication was rejected. Do not count it as a failed attempt: the
		// route is left untouched and the wrapped context error lets Run stop
		// cleanly.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return false, fmt.Errorf("publish route %q interrupted: %w (publish error: %v)", route.RouteID, ctxErr, err)
		}

		return publisher.recordFailure(ctx, notification, route, pushFailureClassificationGatewayStreamWrite, err.Error())
	}
}

// recordFailure records one failed publication attempt for route.
//
// When the attempt number (route.AttemptCount + 1) reaches route.MaxAttempts the
// route is dead-lettered; otherwise it is rescheduled after a backoff delay. It
// returns true when the transition was stored, false when the store reported a
// conflict, and an error for any other store failure.
func (publisher *PushPublisher) recordFailure(
	ctx context.Context,
	notification acceptintent.NotificationRecord,
	route acceptintent.NotificationRoute,
	classification string,
	message string,
) (bool, error) {
	failureAt := publisher.now()
	attemptNumber := route.AttemptCount + 1
	trimmedMessage := strings.TrimSpace(message)

	logArgs := logging.RouteAttrs(
		notification.NotificationID,
		notification.NotificationType,
		notification.Producer,
		notification.AudienceKind,
		notification.IdempotencyKey,
		notification.RequestID,
		notification.TraceID,
		route.RouteID,
		route.Channel,
	)
	logArgs = append(logArgs,
		"failure_classification", classification,
		"failure_message", trimmedMessage,
		"attempt_number", attemptNumber,
		"max_attempts", route.MaxAttempts,
	)
	logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)

	if attemptNumber >= route.MaxAttempts {
		err := publisher.store.CompleteRouteDeadLetter(ctx, redisstate.CompleteRouteDeadLetterInput{
			ExpectedRoute:         route,
			LeaseToken:            publisher.workerToken,
			DeadLetteredAt:        failureAt,
			FailureClassification: classification,
			FailureMessage:        trimmedMessage,
		})
		switch {
		case err == nil:
			publisher.recordPublishAttempt(ctx, notification, route, "dead_letter", classification)
			publisher.recordRouteDeadLetter(ctx, notification, route, classification)
			publisher.logger.Warn("push route dead-lettered", logArgs...)
			return true, nil
		case errors.Is(err, redisstate.ErrConflict):
			return false, nil
		default:
			return false, fmt.Errorf("dead-letter route %q: %w", route.RouteID, err)
		}
	}

	nextAttemptAt := failureAt.
		Add(routeBackoffDelay(attemptNumber, publisher.routeBackoffMin, publisher.routeBackoffMax)).
		UTC().
		Truncate(time.Millisecond)

	err := publisher.store.CompleteRouteFailed(ctx, redisstate.CompleteRouteFailedInput{
		ExpectedRoute:         route,
		LeaseToken:            publisher.workerToken,
		FailedAt:              failureAt,
		NextAttemptAt:         nextAttemptAt,
		FailureClassification: classification,
		FailureMessage:        trimmedMessage,
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "retry", classification)
		publisher.recordRouteRetry(ctx, notification, route)
		logArgs = append(logArgs, "next_attempt_at", nextAttemptAt)
		publisher.logger.Warn("push route failed and was rescheduled", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		return false, nil
	default:
		return false, fmt.Errorf("reschedule route %q: %w", route.RouteID, err)
	}
}

// eventValues converts event into the field map passed to the store as stream
// values. The payload bytes are copied, and request_id and trace_id are included
// only when non-empty.
func eventValues(event publishpush.Event) map[string]any {
	values := map[string]any{
		"user_id":       event.UserID,
		"event_type":    event.EventType,
		"event_id":      event.EventID,
		"payload_bytes": append([]byte(nil), event.PayloadBytes...),
	}
	if event.RequestID != "" {
		values["request_id"] = event.RequestID
	}
	if event.TraceID != "" {
		values["trace_id"] = event.TraceID
	}

	return values
}

// routeBackoffDelay returns the retry delay for attemptNumber: minBackoff
// doubled once per attempt after the first, clamped to [minBackoff, maxBackoff].
func routeBackoffDelay(attemptNumber int, minBackoff time.Duration, maxBackoff time.Duration) time.Duration {
	delay := minBackoff
	for step := 1; step < attemptNumber; step++ {
		// Stop before doubling would exceed maxBackoff; this also keeps the
		// multiplication from overflowing for large attempt numbers.
		if delay >= maxBackoff/2 {
			return maxBackoff
		}
		delay *= 2
	}
	if delay < minBackoff {
		return minBackoff
	}
	if delay > maxBackoff {
		return maxBackoff
	}

	return delay
}

// waitWithContext blocks for delay and returns nil, or returns ctx.Err() if ctx
// is done first.
func waitWithContext(ctx context.Context, delay time.Duration) error {
	timer := time.NewTimer(delay)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}

// newWorkerToken returns 16 random bytes from crypto/rand encoded as a
// 32-character hexadecimal string.
func newWorkerToken() (string, error) {
	buffer := make([]byte, 16)
	if _, err := rand.Read(buffer); err != nil {
		return "", fmt.Errorf("generate worker token: %w", err)
	}

	return hex.EncodeToString(buffer), nil
}

// now returns the current clock time in UTC truncated to millisecond
// precision.
func (publisher *PushPublisher) now() time.Time {
	return publisher.clock.Now().UTC().Truncate(time.Millisecond)
}

// recordPublishAttempt forwards one publication attempt outcome to telemetry.
// It is a no-op when the publisher or its telemetry is nil.
func (publisher *PushPublisher) recordPublishAttempt(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, result string, classification string) {
	if publisher == nil || publisher.telemetry == nil {
		return
	}

	publisher.telemetry.RecordRoutePublishAttempt(ctx, string(route.Channel), string(notification.NotificationType), result, classification)
}

// recordRouteRetry forwards one retry scheduling event to telemetry. It is a
// no-op when the publisher or its telemetry is nil.
func (publisher *PushPublisher) recordRouteRetry(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute) {
	if publisher == nil || publisher.telemetry == nil {
		return
	}

	publisher.telemetry.RecordRouteRetry(ctx, string(route.Channel), string(notification.NotificationType))
}

// recordRouteDeadLetter forwards one dead-letter transition to telemetry. It is
// a no-op when the publisher or its telemetry is nil.
func (publisher *PushPublisher) recordRouteDeadLetter(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, classification string) {
	if publisher == nil || publisher.telemetry == nil {
		return
	}

	publisher.telemetry.RecordRouteDeadLetter(ctx, string(route.Channel), string(notification.NotificationType), classification)
}