522 lines
17 KiB
Go
522 lines
17 KiB
Go
package worker
|
|
|
|
import (
|
|
"context"
|
|
"crypto/rand"
|
|
"encoding/hex"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"strings"
|
|
"time"
|
|
|
|
"galaxy/notification/internal/api/intentstream"
|
|
"galaxy/notification/internal/logging"
|
|
"galaxy/notification/internal/service/acceptintent"
|
|
"galaxy/notification/internal/service/publishpush"
|
|
"galaxy/notification/internal/service/routestate"
|
|
|
|
"github.com/redis/go-redis/v9"
|
|
)
|
|
|
|
const (
	// defaultPushPublisherPollInterval is the idle wait used when the
	// configured poll interval is not positive.
	defaultPushPublisherPollInterval = 100 * time.Millisecond
	// defaultPushPublisherBatchSize is the due-route scan size used when the
	// configured batch size is not positive.
	defaultPushPublisherBatchSize = 64

	// pushFailureClassificationPayloadEncoding labels failures reported by the
	// push event encoder.
	pushFailureClassificationPayloadEncoding = "payload_encoding_failed"
	// pushFailureClassificationGatewayStreamWrite labels failures that occur
	// while writing the event to the Gateway stream or while recording the
	// publication afterwards.
	pushFailureClassificationGatewayStreamWrite = "gateway_stream_publish_failed"
)
|
|
|
|
// PushRouteStateStore describes the durable route-state operations required by
|
|
// PushPublisher.
|
|
type PushRouteStateStore interface {
|
|
// ListDueRoutes loads due scheduled routes.
|
|
ListDueRoutes(context.Context, time.Time, int64) ([]routestate.ScheduledRoute, error)
|
|
|
|
// TryAcquireRouteLease attempts to acquire one temporary route lease.
|
|
TryAcquireRouteLease(context.Context, string, string, string, time.Duration) (bool, error)
|
|
|
|
// ReleaseRouteLease best-effort releases one temporary route lease.
|
|
ReleaseRouteLease(context.Context, string, string, string) error
|
|
|
|
// GetNotification loads one accepted notification.
|
|
GetNotification(context.Context, string) (acceptintent.NotificationRecord, bool, error)
|
|
|
|
// GetRoute loads one accepted notification route.
|
|
GetRoute(context.Context, string, string) (acceptintent.NotificationRoute, bool, error)
|
|
|
|
// CompleteRoutePublished records one successful publication.
|
|
CompleteRoutePublished(context.Context, routestate.CompleteRoutePublishedInput) error
|
|
|
|
// CompleteRouteFailed records one retryable publication failure.
|
|
CompleteRouteFailed(context.Context, routestate.CompleteRouteFailedInput) error
|
|
|
|
// CompleteRouteDeadLetter records one exhausted publication failure.
|
|
CompleteRouteDeadLetter(context.Context, routestate.CompleteRouteDeadLetterInput) error
|
|
}
|
|
|
|
// PushEventEncoder encodes one push-capable notification route into a
|
|
// Gateway-compatible client event.
|
|
type PushEventEncoder interface {
|
|
// Encode converts notification plus route to one outbound event.
|
|
Encode(acceptintent.NotificationRecord, acceptintent.NotificationRoute) (publishpush.Event, error)
|
|
}
|
|
|
|
// RoutePublisherTelemetry receives low-cardinality counters describing route
// publication outcomes.
type RoutePublisherTelemetry interface {
	// RecordRoutePublishAttempt records the outcome of one publication
	// attempt.
	RecordRoutePublishAttempt(ctx context.Context, channel string, notificationType string, result string, classification string)

	// RecordRouteRetry records that one route was scheduled for a retry.
	RecordRouteRetry(ctx context.Context, channel string, notificationType string)

	// RecordRouteDeadLetter records that one route moved to dead_letter.
	RecordRouteDeadLetter(ctx context.Context, channel string, notificationType string, classification string)
}
|
|
|
|
// PushPublisherConfig stores the dependencies and policies used by
|
|
// PushPublisher.
|
|
type PushPublisherConfig struct {
|
|
// Store owns the durable route-state transitions.
|
|
Store PushRouteStateStore
|
|
|
|
// GatewayStream stores the outbound Gateway client-events stream name.
|
|
GatewayStream string
|
|
|
|
// GatewayStreamMaxLen bounds GatewayStream with approximate trimming.
|
|
GatewayStreamMaxLen int64
|
|
|
|
// RouteLeaseTTL stores the temporary route-lease lifetime.
|
|
RouteLeaseTTL time.Duration
|
|
|
|
// RouteBackoffMin stores the minimum retry backoff.
|
|
RouteBackoffMin time.Duration
|
|
|
|
// RouteBackoffMax stores the maximum retry backoff.
|
|
RouteBackoffMax time.Duration
|
|
|
|
// PollInterval stores how long the worker waits before the next due-route
|
|
// scan when no progress was made.
|
|
PollInterval time.Duration
|
|
|
|
// BatchSize stores the maximum number of due schedule members loaded per
|
|
// scan.
|
|
BatchSize int64
|
|
|
|
// Encoder stores the push payload encoder.
|
|
Encoder PushEventEncoder
|
|
|
|
// Telemetry records route publication counters.
|
|
Telemetry RoutePublisherTelemetry
|
|
|
|
// Clock provides wall-clock timestamps.
|
|
Clock Clock
|
|
|
|
// StreamPublisher emits the outbound Gateway client-event before the
|
|
// route's PostgreSQL state transition is committed.
|
|
StreamPublisher StreamPublisher
|
|
}
|
|
|
|
// PushPublisher publishes due push routes into the Gateway client-events
|
|
// stream with retry and dead-letter handling.
|
|
type PushPublisher struct {
|
|
store PushRouteStateStore
|
|
gatewayStream string
|
|
gatewayStreamMaxLen int64
|
|
routeLeaseTTL time.Duration
|
|
routeBackoffMin time.Duration
|
|
routeBackoffMax time.Duration
|
|
pollInterval time.Duration
|
|
batchSize int64
|
|
encoder PushEventEncoder
|
|
telemetry RoutePublisherTelemetry
|
|
clock Clock
|
|
streamPublisher StreamPublisher
|
|
workerToken string
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// NewPushPublisher constructs the push publication worker.
|
|
func NewPushPublisher(cfg PushPublisherConfig, logger *slog.Logger) (*PushPublisher, error) {
|
|
switch {
|
|
case cfg.Store == nil:
|
|
return nil, errors.New("new push publisher: nil store")
|
|
case cfg.StreamPublisher == nil:
|
|
return nil, errors.New("new push publisher: nil stream publisher")
|
|
case strings.TrimSpace(cfg.GatewayStream) == "":
|
|
return nil, errors.New("new push publisher: gateway stream must not be empty")
|
|
case cfg.GatewayStreamMaxLen <= 0:
|
|
return nil, errors.New("new push publisher: gateway stream max len must be positive")
|
|
case cfg.RouteLeaseTTL <= 0:
|
|
return nil, errors.New("new push publisher: route lease ttl must be positive")
|
|
case cfg.RouteBackoffMin <= 0:
|
|
return nil, errors.New("new push publisher: route backoff min must be positive")
|
|
case cfg.RouteBackoffMax <= 0:
|
|
return nil, errors.New("new push publisher: route backoff max must be positive")
|
|
case cfg.RouteBackoffMin > cfg.RouteBackoffMax:
|
|
return nil, errors.New("new push publisher: route backoff min must not exceed route backoff max")
|
|
}
|
|
if cfg.PollInterval <= 0 {
|
|
cfg.PollInterval = defaultPushPublisherPollInterval
|
|
}
|
|
if cfg.BatchSize <= 0 {
|
|
cfg.BatchSize = defaultPushPublisherBatchSize
|
|
}
|
|
if cfg.Clock == nil {
|
|
cfg.Clock = systemClock{}
|
|
}
|
|
if cfg.Encoder == nil {
|
|
cfg.Encoder = publishpush.Encoder{}
|
|
}
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
|
|
workerToken, err := newWorkerToken()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("new push publisher: %w", err)
|
|
}
|
|
|
|
return &PushPublisher{
|
|
store: cfg.Store,
|
|
gatewayStream: cfg.GatewayStream,
|
|
gatewayStreamMaxLen: cfg.GatewayStreamMaxLen,
|
|
routeLeaseTTL: cfg.RouteLeaseTTL,
|
|
routeBackoffMin: cfg.RouteBackoffMin,
|
|
routeBackoffMax: cfg.RouteBackoffMax,
|
|
pollInterval: cfg.PollInterval,
|
|
batchSize: cfg.BatchSize,
|
|
encoder: cfg.Encoder,
|
|
telemetry: cfg.Telemetry,
|
|
clock: cfg.Clock,
|
|
streamPublisher: cfg.StreamPublisher,
|
|
workerToken: workerToken,
|
|
logger: logger.With("component", "push_publisher", "stream", cfg.GatewayStream),
|
|
}, nil
|
|
}
|
|
|
|
// Run starts the push publication loop and blocks until ctx is canceled or an
|
|
// unexpected publication error occurs.
|
|
func (publisher *PushPublisher) Run(ctx context.Context) error {
|
|
if ctx == nil {
|
|
return errors.New("run push publisher: nil context")
|
|
}
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
if publisher == nil {
|
|
return errors.New("run push publisher: nil publisher")
|
|
}
|
|
|
|
publisher.logger.Info("push publisher started",
|
|
"poll_interval", publisher.pollInterval.String(),
|
|
"batch_size", publisher.batchSize,
|
|
)
|
|
|
|
for {
|
|
progress, err := publisher.publishDueRoutes(ctx)
|
|
switch {
|
|
case err == nil && progress:
|
|
continue
|
|
case err == nil:
|
|
if waitErr := waitWithContext(ctx, publisher.pollInterval); waitErr != nil {
|
|
publisher.logger.Info("push publisher stopped")
|
|
return waitErr
|
|
}
|
|
case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)):
|
|
publisher.logger.Info("push publisher stopped")
|
|
return ctx.Err()
|
|
default:
|
|
return fmt.Errorf("run push publisher: %w", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Shutdown stops the push publisher within ctx. The worker relies on context
|
|
// cancellation and a bounded polling interval, so it has no dedicated
|
|
// resources to release here.
|
|
func (publisher *PushPublisher) Shutdown(ctx context.Context) error {
|
|
if ctx == nil {
|
|
return errors.New("shutdown push publisher: nil context")
|
|
}
|
|
if publisher == nil {
|
|
return nil
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (publisher *PushPublisher) publishDueRoutes(ctx context.Context) (bool, error) {
|
|
now := publisher.now()
|
|
|
|
dueRoutes, err := publisher.store.ListDueRoutes(ctx, now, publisher.batchSize)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
progress := false
|
|
for _, dueRoute := range dueRoutes {
|
|
if !strings.HasPrefix(dueRoute.RouteID, "push:") {
|
|
continue
|
|
}
|
|
|
|
processed, err := publisher.publishRoute(ctx, now, dueRoute)
|
|
if err != nil {
|
|
return progress, err
|
|
}
|
|
progress = progress || processed
|
|
}
|
|
|
|
return progress, nil
|
|
}
|
|
|
|
func (publisher *PushPublisher) publishRoute(ctx context.Context, now time.Time, dueRoute routestate.ScheduledRoute) (bool, error) {
|
|
acquired, err := publisher.store.TryAcquireRouteLease(ctx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken, publisher.routeLeaseTTL)
|
|
if err != nil {
|
|
return false, fmt.Errorf("acquire route lease %q: %w", dueRoute.RouteID, err)
|
|
}
|
|
if !acquired {
|
|
return false, nil
|
|
}
|
|
defer func() {
|
|
releaseCtx, cancel := context.WithTimeout(context.Background(), publisher.routeLeaseTTL)
|
|
defer cancel()
|
|
_ = publisher.store.ReleaseRouteLease(releaseCtx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken)
|
|
}()
|
|
|
|
notification, found, err := publisher.store.GetNotification(ctx, dueRoute.NotificationID)
|
|
if err != nil {
|
|
return false, fmt.Errorf("load notification %q: %w", dueRoute.NotificationID, err)
|
|
}
|
|
if !found {
|
|
return false, fmt.Errorf("notification %q is missing for route %q", dueRoute.NotificationID, dueRoute.RouteID)
|
|
}
|
|
|
|
route, found, err := publisher.store.GetRoute(ctx, dueRoute.NotificationID, dueRoute.RouteID)
|
|
if err != nil {
|
|
return false, fmt.Errorf("load route %q: %w", dueRoute.RouteID, err)
|
|
}
|
|
if !found {
|
|
return false, fmt.Errorf("route %q is missing for notification %q", dueRoute.RouteID, dueRoute.NotificationID)
|
|
}
|
|
if route.Channel != intentstream.ChannelPush {
|
|
return false, nil
|
|
}
|
|
switch route.Status {
|
|
case acceptintent.RouteStatusPending, acceptintent.RouteStatusFailed:
|
|
default:
|
|
return false, nil
|
|
}
|
|
if route.NextAttemptAt.After(now) {
|
|
return false, nil
|
|
}
|
|
|
|
event, err := publisher.encoder.Encode(notification, route)
|
|
if err != nil {
|
|
return publisher.recordFailure(ctx, notification, route, pushFailureClassificationPayloadEncoding, err.Error())
|
|
}
|
|
|
|
xaddArgs := &redis.XAddArgs{
|
|
Stream: publisher.gatewayStream,
|
|
Values: eventValues(event),
|
|
}
|
|
if publisher.gatewayStreamMaxLen > 0 {
|
|
xaddArgs.MaxLen = publisher.gatewayStreamMaxLen
|
|
xaddArgs.Approx = true
|
|
}
|
|
if err := publisher.streamPublisher.XAdd(ctx, xaddArgs).Err(); err != nil {
|
|
return publisher.recordFailure(ctx, notification, route, pushFailureClassificationGatewayStreamWrite, err.Error())
|
|
}
|
|
|
|
err = publisher.store.CompleteRoutePublished(ctx, routestate.CompleteRoutePublishedInput{
|
|
ExpectedRoute: route,
|
|
LeaseToken: publisher.workerToken,
|
|
PublishedAt: publisher.now(),
|
|
Stream: publisher.gatewayStream,
|
|
StreamMaxLen: publisher.gatewayStreamMaxLen,
|
|
StreamValues: eventValues(event),
|
|
})
|
|
switch {
|
|
case err == nil:
|
|
publisher.recordPublishAttempt(ctx, notification, route, "published", "")
|
|
logArgs := logging.RouteAttrs(
|
|
notification.NotificationID,
|
|
notification.NotificationType,
|
|
notification.Producer,
|
|
notification.AudienceKind,
|
|
notification.IdempotencyKey,
|
|
notification.RequestID,
|
|
notification.TraceID,
|
|
route.RouteID,
|
|
route.Channel,
|
|
)
|
|
logArgs = append(logArgs,
|
|
"event_id", event.EventID,
|
|
"user_id", event.UserID,
|
|
)
|
|
logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
|
|
publisher.logger.Info("push route published", logArgs...)
|
|
return true, nil
|
|
case errors.Is(err, routestate.ErrConflict):
|
|
return false, nil
|
|
default:
|
|
return publisher.recordFailure(ctx, notification, route, pushFailureClassificationGatewayStreamWrite, err.Error())
|
|
}
|
|
}
|
|
|
|
func (publisher *PushPublisher) recordFailure(
|
|
ctx context.Context,
|
|
notification acceptintent.NotificationRecord,
|
|
route acceptintent.NotificationRoute,
|
|
classification string,
|
|
message string,
|
|
) (bool, error) {
|
|
failureAt := publisher.now()
|
|
attemptNumber := route.AttemptCount + 1
|
|
logArgs := logging.RouteAttrs(
|
|
notification.NotificationID,
|
|
notification.NotificationType,
|
|
notification.Producer,
|
|
notification.AudienceKind,
|
|
notification.IdempotencyKey,
|
|
notification.RequestID,
|
|
notification.TraceID,
|
|
route.RouteID,
|
|
route.Channel,
|
|
)
|
|
logArgs = append(logArgs,
|
|
"failure_classification", classification,
|
|
"failure_message", strings.TrimSpace(message),
|
|
"attempt_number", attemptNumber,
|
|
"max_attempts", route.MaxAttempts,
|
|
)
|
|
logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
|
|
|
|
if attemptNumber >= route.MaxAttempts {
|
|
err := publisher.store.CompleteRouteDeadLetter(ctx, routestate.CompleteRouteDeadLetterInput{
|
|
ExpectedRoute: route,
|
|
LeaseToken: publisher.workerToken,
|
|
DeadLetteredAt: failureAt,
|
|
FailureClassification: classification,
|
|
FailureMessage: strings.TrimSpace(message),
|
|
})
|
|
switch {
|
|
case err == nil:
|
|
publisher.recordPublishAttempt(ctx, notification, route, "dead_letter", classification)
|
|
publisher.recordRouteDeadLetter(ctx, notification, route, classification)
|
|
publisher.logger.Warn("push route dead-lettered", logArgs...)
|
|
return true, nil
|
|
case errors.Is(err, routestate.ErrConflict):
|
|
return false, nil
|
|
default:
|
|
return false, fmt.Errorf("dead-letter route %q: %w", route.RouteID, err)
|
|
}
|
|
}
|
|
|
|
nextAttemptAt := failureAt.Add(routeBackoffDelay(attemptNumber, publisher.routeBackoffMin, publisher.routeBackoffMax)).UTC().Truncate(time.Millisecond)
|
|
err := publisher.store.CompleteRouteFailed(ctx, routestate.CompleteRouteFailedInput{
|
|
ExpectedRoute: route,
|
|
LeaseToken: publisher.workerToken,
|
|
FailedAt: failureAt,
|
|
NextAttemptAt: nextAttemptAt,
|
|
FailureClassification: classification,
|
|
FailureMessage: strings.TrimSpace(message),
|
|
})
|
|
switch {
|
|
case err == nil:
|
|
publisher.recordPublishAttempt(ctx, notification, route, "retry", classification)
|
|
publisher.recordRouteRetry(ctx, notification, route)
|
|
logArgs = append(logArgs, "next_attempt_at", nextAttemptAt)
|
|
publisher.logger.Warn("push route failed and was rescheduled", logArgs...)
|
|
return true, nil
|
|
case errors.Is(err, routestate.ErrConflict):
|
|
return false, nil
|
|
default:
|
|
return false, fmt.Errorf("reschedule route %q: %w", route.RouteID, err)
|
|
}
|
|
}
|
|
|
|
func eventValues(event publishpush.Event) map[string]any {
|
|
values := map[string]any{
|
|
"user_id": event.UserID,
|
|
"event_type": event.EventType,
|
|
"event_id": event.EventID,
|
|
"payload_bytes": append([]byte(nil), event.PayloadBytes...),
|
|
}
|
|
if event.RequestID != "" {
|
|
values["request_id"] = event.RequestID
|
|
}
|
|
if event.TraceID != "" {
|
|
values["trace_id"] = event.TraceID
|
|
}
|
|
|
|
return values
|
|
}
|
|
|
|
// routeBackoffDelay returns the retry delay for the given attempt number:
// minBackoff for the first attempt, doubled for each further attempt, and
// capped at maxBackoff.
func routeBackoffDelay(attemptNumber int, minBackoff time.Duration, maxBackoff time.Duration) time.Duration {
	backoff := minBackoff
	halfMax := maxBackoff / 2

	for doubling := 1; doubling < attemptNumber; doubling++ {
		// Stop before doubling would reach or pass the cap.
		if backoff >= halfMax {
			return maxBackoff
		}
		backoff *= 2
	}

	switch {
	case backoff < minBackoff:
		return minBackoff
	case backoff > maxBackoff:
		return maxBackoff
	default:
		return backoff
	}
}
|
|
|
|
// waitWithContext blocks for delay or until ctx is done, whichever comes
// first. It returns nil when the delay elapsed and ctx.Err() when the context
// ended the wait.
func waitWithContext(ctx context.Context, delay time.Duration) error {
	pause := time.NewTimer(delay)
	defer pause.Stop()

	select {
	case <-pause.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
|
|
|
|
// newWorkerToken returns 16 bytes read from crypto/rand, hex-encoded into a
// 32-character string. It fails only when the random source cannot be read.
func newWorkerToken() (string, error) {
	var entropy [16]byte
	if _, err := rand.Read(entropy[:]); err != nil {
		return "", fmt.Errorf("generate worker token: %w", err)
	}

	return hex.EncodeToString(entropy[:]), nil
}
|
|
|
|
func (publisher *PushPublisher) now() time.Time {
|
|
return publisher.clock.Now().UTC().Truncate(time.Millisecond)
|
|
}
|
|
|
|
func (publisher *PushPublisher) recordPublishAttempt(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, result string, classification string) {
|
|
if publisher == nil || publisher.telemetry == nil {
|
|
return
|
|
}
|
|
|
|
publisher.telemetry.RecordRoutePublishAttempt(ctx, string(route.Channel), string(notification.NotificationType), result, classification)
|
|
}
|
|
|
|
func (publisher *PushPublisher) recordRouteRetry(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute) {
|
|
if publisher == nil || publisher.telemetry == nil {
|
|
return
|
|
}
|
|
|
|
publisher.telemetry.RecordRouteRetry(ctx, string(route.Channel), string(notification.NotificationType))
|
|
}
|
|
|
|
func (publisher *PushPublisher) recordRouteDeadLetter(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, classification string) {
|
|
if publisher == nil || publisher.telemetry == nil {
|
|
return
|
|
}
|
|
|
|
publisher.telemetry.RecordRouteDeadLetter(ctx, string(route.Channel), string(notification.NotificationType), classification)
|
|
}
|