feat: notification service
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
// Package worker provides the long-lived background components used by the
|
||||
// runnable Notification Service process.
|
||||
package worker
|
||||
@@ -0,0 +1,421 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/notification/internal/adapters/redisstate"
|
||||
"galaxy/notification/internal/api/intentstream"
|
||||
"galaxy/notification/internal/logging"
|
||||
"galaxy/notification/internal/service/acceptintent"
|
||||
"galaxy/notification/internal/service/publishmail"
|
||||
)
|
||||
|
||||
const (
	// emailFailureClassificationPayloadEncoding labels failures where the
	// notification/route pair could not be encoded into a Mail Service command.
	emailFailureClassificationPayloadEncoding = "payload_encoding_failed"

	// emailFailureClassificationMailStreamWrite labels failures where the
	// encoded command could not be published to the mail delivery stream.
	emailFailureClassificationMailStreamWrite = "mail_stream_publish_failed"
)
|
||||
|
||||
// EmailRouteStateStore describes the durable route-state operations required
// by EmailPublisher.
type EmailRouteStateStore interface {
	// ListDueRoutes loads up to the given number of scheduled routes that are
	// due at or before the given time.
	ListDueRoutes(context.Context, time.Time, int64) ([]redisstate.ScheduledRoute, error)

	// TryAcquireRouteLease attempts to acquire one temporary route lease for
	// the given notification id and route id on behalf of the given worker
	// token, with the given TTL. It reports whether the lease was acquired.
	TryAcquireRouteLease(context.Context, string, string, string, time.Duration) (bool, error)

	// ReleaseRouteLease best-effort releases one temporary route lease held
	// under the given worker token.
	ReleaseRouteLease(context.Context, string, string, string) error

	// GetNotification loads one accepted notification by id, reporting
	// whether it exists.
	GetNotification(context.Context, string) (acceptintent.NotificationRecord, bool, error)

	// GetRoute loads one accepted notification route by notification id and
	// route id, reporting whether it exists.
	GetRoute(context.Context, string, string) (acceptintent.NotificationRoute, bool, error)

	// CompleteRoutePublished records one successful publication.
	CompleteRoutePublished(context.Context, redisstate.CompleteRoutePublishedInput) error

	// CompleteRouteFailed records one retryable publication failure.
	CompleteRouteFailed(context.Context, redisstate.CompleteRouteFailedInput) error

	// CompleteRouteDeadLetter records one exhausted publication failure.
	CompleteRouteDeadLetter(context.Context, redisstate.CompleteRouteDeadLetterInput) error
}
|
||||
|
||||
// EmailCommandEncoder encodes one email-capable notification route into a
// Mail Service-compatible generic command.
type EmailCommandEncoder interface {
	// Encode converts the notification plus route into one outbound command.
	// Errors from Encode are treated by the publisher as retryable payload
	// failures, not fatal loop errors.
	Encode(acceptintent.NotificationRecord, acceptintent.NotificationRoute) (publishmail.Command, error)
}
|
||||
|
||||
// EmailPublisherConfig stores the dependencies and policies used by
// EmailPublisher. Store, MailDeliveryCommandsStream, RouteLeaseTTL, and the
// backoff bounds are required; the remaining fields are defaulted by
// NewEmailPublisher when left zero.
type EmailPublisherConfig struct {
	// Store owns the durable route-state transitions. Required.
	Store EmailRouteStateStore

	// MailDeliveryCommandsStream stores the outbound Mail Service command
	// stream name. Required (must not be blank).
	MailDeliveryCommandsStream string

	// RouteLeaseTTL stores the temporary route-lease lifetime. Required
	// (must be positive).
	RouteLeaseTTL time.Duration

	// RouteBackoffMin stores the minimum retry backoff. Required (must be
	// positive and not exceed RouteBackoffMax).
	RouteBackoffMin time.Duration

	// RouteBackoffMax stores the maximum retry backoff. Required (must be
	// positive).
	RouteBackoffMax time.Duration

	// PollInterval stores how long the worker waits before the next due-route
	// scan when no progress was made. Defaults to the shared push-publisher
	// poll interval when non-positive.
	PollInterval time.Duration

	// BatchSize stores the maximum number of due schedule members loaded per
	// scan. Defaults to the shared push-publisher batch size when
	// non-positive.
	BatchSize int64

	// Encoder stores the email command encoder. Defaults to
	// publishmail.Encoder{} when nil.
	Encoder EmailCommandEncoder

	// Telemetry records route publication counters. Optional; nil disables
	// telemetry.
	Telemetry RoutePublisherTelemetry

	// Clock provides wall-clock timestamps. Defaults to the system clock
	// when nil.
	Clock Clock
}
|
||||
|
||||
// EmailPublisher publishes due email routes into the Mail Service command
// stream with retry and dead-letter handling. Construct it with
// NewEmailPublisher; the zero value is not usable.
type EmailPublisher struct {
	// store owns the durable route-state transitions.
	store EmailRouteStateStore
	// mailDeliveryCommandsStream is the outbound Mail Service stream name.
	mailDeliveryCommandsStream string
	// routeLeaseTTL bounds how long one replica holds a route lease.
	routeLeaseTTL time.Duration
	// routeBackoffMin/routeBackoffMax bound the retry backoff window.
	routeBackoffMin time.Duration
	routeBackoffMax time.Duration
	// pollInterval is the idle sleep between due-route scans.
	pollInterval time.Duration
	// batchSize caps the schedule members loaded per scan.
	batchSize int64
	// encoder converts notification+route into an outbound command.
	encoder EmailCommandEncoder
	// telemetry is optional; nil disables counter recording.
	telemetry RoutePublisherTelemetry
	// clock supplies timestamps (injectable for tests).
	clock Clock
	// workerToken identifies this replica for lease ownership.
	workerToken string
	// logger carries the component/stream context for all log lines.
	logger *slog.Logger
}
|
||||
|
||||
// NewEmailPublisher constructs the email publication worker.
|
||||
func NewEmailPublisher(cfg EmailPublisherConfig, logger *slog.Logger) (*EmailPublisher, error) {
|
||||
switch {
|
||||
case cfg.Store == nil:
|
||||
return nil, errors.New("new email publisher: nil store")
|
||||
case strings.TrimSpace(cfg.MailDeliveryCommandsStream) == "":
|
||||
return nil, errors.New("new email publisher: mail delivery-commands stream must not be empty")
|
||||
case cfg.RouteLeaseTTL <= 0:
|
||||
return nil, errors.New("new email publisher: route lease ttl must be positive")
|
||||
case cfg.RouteBackoffMin <= 0:
|
||||
return nil, errors.New("new email publisher: route backoff min must be positive")
|
||||
case cfg.RouteBackoffMax <= 0:
|
||||
return nil, errors.New("new email publisher: route backoff max must be positive")
|
||||
case cfg.RouteBackoffMin > cfg.RouteBackoffMax:
|
||||
return nil, errors.New("new email publisher: route backoff min must not exceed route backoff max")
|
||||
}
|
||||
if cfg.PollInterval <= 0 {
|
||||
cfg.PollInterval = defaultPushPublisherPollInterval
|
||||
}
|
||||
if cfg.BatchSize <= 0 {
|
||||
cfg.BatchSize = defaultPushPublisherBatchSize
|
||||
}
|
||||
if cfg.Clock == nil {
|
||||
cfg.Clock = systemClock{}
|
||||
}
|
||||
if cfg.Encoder == nil {
|
||||
cfg.Encoder = publishmail.Encoder{}
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
workerToken, err := newWorkerToken()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new email publisher: %w", err)
|
||||
}
|
||||
|
||||
return &EmailPublisher{
|
||||
store: cfg.Store,
|
||||
mailDeliveryCommandsStream: cfg.MailDeliveryCommandsStream,
|
||||
routeLeaseTTL: cfg.RouteLeaseTTL,
|
||||
routeBackoffMin: cfg.RouteBackoffMin,
|
||||
routeBackoffMax: cfg.RouteBackoffMax,
|
||||
pollInterval: cfg.PollInterval,
|
||||
batchSize: cfg.BatchSize,
|
||||
encoder: cfg.Encoder,
|
||||
telemetry: cfg.Telemetry,
|
||||
clock: cfg.Clock,
|
||||
workerToken: workerToken,
|
||||
logger: logger.With("component", "email_publisher", "stream", cfg.MailDeliveryCommandsStream),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Run starts the email publication loop and blocks until ctx is canceled or
// an unexpected publication error occurs. Each iteration scans for due
// routes; the worker only sleeps when a scan made no progress.
func (publisher *EmailPublisher) Run(ctx context.Context) error {
	if ctx == nil {
		return errors.New("run email publisher: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	if publisher == nil {
		return errors.New("run email publisher: nil publisher")
	}

	publisher.logger.Info("email publisher started",
		"poll_interval", publisher.pollInterval.String(),
		"batch_size", publisher.batchSize,
	)

	for {
		progress, err := publisher.publishDueRoutes(ctx)
		switch {
		case err == nil && progress:
			// Work was done; rescan immediately in case more routes are due.
			continue
		case err == nil:
			// Idle: sleep one poll interval, or exit when ctx ends first.
			if waitErr := waitWithContext(ctx, publisher.pollInterval); waitErr != nil {
				publisher.logger.Info("email publisher stopped")
				return waitErr
			}
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)):
			// Graceful shutdown: the scan failed because our context ended.
			publisher.logger.Info("email publisher stopped")
			return ctx.Err()
		default:
			// Any other scan error is unexpected and surfaces to the caller.
			return fmt.Errorf("run email publisher: %w", err)
		}
	}
}
|
||||
|
||||
// Shutdown stops the email publisher within ctx. The worker relies on context
|
||||
// cancellation and a bounded polling interval, so it has no dedicated
|
||||
// resources to release here.
|
||||
func (publisher *EmailPublisher) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown email publisher: nil context")
|
||||
}
|
||||
if publisher == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (publisher *EmailPublisher) publishDueRoutes(ctx context.Context) (bool, error) {
|
||||
now := publisher.now()
|
||||
|
||||
dueRoutes, err := publisher.store.ListDueRoutes(ctx, now, publisher.batchSize)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
progress := false
|
||||
for _, dueRoute := range dueRoutes {
|
||||
if !strings.HasPrefix(dueRoute.RouteID, "email:") {
|
||||
continue
|
||||
}
|
||||
|
||||
processed, err := publisher.publishRoute(ctx, now, dueRoute)
|
||||
if err != nil {
|
||||
return progress, err
|
||||
}
|
||||
progress = progress || processed
|
||||
}
|
||||
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// publishRoute attempts to publish one due route. It returns true when the
// route reached a terminal outcome for this attempt (published, rescheduled,
// or dead-lettered) and false when the route was skipped (lease held by
// another replica, non-email channel, non-publishable status, not yet due,
// or a concurrent state conflict).
func (publisher *EmailPublisher) publishRoute(ctx context.Context, now time.Time, dueRoute redisstate.ScheduledRoute) (bool, error) {
	// A short-lived lease guards the route against concurrent replicas.
	acquired, err := publisher.store.TryAcquireRouteLease(ctx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken, publisher.routeLeaseTTL)
	if err != nil {
		return false, fmt.Errorf("acquire route lease %q: %w", dueRoute.RouteID, err)
	}
	if !acquired {
		return false, nil
	}
	defer func() {
		// Release with a fresh context so the lease is freed even when ctx
		// was canceled mid-publication; failure is tolerable because the
		// lease expires on its own after routeLeaseTTL.
		releaseCtx, cancel := context.WithTimeout(context.Background(), publisher.routeLeaseTTL)
		defer cancel()
		_ = publisher.store.ReleaseRouteLease(releaseCtx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken)
	}()

	notification, found, err := publisher.store.GetNotification(ctx, dueRoute.NotificationID)
	if err != nil {
		return false, fmt.Errorf("load notification %q: %w", dueRoute.NotificationID, err)
	}
	if !found {
		return false, fmt.Errorf("notification %q is missing for route %q", dueRoute.NotificationID, dueRoute.RouteID)
	}

	route, found, err := publisher.store.GetRoute(ctx, dueRoute.NotificationID, dueRoute.RouteID)
	if err != nil {
		return false, fmt.Errorf("load route %q: %w", dueRoute.RouteID, err)
	}
	if !found {
		return false, fmt.Errorf("route %q is missing for notification %q", dueRoute.RouteID, dueRoute.NotificationID)
	}
	// Only email routes in a publishable state that are already due proceed.
	if route.Channel != intentstream.ChannelEmail {
		return false, nil
	}
	switch route.Status {
	case acceptintent.RouteStatusPending, acceptintent.RouteStatusFailed:
	default:
		return false, nil
	}
	if route.NextAttemptAt.After(now) {
		return false, nil
	}

	command, err := publisher.encoder.Encode(notification, route)
	if err != nil {
		// Encoding failures count as publication failures (retry or
		// dead-letter) rather than aborting the whole scan.
		return publisher.recordFailure(ctx, notification, route, emailFailureClassificationPayloadEncoding, err.Error())
	}

	// Write the command to the stream and advance the route state through a
	// single store operation, guarded by the expected route and lease token.
	err = publisher.store.CompleteRoutePublished(ctx, redisstate.CompleteRoutePublishedInput{
		ExpectedRoute: route,
		LeaseToken:    publisher.workerToken,
		PublishedAt:   publisher.now(),
		Stream:        publisher.mailDeliveryCommandsStream,
		StreamMaxLen:  0,
		StreamValues:  command.Values(),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "published", "")
		logArgs := logging.RouteAttrs(
			notification.NotificationID,
			notification.NotificationType,
			notification.Producer,
			notification.AudienceKind,
			notification.IdempotencyKey,
			notification.RequestID,
			notification.TraceID,
			route.RouteID,
			route.Channel,
		)
		logArgs = append(logArgs,
			"delivery_id", command.DeliveryID,
			"resolved_email", route.ResolvedEmail,
		)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		publisher.logger.Info("email route published", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		// Another writer advanced the route first; treat as a skip.
		return false, nil
	default:
		return publisher.recordFailure(ctx, notification, route, emailFailureClassificationMailStreamWrite, err.Error())
	}
}
|
||||
|
||||
// recordFailure records one publication failure for route, either
// rescheduling it with bounded backoff or dead-lettering it when the attempt
// budget is exhausted. It returns true when the route state was durably
// advanced and false when a concurrent writer won (redisstate.ErrConflict).
func (publisher *EmailPublisher) recordFailure(
	ctx context.Context,
	notification acceptintent.NotificationRecord,
	route acceptintent.NotificationRoute,
	classification string,
	message string,
) (bool, error) {
	failureAt := publisher.now()
	// The attempt being recorded is one past the persisted attempt count.
	attemptNumber := route.AttemptCount + 1
	logArgs := logging.RouteAttrs(
		notification.NotificationID,
		notification.NotificationType,
		notification.Producer,
		notification.AudienceKind,
		notification.IdempotencyKey,
		notification.RequestID,
		notification.TraceID,
		route.RouteID,
		route.Channel,
	)
	logArgs = append(logArgs,
		"resolved_email", route.ResolvedEmail,
		"failure_classification", classification,
		"failure_message", strings.TrimSpace(message),
		"attempt_number", attemptNumber,
		"max_attempts", route.MaxAttempts,
	)
	logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)

	// Exhausted budget: dead-letter instead of rescheduling.
	if attemptNumber >= route.MaxAttempts {
		err := publisher.store.CompleteRouteDeadLetter(ctx, redisstate.CompleteRouteDeadLetterInput{
			ExpectedRoute:         route,
			LeaseToken:            publisher.workerToken,
			DeadLetteredAt:        failureAt,
			FailureClassification: classification,
			FailureMessage:        strings.TrimSpace(message),
		})
		switch {
		case err == nil:
			publisher.recordPublishAttempt(ctx, notification, route, "dead_letter", classification)
			publisher.recordRouteDeadLetter(ctx, notification, route, classification)
			publisher.logger.Warn("email route dead-lettered", logArgs...)
			return true, nil
		case errors.Is(err, redisstate.ErrConflict):
			// Another writer advanced the route first; treat as a skip.
			return false, nil
		default:
			return false, fmt.Errorf("dead-letter route %q: %w", route.RouteID, err)
		}
	}

	// Retryable failure: schedule the next attempt after a bounded backoff,
	// normalized to UTC millisecond precision like all stored timestamps.
	nextAttemptAt := failureAt.Add(routeBackoffDelay(attemptNumber, publisher.routeBackoffMin, publisher.routeBackoffMax)).UTC().Truncate(time.Millisecond)
	err := publisher.store.CompleteRouteFailed(ctx, redisstate.CompleteRouteFailedInput{
		ExpectedRoute:         route,
		LeaseToken:            publisher.workerToken,
		FailedAt:              failureAt,
		NextAttemptAt:         nextAttemptAt,
		FailureClassification: classification,
		FailureMessage:        strings.TrimSpace(message),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "retry", classification)
		publisher.recordRouteRetry(ctx, notification, route)
		logArgs = append(logArgs, "next_attempt_at", nextAttemptAt)
		publisher.logger.Warn("email route failed and was rescheduled", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		// Another writer advanced the route first; treat as a skip.
		return false, nil
	default:
		return false, fmt.Errorf("reschedule route %q: %w", route.RouteID, err)
	}
}
|
||||
|
||||
func (publisher *EmailPublisher) now() time.Time {
|
||||
return publisher.clock.Now().UTC().Truncate(time.Millisecond)
|
||||
}
|
||||
|
||||
func (publisher *EmailPublisher) recordPublishAttempt(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, result string, classification string) {
|
||||
if publisher == nil || publisher.telemetry == nil {
|
||||
return
|
||||
}
|
||||
|
||||
publisher.telemetry.RecordRoutePublishAttempt(ctx, string(route.Channel), string(notification.NotificationType), result, classification)
|
||||
}
|
||||
|
||||
func (publisher *EmailPublisher) recordRouteRetry(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute) {
|
||||
if publisher == nil || publisher.telemetry == nil {
|
||||
return
|
||||
}
|
||||
|
||||
publisher.telemetry.RecordRouteRetry(ctx, string(route.Channel), string(notification.NotificationType))
|
||||
}
|
||||
|
||||
func (publisher *EmailPublisher) recordRouteDeadLetter(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, classification string) {
|
||||
if publisher == nil || publisher.telemetry == nil {
|
||||
return
|
||||
}
|
||||
|
||||
publisher.telemetry.RecordRouteDeadLetter(ctx, string(route.Channel), string(notification.NotificationType), classification)
|
||||
}
|
||||
@@ -0,0 +1,232 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
redisstate "galaxy/notification/internal/adapters/redisstate"
|
||||
"galaxy/notification/internal/service/acceptintent"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestEmailPublisherPublishesDueEmailRouteAndLeavesPushRoutePending verifies
// that the worker publishes a due email route to the mail stream while
// leaving the sibling push route untouched for the push publisher.
func TestEmailPublisherPublishesDueEmailRouteAndLeavesPushRoutePending(t *testing.T) {
	t.Parallel()

	fixture := newEmailPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 0)))

	running := runEmailPublisher(t, fixture.publisher)
	defer running.stop(t)

	// Wait for the email route to transition to published.
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished
	}, time.Second, 10*time.Millisecond)

	// The push route is outside this worker's scope and must stay pending.
	pushRoute, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, acceptintent.RouteStatusPending, pushRoute.Status)

	// Exactly one Mail Service command must have been written, carrying the
	// expected identity fields.
	messages, err := fixture.client.XRange(context.Background(), fixture.mailStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
	require.Equal(t, "1775121700000-0/email:user:user-1", messages[0].Values["delivery_id"])
	require.Equal(t, "notification", messages[0].Values["source"])
	require.Equal(t, "template", messages[0].Values["payload_mode"])
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "published", ""))
}
|
||||
|
||||
// TestEmailPublisherRetriesMailStreamPublicationFailures verifies that a
// failing stream write marks the route failed with backoff and that the next
// attempt succeeds once the obstruction is removed.
func TestEmailPublisherRetriesMailStreamPublicationFailures(t *testing.T) {
	t.Parallel()

	fixture := newEmailPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 0)))
	// Occupy the stream key with a plain string value so stream writes fail.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.mailStream, "wrong-type", 0).Err())

	running := runEmailPublisher(t, fixture.publisher)
	defer running.stop(t)

	// The first attempt must fail and reschedule the route.
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusFailed && route.AttemptCount == 1
	}, time.Second, 10*time.Millisecond)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "retry", emailFailureClassificationMailStreamWrite))
	require.True(t, fixture.telemetry.hasRouteRetry("email"))

	// Remove the obstruction so the retry can succeed.
	require.NoError(t, fixture.client.Del(context.Background(), fixture.mailStream).Err())

	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished && route.AttemptCount == 2
	}, 2*time.Second, 10*time.Millisecond)

	// Exactly one command must exist after the successful retry.
	messages, err := fixture.client.XRange(context.Background(), fixture.mailStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "published", ""))
}
|
||||
|
||||
// TestEmailPublisherLeasePreventsDuplicatePublicationAcrossReplicas verifies
// that two concurrent publisher replicas sharing one store publish exactly
// one stream entry for the same due route.
func TestEmailPublisherLeasePreventsDuplicatePublicationAcrossReplicas(t *testing.T) {
	t.Parallel()

	fixture := newEmailPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 0)))

	// The second replica shares the store and stream but gets its own worker
	// token (generated by NewEmailPublisher) and its own clock.
	otherPublisher, err := NewEmailPublisher(EmailPublisherConfig{
		Store:                      fixture.store,
		MailDeliveryCommandsStream: fixture.mailStream,
		RouteLeaseTTL:              200 * time.Millisecond,
		RouteBackoffMin:            20 * time.Millisecond,
		RouteBackoffMax:            20 * time.Millisecond,
		PollInterval:               10 * time.Millisecond,
		BatchSize:                  16,
		Clock:                      newSteppingClock(fixture.now, time.Millisecond),
	}, testWorkerLogger())
	require.NoError(t, err)

	first := runEmailPublisher(t, fixture.publisher)
	defer first.stop(t)
	second := runEmailPublisher(t, otherPublisher)
	defer second.stop(t)

	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished
	}, time.Second, 10*time.Millisecond)

	// The route lease must have kept the losing replica from publishing a
	// duplicate command.
	messages, err := fixture.client.XRange(context.Background(), fixture.mailStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
}
|
||||
|
||||
// TestEmailPublisherDeadLettersExhaustedRoute verifies that a route whose
// attempt budget runs out is dead-lettered with the stream-write
// classification and surfaced through telemetry.
func TestEmailPublisherDeadLettersExhaustedRoute(t *testing.T) {
	t.Parallel()

	fixture := newEmailPublisherFixture(t)
	// Start at attempt 6 of max 7 so the next failure exhausts the route.
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 6)))
	// Occupy the stream key with a plain string value so stream writes fail.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.mailStream, "wrong-type", 0).Err())

	running := runEmailPublisher(t, fixture.publisher)
	defer running.stop(t)

	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusDeadLetter && route.AttemptCount == 7
	}, time.Second, 10*time.Millisecond)

	// The persisted dead-letter record must carry the stream-write
	// classification.
	deadLetterPayload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.DeadLetter("1775121700000-0", "email:user:user-1")).Bytes()
	require.NoError(t, err)
	deadLetter, err := redisstate.UnmarshalDeadLetter(deadLetterPayload)
	require.NoError(t, err)
	require.Equal(t, emailFailureClassificationMailStreamWrite, deadLetter.FailureClassification)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "dead_letter", emailFailureClassificationMailStreamWrite))
	require.True(t, fixture.telemetry.hasRouteDeadLetter("email", emailFailureClassificationMailStreamWrite))
}
|
||||
|
||||
// emailPublisherFixture bundles the shared dependencies for email-publisher
// tests: one miniredis-backed client, the acceptance store layered on it, the
// publisher under test, and deterministic clock/telemetry doubles.
type emailPublisherFixture struct {
	// client talks to the embedded miniredis server.
	client *redis.Client
	// store is the acceptance store backed by the same Redis.
	store *redisstate.AcceptanceStore
	// publisher is the worker under test.
	publisher *EmailPublisher
	// mailStream is the outbound Mail Service command stream name.
	mailStream string
	// now is the fixed base time the stepping clock starts from.
	now time.Time
	// clock is the deterministic stepping clock injected into the publisher.
	clock *steppingClock
	// telemetry records counters for assertions.
	telemetry *recordingWorkerTelemetry
}
|
||||
|
||||
// newEmailPublisherFixture starts an embedded miniredis, builds the
// acceptance store and an EmailPublisher wired to a deterministic stepping
// clock and recording telemetry, and returns them bundled for one test.
func newEmailPublisherFixture(t *testing.T) emailPublisherFixture {
	t.Helper()

	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{
		Addr:            server.Addr(),
		Protocol:        2,
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		require.NoError(t, client.Close())
	})

	store, err := redisstate.NewAcceptanceStore(client, redisstate.AcceptanceConfig{
		RecordTTL:      24 * time.Hour,
		DeadLetterTTL:  72 * time.Hour,
		IdempotencyTTL: 7 * 24 * time.Hour,
	})
	require.NoError(t, err)

	// Fixed base time keeps stream entry ids and route timestamps stable
	// across runs; the stepping clock advances 1ms per Now() call.
	now := time.UnixMilli(1775121700000).UTC()
	clock := newSteppingClock(now, time.Millisecond)
	telemetry := &recordingWorkerTelemetry{}
	publisher, err := NewEmailPublisher(EmailPublisherConfig{
		Store:                      store,
		MailDeliveryCommandsStream: "mail:delivery_commands",
		RouteLeaseTTL:              200 * time.Millisecond,
		RouteBackoffMin:            20 * time.Millisecond,
		RouteBackoffMax:            20 * time.Millisecond,
		PollInterval:               10 * time.Millisecond,
		BatchSize:                  16,
		Telemetry:                  telemetry,
		Clock:                      clock,
	}, testWorkerLogger())
	require.NoError(t, err)

	return emailPublisherFixture{
		client:     client,
		store:      store,
		publisher:  publisher,
		mailStream: "mail:delivery_commands",
		now:        now,
		clock:      clock,
		telemetry:  telemetry,
	}
}
|
||||
|
||||
func validEmailAcceptanceInput(now time.Time, emailAttemptCount int) acceptintent.CreateAcceptanceInput {
|
||||
input := validPushAcceptanceInput(now)
|
||||
for index := range input.Routes {
|
||||
if input.Routes[index].RouteID != "email:user:user-1" {
|
||||
continue
|
||||
}
|
||||
input.Routes[index].AttemptCount = emailAttemptCount
|
||||
input.Routes[index].MaxAttempts = 7
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
// runningEmailPublisher is a handle to an EmailPublisher started in the
// background: cancel stops its context and resultCh carries Run's return.
type runningEmailPublisher struct {
	// cancel stops the publisher's run context.
	cancel context.CancelFunc
	// resultCh receives the single error returned by Run.
	resultCh chan error
}
|
||||
|
||||
func runEmailPublisher(t *testing.T, publisher *EmailPublisher) runningEmailPublisher {
|
||||
t.Helper()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
resultCh := make(chan error, 1)
|
||||
go func() {
|
||||
resultCh <- publisher.Run(ctx)
|
||||
}()
|
||||
|
||||
return runningEmailPublisher{
|
||||
cancel: cancel,
|
||||
resultCh: resultCh,
|
||||
}
|
||||
}
|
||||
|
||||
func (r runningEmailPublisher) stop(t *testing.T) {
|
||||
t.Helper()
|
||||
|
||||
r.cancel()
|
||||
|
||||
select {
|
||||
case err := <-r.resultCh:
|
||||
require.ErrorIs(t, err, context.Canceled)
|
||||
case <-time.After(time.Second):
|
||||
require.FailNow(t, "email publisher did not stop")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,331 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/notification/internal/api/intentstream"
|
||||
"galaxy/notification/internal/logging"
|
||||
"galaxy/notification/internal/service/acceptintent"
|
||||
"galaxy/notification/internal/service/malformedintent"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// AcceptIntentUseCase accepts one normalized notification intent.
type AcceptIntentUseCase interface {
	// Execute durably accepts one normalized notification intent and returns
	// the acceptance result.
	Execute(context.Context, acceptintent.AcceptInput) (acceptintent.Result, error)
}
|
||||
|
||||
// MalformedIntentRecorder stores one operator-visible malformed-intent record.
type MalformedIntentRecorder interface {
	// Record persists entry idempotently by stream entry id, so redelivered
	// entries do not create duplicate records.
	Record(context.Context, malformedintent.Entry) error
}
|
||||
|
||||
// StreamOffsetStore stores the last durably processed entry id of one plain
// XREAD consumer.
type StreamOffsetStore interface {
	// Load returns the last processed entry id for stream; the bool reports
	// whether an offset is stored at all.
	Load(context.Context, string) (string, bool, error)

	// Save stores the last processed entry id for stream.
	Save(context.Context, string, string) error
}
|
||||
|
||||
// IntentConsumerTelemetry records low-cardinality stream-consumer events.
type IntentConsumerTelemetry interface {
	// RecordMalformedIntent records one malformed or rejected notification
	// intent. The three string arguments are label values; their exact
	// semantics are defined by implementations (not visible here — confirm
	// against handleMessage's call site).
	RecordMalformedIntent(context.Context, string, string, string)
}
|
||||
|
||||
// Clock provides the current wall-clock time; it exists so tests can inject
// a deterministic time source.
type Clock interface {
	// Now returns the current time.
	Now() time.Time
}
|
||||
|
||||
// systemClock is the production Clock backed by time.Now.
type systemClock struct{}

// Now returns the current wall-clock time.
func (systemClock) Now() time.Time {
	return time.Now()
}
|
||||
|
||||
// IntentConsumerConfig stores the dependencies used by IntentConsumer. All
// fields except Telemetry and Clock are required; NewIntentConsumer defaults
// Clock to the system clock and tolerates a nil Telemetry.
type IntentConsumerConfig struct {
	// Client stores the Redis client used for XREAD. Required.
	Client *redis.Client

	// Stream stores the Redis Stream name to consume. Required (must not be
	// blank).
	Stream string

	// BlockTimeout stores the blocking XREAD timeout. Required (must be
	// positive).
	BlockTimeout time.Duration

	// Acceptor durably accepts valid notification intents. Required.
	Acceptor AcceptIntentUseCase

	// MalformedRecorder persists operator-visible malformed-intent entries.
	// Required.
	MalformedRecorder MalformedIntentRecorder

	// OffsetStore stores the last durably processed stream entry id.
	// Required.
	OffsetStore StreamOffsetStore

	// Telemetry records malformed-intent counters. Optional.
	Telemetry IntentConsumerTelemetry

	// Clock provides wall-clock timestamps for malformed-intent records.
	// Defaults to the system clock when nil.
	Clock Clock
}
|
||||
|
||||
// IntentConsumer stores the Redis Streams consumer used for notification
// intent intake. Construct it with NewIntentConsumer; the zero value is not
// usable.
type IntentConsumer struct {
	// client performs the blocking XREAD calls.
	client *redis.Client
	// stream is the Redis Stream name being consumed.
	stream string
	// blockTimeout bounds each blocking XREAD.
	blockTimeout time.Duration
	// acceptor durably accepts valid intents.
	acceptor AcceptIntentUseCase
	// malformedRecorder persists operator-visible malformed entries.
	malformedRecorder MalformedIntentRecorder
	// offsetStore persists the last processed entry id.
	offsetStore StreamOffsetStore
	// telemetry is optional; nil disables counter recording.
	telemetry IntentConsumerTelemetry
	// clock supplies timestamps (injectable for tests).
	clock Clock
	// logger carries the component/stream context for all log lines.
	logger *slog.Logger
}
|
||||
|
||||
// NewIntentConsumer constructs the notification-intent consumer.
|
||||
func NewIntentConsumer(cfg IntentConsumerConfig, logger *slog.Logger) (*IntentConsumer, error) {
|
||||
switch {
|
||||
case cfg.Client == nil:
|
||||
return nil, errors.New("new intent consumer: nil redis client")
|
||||
case strings.TrimSpace(cfg.Stream) == "":
|
||||
return nil, errors.New("new intent consumer: stream must not be empty")
|
||||
case cfg.BlockTimeout <= 0:
|
||||
return nil, errors.New("new intent consumer: block timeout must be positive")
|
||||
case cfg.Acceptor == nil:
|
||||
return nil, errors.New("new intent consumer: nil acceptor")
|
||||
case cfg.MalformedRecorder == nil:
|
||||
return nil, errors.New("new intent consumer: nil malformed recorder")
|
||||
case cfg.OffsetStore == nil:
|
||||
return nil, errors.New("new intent consumer: nil offset store")
|
||||
}
|
||||
if cfg.Clock == nil {
|
||||
cfg.Clock = systemClock{}
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
return &IntentConsumer{
|
||||
client: cfg.Client,
|
||||
stream: cfg.Stream,
|
||||
blockTimeout: cfg.BlockTimeout,
|
||||
acceptor: cfg.Acceptor,
|
||||
malformedRecorder: cfg.MalformedRecorder,
|
||||
offsetStore: cfg.OffsetStore,
|
||||
telemetry: cfg.Telemetry,
|
||||
clock: cfg.Clock,
|
||||
logger: logger.With("component", "intent_consumer", "stream", cfg.Stream),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Run starts the intent consumer and blocks until ctx is canceled or Redis
// returns an unexpected error.
//
// The loop reads one stream entry at a time with a bounded blocking XRead,
// handles it, persists the entry ID as the new offset, and only then
// advances. Any handling or offset-save error aborts the loop so the entry
// is re-read on restart (at-least-once processing).
func (consumer *IntentConsumer) Run(ctx context.Context) error {
	if ctx == nil {
		return errors.New("run intent consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	if consumer == nil || consumer.client == nil {
		return errors.New("run intent consumer: nil consumer")
	}

	// Resume from the persisted offset, or from the beginning of the stream
	// ("0-0") when no offset has been stored yet.
	lastID, found, err := consumer.offsetStore.Load(ctx, consumer.stream)
	if err != nil {
		return fmt.Errorf("run intent consumer: load stream offset: %w", err)
	}
	if !found {
		lastID = "0-0"
	}

	consumer.logger.Info("intent consumer started", "block_timeout", consumer.blockTimeout.String(), "start_entry_id", lastID)

	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					if err := consumer.handleMessage(ctx, message); err != nil {
						return err
					}
					// Save the offset only after the message is handled, so a
					// crash in between replays the entry instead of losing it.
					if err := consumer.offsetStore.Save(ctx, consumer.stream, message.ID); err != nil {
						return fmt.Errorf("run intent consumer: save stream offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			// Block timeout elapsed with no new entries; poll again.
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			// Graceful stop: the caller canceled our context.
			consumer.logger.Info("intent consumer stopped")
			return ctx.Err()
		case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, redis.ErrClosed):
			// Cancellation or closed client that did not come from our own
			// context; treat it as an unexpected failure.
			return fmt.Errorf("run intent consumer: %w", err)
		default:
			return fmt.Errorf("run intent consumer: %w", err)
		}
	}
}
|
||||
|
||||
// handleMessage decodes and accepts one stream entry. Malformed or rejected
// intents are archived via recordMalformed and the entry still counts as
// handled; only infrastructure-level failures return an error, which stops
// the consumer without advancing the offset.
func (consumer *IntentConsumer) handleMessage(ctx context.Context, message redis.XMessage) error {
	// Deep-copy the raw fields so archived entries cannot alias the Redis
	// client's message map.
	rawFields := cloneRawFields(message.Values)

	intent, err := intentstream.DecodeIntent(rawFields)
	if err != nil {
		// Undecodable payload: archive it with a decode-specific failure code.
		return consumer.recordMalformed(
			ctx,
			message.ID,
			rawFields,
			intentstream.ClassifyDecodeError(err),
			err,
		)
	}

	// The stream entry ID doubles as the notification ID.
	result, err := consumer.acceptor.Execute(ctx, acceptintent.AcceptInput{
		NotificationID: message.ID,
		Intent:         intent,
	})
	switch {
	case err == nil:
		logArgs := []any{
			"stream_entry_id", message.ID,
			"notification_id", message.ID,
		}
		logArgs = append(logArgs, logging.IntentAttrs(intent)...)
		logArgs = append(logArgs,
			"outcome", string(result.Outcome),
		)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		consumer.logger.Info("notification intent handled", logArgs...)
		return nil
	case errors.Is(err, acceptintent.ErrConflict):
		// Idempotency-key conflict: archive and move on.
		return consumer.recordMalformed(ctx, message.ID, rawFields, malformedintent.FailureCodeIdempotencyConflict, err)
	case errors.Is(err, acceptintent.ErrRecipientNotFound):
		// Unknown recipient: archive and move on.
		return consumer.recordMalformed(ctx, message.ID, rawFields, malformedintent.FailureCodeRecipientNotFound, err)
	case errors.Is(err, acceptintent.ErrServiceUnavailable):
		// Transient dependency outage: propagate so the offset is not
		// advanced and the entry is retried after restart.
		return fmt.Errorf("handle intent %q: %w", message.ID, err)
	default:
		return fmt.Errorf("handle intent %q: %w", message.ID, err)
	}
}
|
||||
|
||||
// recordMalformed durably archives one rejected stream entry, emits optional
// telemetry, and logs a warning. It returns an error only when the archive
// write itself fails; an archived rejection otherwise counts as handled.
func (consumer *IntentConsumer) recordMalformed(
	ctx context.Context,
	streamEntryID string,
	rawFields map[string]any,
	failureCode malformedintent.FailureCode,
	cause error,
) error {
	entry := malformedintent.Entry{
		StreamEntryID:    streamEntryID,
		NotificationType: optionalRawString(rawFields, "notification_type"),
		Producer:         optionalRawString(rawFields, "producer"),
		IdempotencyKey:   optionalRawString(rawFields, "idempotency_key"),
		FailureCode:      failureCode,
		FailureMessage:   strings.TrimSpace(cause.Error()),
		// Copy again so the stored entry cannot alias the caller's map.
		RawFields: cloneRawFields(rawFields),
		// Truncate to millisecond precision for a stable recorded timestamp.
		RecordedAt: consumer.clock.Now().UTC().Truncate(time.Millisecond),
	}
	if err := consumer.malformedRecorder.Record(ctx, entry); err != nil {
		return fmt.Errorf("record malformed intent %q: %w", streamEntryID, err)
	}
	// Telemetry is optional; skip when not configured.
	if consumer.telemetry != nil {
		consumer.telemetry.RecordMalformedIntent(ctx, string(failureCode), entry.NotificationType, entry.Producer)
	}

	logArgs := []any{
		"stream_entry_id", streamEntryID,
		"notification_type", entry.NotificationType,
		"producer", entry.Producer,
		"idempotency_key", entry.IdempotencyKey,
		"failure_code", string(entry.FailureCode),
		"failure_message", entry.FailureMessage,
	}
	// Include the trace ID carried in the entry itself (when present) plus
	// any trace attributes available on the context.
	if traceID := optionalRawString(rawFields, "trace_id"); traceID != "" {
		logArgs = append(logArgs, "trace_id", traceID)
	}
	logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
	consumer.logger.Warn("notification intent rejected", logArgs...)

	return nil
}
|
||||
|
||||
// cloneRawFields returns a deep copy of values. A nil input yields an empty,
// non-nil map so callers can always index and store the result.
func cloneRawFields(values map[string]any) map[string]any {
	// Ranging over a nil map is a no-op, so the nil case falls out naturally.
	out := make(map[string]any, len(values))
	for key, raw := range values {
		out[key] = cloneRawValue(raw)
	}
	return out
}

// cloneRawValue deep-copies nested maps and slices; every other value is
// returned unchanged.
func cloneRawValue(value any) any {
	if nested, ok := value.(map[string]any); ok {
		return cloneRawFields(nested)
	}
	if list, ok := value.([]any); ok {
		copied := make([]any, len(list))
		for i := range list {
			copied[i] = cloneRawValue(list[i])
		}
		return copied
	}
	return value
}
|
||||
|
||||
// optionalRawString returns the string form of values[key] when the value is
// a string or byte slice, and "" for any other type or a missing key.
func optionalRawString(values map[string]any, key string) string {
	// A missing key yields nil, which falls through to the empty default.
	switch typed := values[key].(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	}
	return ""
}
|
||||
|
||||
// Shutdown stops the intent consumer within ctx. The consumer relies on
|
||||
// context cancellation and a bounded block timeout, so it has no dedicated
|
||||
// resources to release here.
|
||||
func (consumer *IntentConsumer) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown intent consumer: nil context")
|
||||
}
|
||||
if consumer == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,422 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
redisstate "galaxy/notification/internal/adapters/redisstate"
|
||||
"galaxy/notification/internal/config"
|
||||
"galaxy/notification/internal/service/acceptintent"
|
||||
"galaxy/notification/internal/service/malformedintent"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestIntentConsumerStartsFromZeroOffsetWhenNoStoredOffsetExists verifies
// that a consumer with no persisted offset reads the stream from the start
// and accepts the first entry.
func TestIntentConsumerStartsFromZeroOffsetWhenNoStoredOffsetExists(t *testing.T) {
	t.Parallel()

	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	messageID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)

	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)

	// Acceptance is observable once the notification record appears in the
	// acceptance store under the stream entry ID.
	require.Eventually(t, func() bool {
		_, found, err := fixture.acceptanceStore.GetNotification(context.Background(), messageID)
		return err == nil && found
	}, time.Second, 10*time.Millisecond)
}
|
||||
|
||||
// TestIntentConsumerContinuesFromSavedOffsetAfterRestart verifies that a
// pre-seeded offset makes the consumer skip entries at or before it and
// process only newer entries.
func TestIntentConsumerContinuesFromSavedOffsetAfterRestart(t *testing.T) {
	t.Parallel()

	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	// Simulate a prior run that already processed the first entry by saving
	// its ID as the stored offset before the consumer starts.
	firstID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	require.NoError(t, fixture.offsetStore.Save(context.Background(), fixture.stream, firstID))
	secondID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":55,"game_name":"Nebula Clash","game_id":"game-123"}`)

	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)

	// The second entry must be accepted...
	require.Eventually(t, func() bool {
		_, found, err := fixture.acceptanceStore.GetNotification(context.Background(), secondID)
		return err == nil && found
	}, time.Second, 10*time.Millisecond)

	// ...while the first (already-processed) entry must never be re-accepted.
	_, found, err := fixture.acceptanceStore.GetNotification(context.Background(), firstID)
	require.NoError(t, err)
	require.False(t, found)
}
|
||||
|
||||
// TestIntentConsumerRecordsIdempotencyConflictsAndAdvancesOffset verifies
// that two entries sharing an idempotency key (addValidIntent uses a fixed
// key) lead to the first being accepted and the second being archived as an
// idempotency conflict, while the offset still advances past both.
func TestIntentConsumerRecordsIdempotencyConflictsAndAdvancesOffset(t *testing.T) {
	t.Parallel()

	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	// Different payloads but the same idempotency key, so the second entry
	// conflicts rather than deduplicating.
	firstID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	secondID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":55,"game_name":"Nebula Clash","game_id":"game-123"}`)

	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)

	// The conflict is archived under the second entry's ID.
	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(secondID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == "idempotency_conflict"
	}, time.Second, 10*time.Millisecond)

	// The offset advanced past the conflicting entry.
	offset, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, secondID, offset)

	// Only the first entry produced an accepted notification.
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), firstID)
	require.NoError(t, err)
	require.True(t, found)

	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), secondID)
	require.NoError(t, err)
	require.False(t, found)
}
|
||||
|
||||
// TestIntentConsumerShutdownInterruptsBlockingRead verifies that canceling
// the Run context makes the consumer exit with context.Canceled even while
// blocked on an empty stream.
func TestIntentConsumerShutdownInterruptsBlockingRead(t *testing.T) {
	t.Parallel()

	fixture := newIntentConsumerFixture(t, stubUserDirectory{})

	ctx, cancel := context.WithCancel(context.Background())
	resultCh := make(chan error, 1)
	go func() {
		resultCh <- fixture.consumer.Run(ctx)
	}()

	// Give the consumer time to enter its blocking read before canceling.
	time.Sleep(50 * time.Millisecond)
	cancel()

	select {
	case err := <-resultCh:
		require.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		require.FailNow(t, "intent consumer did not stop after shutdown")
	}
}
|
||||
|
||||
// TestIntentConsumerRecordsRecipientNotFoundAndAdvancesOffset verifies that
// an intent for an unknown recipient (empty user directory) is archived with
// the recipient-not-found failure code, is not accepted, and still advances
// the offset.
func TestIntentConsumerRecordsRecipientNotFoundAndAdvancesOffset(t *testing.T) {
	t.Parallel()

	// Empty directory: every lookup returns acceptintent.ErrRecipientNotFound.
	fixture := newIntentConsumerFixture(t, stubUserDirectory{})
	messageID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)

	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)

	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(messageID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == malformedintent.FailureCodeRecipientNotFound
	}, time.Second, 10*time.Millisecond)

	// The rejected entry still advances the stored offset.
	offset, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, messageID, offset)

	// No accepted notification was produced for it.
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), messageID)
	require.NoError(t, err)
	require.False(t, found)
}
|
||||
|
||||
// TestIntentConsumerRecordsMalformedIntentAndAdvancesOffset verifies that a
// stream entry missing a required field (no "payload_json") is archived as
// an invalid payload, is not accepted, and still advances the offset.
func TestIntentConsumerRecordsMalformedIntentAndAdvancesOffset(t *testing.T) {
	t.Parallel()

	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	// Hand-built entry: identical to addValidIntent but deliberately missing
	// the "payload_json" field so decoding fails.
	messageID, err := fixture.client.XAdd(context.Background(), &redis.XAddArgs{
		Stream: fixture.stream,
		Values: map[string]any{
			"notification_type":       "game.turn.ready",
			"producer":                "game_master",
			"audience_kind":           "user",
			"recipient_user_ids_json": `["user-1"]`,
			"idempotency_key":         "game-123:turn-ready",
			"occurred_at_ms":          "1775121700000",
		},
	}).Result()
	require.NoError(t, err)

	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)

	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(messageID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == malformedintent.FailureCodeInvalidPayload &&
			entry.StreamEntryID == messageID
	}, time.Second, 10*time.Millisecond)

	// The malformed entry still advances the stored offset.
	offset, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, messageID, offset)

	// No accepted notification was produced for it.
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), messageID)
	require.NoError(t, err)
	require.False(t, found)
}
|
||||
|
||||
// TestIntentConsumerRecordsTelemetryForOutcomesAndMalformedIntents verifies
// that the shared telemetry recorder sees an "accepted" outcome (first
// entry), a "duplicate" outcome (identical second entry), and an
// idempotency-conflict malformed-intent counter (third entry with a new
// payload under the same idempotency key).
func TestIntentConsumerRecordsTelemetryForOutcomesAndMalformedIntents(t *testing.T) {
	t.Parallel()

	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	// First: accepted. Second: byte-identical payload => duplicate. Third:
	// different payload, same idempotency key => conflict.
	addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	conflictID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":55,"game_name":"Nebula Clash","game_id":"game-123"}`)

	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)

	// Wait until the conflicting third entry has been archived, which implies
	// all three entries were processed.
	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(conflictID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == malformedintent.FailureCodeIdempotencyConflict
	}, time.Second, 10*time.Millisecond)

	// All three telemetry signals must have been recorded.
	require.Eventually(t, func() bool {
		return fixture.telemetry.hasIntentOutcome("accepted") &&
			fixture.telemetry.hasIntentOutcome("duplicate") &&
			fixture.telemetry.hasMalformedIntent("idempotency_conflict")
	}, time.Second, 10*time.Millisecond)
}
|
||||
|
||||
// TestIntentConsumerStopsWithoutAdvancingOffsetWhenUserDirectoryIsUnavailable
// verifies that an infrastructure-level failure (user directory error) stops
// the consumer with an error, without saving the offset or accepting the
// entry — so the entry will be retried after restart.
func TestIntentConsumerStopsWithoutAdvancingOffsetWhenUserDirectoryIsUnavailable(t *testing.T) {
	t.Parallel()

	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		err: errors.New("user service unavailable"),
	})
	messageID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	resultCh := make(chan error, 1)
	go func() {
		resultCh <- fixture.consumer.Run(ctx)
	}()

	// Run must terminate on its own (not via cancellation) with the
	// directory's error.
	var runErr error
	require.Eventually(t, func() bool {
		select {
		case runErr = <-resultCh:
			return true
		default:
			return false
		}
	}, time.Second, 10*time.Millisecond)

	require.Error(t, runErr)
	require.ErrorContains(t, runErr, "user service unavailable")

	// The offset was never saved...
	_, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.False(t, found)

	// ...and the entry was never accepted.
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), messageID)
	require.NoError(t, err)
	require.False(t, found)
}
|
||||
|
||||
// intentConsumerFixture bundles the consumer under test with the Redis
// client and stores used to seed input and observe its effects.
type intentConsumerFixture struct {
	client          *redis.Client                 // direct Redis access for seeding and inspection
	stream          string                        // intent stream name the consumer reads
	acceptanceStore *redisstate.AcceptanceStore   // where accepted notifications land
	offsetStore     *redisstate.StreamOffsetStore // persisted consumer offset
	consumer        *IntentConsumer               // consumer under test
	telemetry       *recordingWorkerTelemetry     // captured telemetry counters
}
|
||||
|
||||
// newIntentConsumerFixture wires a miniredis-backed consumer together with
// the acceptance, malformed-intent, and offset stores, a recording telemetry
// stub, and the given user directory.
func newIntentConsumerFixture(t *testing.T, userDirectory acceptintent.UserDirectory) intentConsumerFixture {
	t.Helper()

	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{
		Addr:            server.Addr(),
		Protocol:        2,
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		assert.NoError(t, client.Close())
	})

	acceptanceStore, err := redisstate.NewAcceptanceStore(client, redisstate.AcceptanceConfig{
		RecordTTL:      24 * time.Hour,
		DeadLetterTTL:  72 * time.Hour,
		IdempotencyTTL: 7 * 24 * time.Hour,
	})
	require.NoError(t, err)
	malformedStore, err := redisstate.NewMalformedIntentStore(client, 72*time.Hour)
	require.NoError(t, err)
	offsetStore, err := redisstate.NewStreamOffsetStore(client)
	require.NoError(t, err)
	// The accept service and the consumer share one telemetry recorder so
	// tests can assert on outcome and malformed-intent counters together.
	telemetry := &recordingWorkerTelemetry{}
	service, err := acceptintent.New(acceptintent.Config{
		Store:            acceptanceStore,
		UserDirectory:    userDirectory,
		Clock:            fixedClock{now: time.UnixMilli(1775121700000).UTC()},
		Logger:           slog.New(slog.NewTextHandler(io.Discard, nil)),
		Telemetry:        telemetry,
		PushMaxAttempts:  3,
		EmailMaxAttempts: 7,
		IdempotencyTTL:   7 * 24 * time.Hour,
		AdminRouting:     config.AdminRoutingConfig{},
	})
	require.NoError(t, err)
	// A short block timeout keeps cancellation latency low in tests.
	consumer, err := NewIntentConsumer(IntentConsumerConfig{
		Client:            client,
		Stream:            "notification:intents",
		BlockTimeout:      25 * time.Millisecond,
		Acceptor:          service,
		MalformedRecorder: malformedStore,
		OffsetStore:       offsetStore,
		Telemetry:         telemetry,
		Clock:             fixedClock{now: time.UnixMilli(1775121700001).UTC()},
	}, slog.New(slog.NewTextHandler(io.Discard, nil)))
	require.NoError(t, err)

	return intentConsumerFixture{
		client:          client,
		stream:          "notification:intents",
		acceptanceStore: acceptanceStore,
		offsetStore:     offsetStore,
		consumer:        consumer,
		telemetry:       telemetry,
	}
}
|
||||
|
||||
func addValidIntent(t *testing.T, client *redis.Client, stream string, payloadJSON string) string {
|
||||
t.Helper()
|
||||
|
||||
messageID, err := client.XAdd(context.Background(), &redis.XAddArgs{
|
||||
Stream: stream,
|
||||
Values: map[string]any{
|
||||
"notification_type": "game.turn.ready",
|
||||
"producer": "game_master",
|
||||
"audience_kind": "user",
|
||||
"recipient_user_ids_json": `["user-1"]`,
|
||||
"idempotency_key": "game-123:turn-ready",
|
||||
"occurred_at_ms": "1775121700000",
|
||||
"payload_json": payloadJSON,
|
||||
},
|
||||
}).Result()
|
||||
require.NoError(t, err)
|
||||
|
||||
return messageID
|
||||
}
|
||||
|
||||
// runningIntentConsumer is a handle to a consumer started in the background
// by runIntentConsumer.
type runningIntentConsumer struct {
	cancel   context.CancelFunc // cancels the consumer's Run context
	resultCh chan error         // receives Run's final error
}
|
||||
|
||||
func runIntentConsumer(t *testing.T, consumer *IntentConsumer) runningIntentConsumer {
|
||||
t.Helper()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
resultCh := make(chan error, 1)
|
||||
go func() {
|
||||
resultCh <- consumer.Run(ctx)
|
||||
}()
|
||||
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
return runningIntentConsumer{
|
||||
cancel: cancel,
|
||||
resultCh: resultCh,
|
||||
}
|
||||
}
|
||||
|
||||
func (r runningIntentConsumer) stop(t *testing.T) {
|
||||
t.Helper()
|
||||
|
||||
r.cancel()
|
||||
|
||||
select {
|
||||
case err := <-r.resultCh:
|
||||
require.ErrorIs(t, err, context.Canceled)
|
||||
case <-time.After(time.Second):
|
||||
require.FailNow(t, "intent consumer did not stop")
|
||||
}
|
||||
}
|
||||
|
||||
type fixedClock struct {
|
||||
now time.Time
|
||||
}
|
||||
|
||||
func (clock fixedClock) Now() time.Time {
|
||||
return clock.now
|
||||
}
|
||||
|
||||
type stubUserDirectory struct {
|
||||
records map[string]acceptintent.UserRecord
|
||||
err error
|
||||
}
|
||||
|
||||
func (directory stubUserDirectory) GetUserByID(_ context.Context, userID string) (acceptintent.UserRecord, error) {
|
||||
if directory.err != nil {
|
||||
return acceptintent.UserRecord{}, directory.err
|
||||
}
|
||||
if record, ok := directory.records[userID]; ok {
|
||||
return record, nil
|
||||
}
|
||||
|
||||
return acceptintent.UserRecord{}, acceptintent.ErrRecipientNotFound
|
||||
}
|
||||
@@ -0,0 +1,499 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/notification/internal/adapters/redisstate"
|
||||
"galaxy/notification/internal/api/intentstream"
|
||||
"galaxy/notification/internal/logging"
|
||||
"galaxy/notification/internal/service/acceptintent"
|
||||
"galaxy/notification/internal/service/publishpush"
|
||||
)
|
||||
|
||||
const (
	// Defaults applied by NewPushPublisher when the corresponding config
	// field is zero or negative.
	defaultPushPublisherPollInterval = 100 * time.Millisecond
	defaultPushPublisherBatchSize    = 64

	// Failure classifications recorded against a route: the event could not
	// be encoded, or the encoded event could not be written to the gateway
	// stream.
	pushFailureClassificationPayloadEncoding    = "payload_encoding_failed"
	pushFailureClassificationGatewayStreamWrite = "gateway_stream_publish_failed"
)
|
||||
|
||||
// PushRouteStateStore describes the durable route-state operations required by
// PushPublisher.
type PushRouteStateStore interface {
	// ListDueRoutes loads scheduled routes due as of the given time, up to
	// the given limit.
	ListDueRoutes(context.Context, time.Time, int64) ([]redisstate.ScheduledRoute, error)

	// TryAcquireRouteLease attempts to acquire one temporary route lease for
	// (notificationID, routeID) on behalf of a worker token, valid for the
	// given duration.
	TryAcquireRouteLease(context.Context, string, string, string, time.Duration) (bool, error)

	// ReleaseRouteLease best-effort releases one temporary route lease
	// identified by (notificationID, routeID, workerToken).
	ReleaseRouteLease(context.Context, string, string, string) error

	// GetNotification loads one accepted notification by notification ID.
	GetNotification(context.Context, string) (acceptintent.NotificationRecord, bool, error)

	// GetRoute loads one accepted notification route by
	// (notificationID, routeID).
	GetRoute(context.Context, string, string) (acceptintent.NotificationRoute, bool, error)

	// CompleteRoutePublished records one successful publication.
	CompleteRoutePublished(context.Context, redisstate.CompleteRoutePublishedInput) error

	// CompleteRouteFailed records one retryable publication failure.
	CompleteRouteFailed(context.Context, redisstate.CompleteRouteFailedInput) error

	// CompleteRouteDeadLetter records one exhausted publication failure.
	CompleteRouteDeadLetter(context.Context, redisstate.CompleteRouteDeadLetterInput) error
}
|
||||
|
||||
// PushEventEncoder encodes one push-capable notification route into a
// Gateway-compatible client event. NewPushPublisher defaults this to
// publishpush.Encoder when unset.
type PushEventEncoder interface {
	// Encode converts a notification plus one of its routes into one
	// outbound event; an encoding error is recorded as a route failure.
	Encode(acceptintent.NotificationRecord, acceptintent.NotificationRoute) (publishpush.Event, error)
}
|
||||
|
||||
// RoutePublisherTelemetry records low-cardinality route publication outcomes.
// NOTE(review): the string argument meanings below are inferred from method
// names only — confirm against the implementing telemetry package.
type RoutePublisherTelemetry interface {
	// RecordRoutePublishAttempt records one route publication attempt outcome.
	RecordRoutePublishAttempt(context.Context, string, string, string, string)

	// RecordRouteRetry records one route retry scheduling event.
	RecordRouteRetry(context.Context, string, string)

	// RecordRouteDeadLetter records one route transition to dead_letter.
	RecordRouteDeadLetter(context.Context, string, string, string)
}
|
||||
|
||||
// PushPublisherConfig stores the dependencies and policies used by
// PushPublisher. Store, GatewayStream, GatewayStreamMaxLen, RouteLeaseTTL,
// and both backoff bounds are validated by NewPushPublisher; the remaining
// fields receive defaults when unset.
type PushPublisherConfig struct {
	// Store owns the durable route-state transitions. Required.
	Store PushRouteStateStore

	// GatewayStream stores the outbound Gateway client-events stream name.
	// Required (must be non-blank).
	GatewayStream string

	// GatewayStreamMaxLen bounds GatewayStream with approximate trimming.
	// Must be positive.
	GatewayStreamMaxLen int64

	// RouteLeaseTTL stores the temporary route-lease lifetime. Must be
	// positive.
	RouteLeaseTTL time.Duration

	// RouteBackoffMin stores the minimum retry backoff. Must be positive and
	// no greater than RouteBackoffMax.
	RouteBackoffMin time.Duration

	// RouteBackoffMax stores the maximum retry backoff. Must be positive.
	RouteBackoffMax time.Duration

	// PollInterval stores how long the worker waits before the next due-route
	// scan when no progress was made. Defaults to
	// defaultPushPublisherPollInterval when non-positive.
	PollInterval time.Duration

	// BatchSize stores the maximum number of due schedule members loaded per
	// scan. Defaults to defaultPushPublisherBatchSize when non-positive.
	BatchSize int64

	// Encoder stores the push payload encoder. Defaults to
	// publishpush.Encoder when nil.
	Encoder PushEventEncoder

	// Telemetry records route publication counters. Not validated by
	// NewPushPublisher.
	Telemetry RoutePublisherTelemetry

	// Clock provides wall-clock timestamps. Defaults to the system clock
	// when nil.
	Clock Clock
}
|
||||
|
||||
// PushPublisher publishes due push routes into the Gateway client-events
// stream with retry and dead-letter handling.
type PushPublisher struct {
	store               PushRouteStateStore     // durable route-state transitions
	gatewayStream       string                  // outbound client-events stream name
	gatewayStreamMaxLen int64                   // approximate trim bound for the stream
	routeLeaseTTL       time.Duration           // lifetime of each temporary route lease
	routeBackoffMin     time.Duration           // lower retry-backoff bound
	routeBackoffMax     time.Duration           // upper retry-backoff bound
	pollInterval        time.Duration           // idle wait between due-route scans
	batchSize           int64                   // max due schedule members per scan
	encoder             PushEventEncoder        // route -> gateway event encoder
	telemetry           RoutePublisherTelemetry // publication counters
	clock               Clock                   // wall-clock source
	workerToken         string                  // identifies this worker's leases
	logger              *slog.Logger            // component-scoped structured logger
}
|
||||
|
||||
// NewPushPublisher constructs the push publication worker. It validates the
// required configuration, applies defaults for the optional fields, and
// generates a random worker token used to identify this worker's route
// leases.
func NewPushPublisher(cfg PushPublisherConfig, logger *slog.Logger) (*PushPublisher, error) {
	// Required configuration; check order determines which error wins.
	switch {
	case cfg.Store == nil:
		return nil, errors.New("new push publisher: nil store")
	case strings.TrimSpace(cfg.GatewayStream) == "":
		return nil, errors.New("new push publisher: gateway stream must not be empty")
	case cfg.GatewayStreamMaxLen <= 0:
		return nil, errors.New("new push publisher: gateway stream max len must be positive")
	case cfg.RouteLeaseTTL <= 0:
		return nil, errors.New("new push publisher: route lease ttl must be positive")
	case cfg.RouteBackoffMin <= 0:
		return nil, errors.New("new push publisher: route backoff min must be positive")
	case cfg.RouteBackoffMax <= 0:
		return nil, errors.New("new push publisher: route backoff max must be positive")
	case cfg.RouteBackoffMin > cfg.RouteBackoffMax:
		return nil, errors.New("new push publisher: route backoff min must not exceed route backoff max")
	}
	// Optional configuration falls back to safe defaults.
	if cfg.PollInterval <= 0 {
		cfg.PollInterval = defaultPushPublisherPollInterval
	}
	if cfg.BatchSize <= 0 {
		cfg.BatchSize = defaultPushPublisherBatchSize
	}
	if cfg.Clock == nil {
		cfg.Clock = systemClock{}
	}
	if cfg.Encoder == nil {
		cfg.Encoder = publishpush.Encoder{}
	}
	if logger == nil {
		logger = slog.Default()
	}

	// Random token distinguishing this worker's leases from other workers'.
	workerToken, err := newWorkerToken()
	if err != nil {
		return nil, fmt.Errorf("new push publisher: %w", err)
	}

	return &PushPublisher{
		store:               cfg.Store,
		gatewayStream:       cfg.GatewayStream,
		gatewayStreamMaxLen: cfg.GatewayStreamMaxLen,
		routeLeaseTTL:       cfg.RouteLeaseTTL,
		routeBackoffMin:     cfg.RouteBackoffMin,
		routeBackoffMax:     cfg.RouteBackoffMax,
		pollInterval:        cfg.PollInterval,
		batchSize:           cfg.BatchSize,
		encoder:             cfg.Encoder,
		telemetry:           cfg.Telemetry,
		clock:               cfg.Clock,
		workerToken:         workerToken,
		logger:              logger.With("component", "push_publisher", "stream", cfg.GatewayStream),
	}, nil
}
|
||||
|
||||
// Run starts the push publication loop and blocks until ctx is canceled or an
// unexpected publication error occurs.
//
// Each iteration scans for due routes: when a scan makes progress the next
// scan happens immediately to drain the backlog; otherwise the loop sleeps
// for the configured poll interval.
func (publisher *PushPublisher) Run(ctx context.Context) error {
	if ctx == nil {
		return errors.New("run push publisher: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	if publisher == nil {
		return errors.New("run push publisher: nil publisher")
	}

	publisher.logger.Info("push publisher started",
		"poll_interval", publisher.pollInterval.String(),
		"batch_size", publisher.batchSize,
	)

	for {
		progress, err := publisher.publishDueRoutes(ctx)
		switch {
		case err == nil && progress:
			// Work was done; rescan immediately.
			continue
		case err == nil:
			// Idle scan; wait out the poll interval (or stop when the wait is
			// interrupted by cancellation).
			if waitErr := waitWithContext(ctx, publisher.pollInterval); waitErr != nil {
				publisher.logger.Info("push publisher stopped")
				return waitErr
			}
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)):
			// Graceful stop requested by the caller.
			publisher.logger.Info("push publisher stopped")
			return ctx.Err()
		default:
			return fmt.Errorf("run push publisher: %w", err)
		}
	}
}
|
||||
|
||||
// Shutdown stops the push publisher within ctx. The worker relies on context
|
||||
// cancellation and a bounded polling interval, so it has no dedicated
|
||||
// resources to release here.
|
||||
func (publisher *PushPublisher) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown push publisher: nil context")
|
||||
}
|
||||
if publisher == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// publishDueRoutes performs one due-route scan. It returns true when at
// least one push route was actually processed, signaling the caller to scan
// again immediately instead of sleeping for the poll interval.
func (publisher *PushPublisher) publishDueRoutes(ctx context.Context) (bool, error) {
	now := publisher.now()

	dueRoutes, err := publisher.store.ListDueRoutes(ctx, now, publisher.batchSize)
	if err != nil {
		return false, err
	}

	progress := false
	for _, dueRoute := range dueRoutes {
		// Only push routes (IDs prefixed "push:") are handled by this worker;
		// routes for other channels are skipped.
		if !strings.HasPrefix(dueRoute.RouteID, "push:") {
			continue
		}

		processed, err := publisher.publishRoute(ctx, now, dueRoute)
		if err != nil {
			return progress, err
		}
		// Skipped routes (lease not acquired, not push, not due) do not
		// count as progress.
		progress = progress || processed
	}

	return progress, nil
}
|
||||
|
||||
// publishRoute attempts to publish one due scheduled route to the gateway
// stream. It returns (true, nil) when the route reached a terminal outcome
// for this attempt (published, retried, or dead-lettered), (false, nil) when
// the route was skipped (lease held elsewhere, wrong channel/status, not yet
// due, or a concurrent state change), and a non-nil error only for
// unrecoverable store failures.
func (publisher *PushPublisher) publishRoute(ctx context.Context, now time.Time, dueRoute redisstate.ScheduledRoute) (bool, error) {
	// A short-lived lease ensures only one replica works on this route at a
	// time; losing the race is not an error, just no progress.
	acquired, err := publisher.store.TryAcquireRouteLease(ctx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken, publisher.routeLeaseTTL)
	if err != nil {
		return false, fmt.Errorf("acquire route lease %q: %w", dueRoute.RouteID, err)
	}
	if !acquired {
		return false, nil
	}
	defer func() {
		// Best-effort release on a fresh context so the lease is freed even
		// when ctx is already cancelled; bounded by the lease TTL.
		releaseCtx, cancel := context.WithTimeout(context.Background(), publisher.routeLeaseTTL)
		defer cancel()
		_ = publisher.store.ReleaseRouteLease(releaseCtx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken)
	}()

	// A scheduled entry without its backing records indicates state
	// corruption, hence the hard errors below.
	notification, found, err := publisher.store.GetNotification(ctx, dueRoute.NotificationID)
	if err != nil {
		return false, fmt.Errorf("load notification %q: %w", dueRoute.NotificationID, err)
	}
	if !found {
		return false, fmt.Errorf("notification %q is missing for route %q", dueRoute.NotificationID, dueRoute.RouteID)
	}

	route, found, err := publisher.store.GetRoute(ctx, dueRoute.NotificationID, dueRoute.RouteID)
	if err != nil {
		return false, fmt.Errorf("load route %q: %w", dueRoute.RouteID, err)
	}
	if !found {
		return false, fmt.Errorf("route %q is missing for notification %q", dueRoute.RouteID, dueRoute.NotificationID)
	}
	// Re-validate against the freshly loaded route: channel, publishable
	// status, and due time may all have changed since scheduling.
	if route.Channel != intentstream.ChannelPush {
		return false, nil
	}
	switch route.Status {
	case acceptintent.RouteStatusPending, acceptintent.RouteStatusFailed:
	default:
		return false, nil
	}
	if route.NextAttemptAt.After(now) {
		return false, nil
	}

	event, err := publisher.encoder.Encode(notification, route)
	if err != nil {
		// Encoding failures are recorded as route failures (retry or
		// dead-letter), not returned as worker errors.
		return publisher.recordFailure(ctx, notification, route, pushFailureClassificationPayloadEncoding, err.Error())
	}

	// CompleteRoutePublished atomically writes the stream entry and advances
	// the route state, guarded by the expected route snapshot and lease token.
	err = publisher.store.CompleteRoutePublished(ctx, redisstate.CompleteRoutePublishedInput{
		ExpectedRoute: route,
		LeaseToken:    publisher.workerToken,
		PublishedAt:   publisher.now(),
		Stream:        publisher.gatewayStream,
		StreamMaxLen:  publisher.gatewayStreamMaxLen,
		StreamValues:  eventValues(event),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "published", "")
		logArgs := logging.RouteAttrs(
			notification.NotificationID,
			notification.NotificationType,
			notification.Producer,
			notification.AudienceKind,
			notification.IdempotencyKey,
			notification.RequestID,
			notification.TraceID,
			route.RouteID,
			route.Channel,
		)
		logArgs = append(logArgs,
			"event_id", event.EventID,
			"user_id", event.UserID,
		)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		publisher.logger.Info("push route published", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		// Someone else changed the route state first; treat as a skip.
		return false, nil
	default:
		return publisher.recordFailure(ctx, notification, route, pushFailureClassificationGatewayStreamWrite, err.Error())
	}
}
|
||||
|
||||
// recordFailure records one failed publish attempt for route. When the
// attempt exhausts route.MaxAttempts the route is dead-lettered; otherwise it
// is rescheduled with exponential backoff. It returns (true, nil) when the
// state transition was applied, (false, nil) on a concurrent-update conflict,
// and an error only for unrecoverable store failures.
func (publisher *PushPublisher) recordFailure(
	ctx context.Context,
	notification acceptintent.NotificationRecord,
	route acceptintent.NotificationRoute,
	classification string,
	message string,
) (bool, error) {
	failureAt := publisher.now()
	// route.AttemptCount is the count before this attempt, so this failure is
	// attempt AttemptCount+1.
	attemptNumber := route.AttemptCount + 1
	logArgs := logging.RouteAttrs(
		notification.NotificationID,
		notification.NotificationType,
		notification.Producer,
		notification.AudienceKind,
		notification.IdempotencyKey,
		notification.RequestID,
		notification.TraceID,
		route.RouteID,
		route.Channel,
	)
	logArgs = append(logArgs,
		"failure_classification", classification,
		"failure_message", strings.TrimSpace(message),
		"attempt_number", attemptNumber,
		"max_attempts", route.MaxAttempts,
	)
	logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)

	// Terminal case: this failure consumed the last allowed attempt.
	if attemptNumber >= route.MaxAttempts {
		err := publisher.store.CompleteRouteDeadLetter(ctx, redisstate.CompleteRouteDeadLetterInput{
			ExpectedRoute:         route,
			LeaseToken:            publisher.workerToken,
			DeadLetteredAt:        failureAt,
			FailureClassification: classification,
			FailureMessage:        strings.TrimSpace(message),
		})
		switch {
		case err == nil:
			publisher.recordPublishAttempt(ctx, notification, route, "dead_letter", classification)
			publisher.recordRouteDeadLetter(ctx, notification, route, classification)
			publisher.logger.Warn("push route dead-lettered", logArgs...)
			return true, nil
		case errors.Is(err, redisstate.ErrConflict):
			// Another worker already transitioned the route; not an error.
			return false, nil
		default:
			return false, fmt.Errorf("dead-letter route %q: %w", route.RouteID, err)
		}
	}

	// Retry case: reschedule with exponential backoff, normalized to the
	// store's UTC/millisecond time granularity.
	nextAttemptAt := failureAt.Add(routeBackoffDelay(attemptNumber, publisher.routeBackoffMin, publisher.routeBackoffMax)).UTC().Truncate(time.Millisecond)
	err := publisher.store.CompleteRouteFailed(ctx, redisstate.CompleteRouteFailedInput{
		ExpectedRoute:         route,
		LeaseToken:            publisher.workerToken,
		FailedAt:              failureAt,
		NextAttemptAt:         nextAttemptAt,
		FailureClassification: classification,
		FailureMessage:        strings.TrimSpace(message),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "retry", classification)
		publisher.recordRouteRetry(ctx, notification, route)
		logArgs = append(logArgs, "next_attempt_at", nextAttemptAt)
		publisher.logger.Warn("push route failed and was rescheduled", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		return false, nil
	default:
		return false, fmt.Errorf("reschedule route %q: %w", route.RouteID, err)
	}
}
|
||||
|
||||
func eventValues(event publishpush.Event) map[string]any {
|
||||
values := map[string]any{
|
||||
"user_id": event.UserID,
|
||||
"event_type": event.EventType,
|
||||
"event_id": event.EventID,
|
||||
"payload_bytes": append([]byte(nil), event.PayloadBytes...),
|
||||
}
|
||||
if event.RequestID != "" {
|
||||
values["request_id"] = event.RequestID
|
||||
}
|
||||
if event.TraceID != "" {
|
||||
values["trace_id"] = event.TraceID
|
||||
}
|
||||
|
||||
return values
|
||||
}
|
||||
|
||||
func routeBackoffDelay(attemptNumber int, minBackoff time.Duration, maxBackoff time.Duration) time.Duration {
|
||||
delay := minBackoff
|
||||
for step := 1; step < attemptNumber; step++ {
|
||||
if delay >= maxBackoff/2 {
|
||||
return maxBackoff
|
||||
}
|
||||
delay *= 2
|
||||
}
|
||||
if delay < minBackoff {
|
||||
return minBackoff
|
||||
}
|
||||
if delay > maxBackoff {
|
||||
return maxBackoff
|
||||
}
|
||||
|
||||
return delay
|
||||
}
|
||||
|
||||
func waitWithContext(ctx context.Context, delay time.Duration) error {
|
||||
timer := time.NewTimer(delay)
|
||||
defer timer.Stop()
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-timer.C:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func newWorkerToken() (string, error) {
|
||||
buffer := make([]byte, 16)
|
||||
if _, err := rand.Read(buffer); err != nil {
|
||||
return "", fmt.Errorf("generate worker token: %w", err)
|
||||
}
|
||||
|
||||
return hex.EncodeToString(buffer), nil
|
||||
}
|
||||
|
||||
func (publisher *PushPublisher) now() time.Time {
|
||||
return publisher.clock.Now().UTC().Truncate(time.Millisecond)
|
||||
}
|
||||
|
||||
func (publisher *PushPublisher) recordPublishAttempt(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, result string, classification string) {
|
||||
if publisher == nil || publisher.telemetry == nil {
|
||||
return
|
||||
}
|
||||
|
||||
publisher.telemetry.RecordRoutePublishAttempt(ctx, string(route.Channel), string(notification.NotificationType), result, classification)
|
||||
}
|
||||
|
||||
func (publisher *PushPublisher) recordRouteRetry(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute) {
|
||||
if publisher == nil || publisher.telemetry == nil {
|
||||
return
|
||||
}
|
||||
|
||||
publisher.telemetry.RecordRouteRetry(ctx, string(route.Channel), string(notification.NotificationType))
|
||||
}
|
||||
|
||||
func (publisher *PushPublisher) recordRouteDeadLetter(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, classification string) {
|
||||
if publisher == nil || publisher.telemetry == nil {
|
||||
return
|
||||
}
|
||||
|
||||
publisher.telemetry.RecordRouteDeadLetter(ctx, string(route.Channel), string(notification.NotificationType), classification)
|
||||
}
|
||||
@@ -0,0 +1,318 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
redisstate "galaxy/notification/internal/adapters/redisstate"
|
||||
"galaxy/notification/internal/api/intentstream"
|
||||
"galaxy/notification/internal/service/acceptintent"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestPushPublisherPublishesDuePushRouteAndLeavesEmailRoutePending(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
fixture := newPushPublisherFixture(t)
|
||||
require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validPushAcceptanceInput(fixture.now)))
|
||||
|
||||
running := runPushPublisher(t, fixture.publisher)
|
||||
defer running.stop(t)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
|
||||
return err == nil && found && route.Status == acceptintent.RouteStatusPublished
|
||||
}, time.Second, 10*time.Millisecond)
|
||||
|
||||
emailRoute, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
|
||||
require.NoError(t, err)
|
||||
require.True(t, found)
|
||||
require.Equal(t, acceptintent.RouteStatusPending, emailRoute.Status)
|
||||
|
||||
messages, err := fixture.client.XRange(context.Background(), fixture.gatewayStream, "-", "+").Result()
|
||||
require.NoError(t, err)
|
||||
require.Len(t, messages, 1)
|
||||
require.Equal(t, "user-1", messages[0].Values["user_id"])
|
||||
require.Equal(t, "game.turn.ready", messages[0].Values["event_type"])
|
||||
require.Equal(t, "1775121700000-0/push:user:user-1", messages[0].Values["event_id"])
|
||||
require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "published", ""))
|
||||
}
|
||||
|
||||
// TestPushPublisherRetriesGatewayStreamPublicationFailures verifies the retry
// path: a gateway-stream write failure marks the route failed with one
// attempt recorded, and once the obstruction is removed the next poll
// publishes the route on its second attempt.
func TestPushPublisherRetriesGatewayStreamPublicationFailures(t *testing.T) {
	t.Parallel()

	fixture := newPushPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validPushAcceptanceInput(fixture.now)))
	// Occupy the stream key with a plain string so the stream write fails
	// (WRONGTYPE) until the key is deleted below.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.gatewayStream, "wrong-type", 0).Err())

	running := runPushPublisher(t, fixture.publisher)
	defer running.stop(t)

	// First attempt fails and is rescheduled: status failed, attempt count 1.
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusFailed && route.AttemptCount == 1
	}, time.Second, 10*time.Millisecond)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "retry", pushFailureClassificationGatewayStreamWrite))
	require.True(t, fixture.telemetry.hasRouteRetry("push"))

	// Remove the conflicting key so the retry can create the stream.
	require.NoError(t, fixture.client.Del(context.Background(), fixture.gatewayStream).Err())

	// Fixture backoff is a constant 20ms, so the second attempt lands well
	// within the 2s window and publishes successfully.
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished && route.AttemptCount == 2
	}, 2*time.Second, 10*time.Millisecond)

	// Exactly one stream entry: the failed first attempt wrote nothing.
	messages, err := fixture.client.XRange(context.Background(), fixture.gatewayStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "published", ""))
}
|
||||
|
||||
// TestPushPublisherDeadLettersExhaustedRoute verifies that a route whose next
// failure reaches MaxAttempts is dead-lettered with the gateway-stream-write
// classification instead of being rescheduled, and that the dead-letter
// payload is persisted and telemetry recorded.
func TestPushPublisherDeadLettersExhaustedRoute(t *testing.T) {
	t.Parallel()

	fixture := newPushPublisherFixture(t)
	input := validPushAcceptanceInput(fixture.now)
	// Pre-load the push route at attempt 2 of 3 so a single failure exhausts
	// its attempt budget.
	for index := range input.Routes {
		if input.Routes[index].RouteID == "push:user:user-1" {
			input.Routes[index].AttemptCount = 2
			input.Routes[index].MaxAttempts = 3
		}
	}
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), input))
	// Occupy the stream key with a plain string so the stream write fails.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.gatewayStream, "wrong-type", 0).Err())

	running := runPushPublisher(t, fixture.publisher)
	defer running.stop(t)

	// The exhausting failure transitions the route to dead-letter at attempt 3.
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusDeadLetter && route.AttemptCount == 3
	}, time.Second, 10*time.Millisecond)

	// The persisted dead-letter payload carries the failure classification.
	deadLetterPayload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.DeadLetter("1775121700000-0", "push:user:user-1")).Bytes()
	require.NoError(t, err)
	deadLetter, err := redisstate.UnmarshalDeadLetter(deadLetterPayload)
	require.NoError(t, err)
	require.Equal(t, pushFailureClassificationGatewayStreamWrite, deadLetter.FailureClassification)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "dead_letter", pushFailureClassificationGatewayStreamWrite))
	require.True(t, fixture.telemetry.hasRouteDeadLetter("push", pushFailureClassificationGatewayStreamWrite))
}
|
||||
|
||||
// TestPushPublisherLeasePreventsDuplicatePublicationAcrossReplicas runs two
// publisher replicas against the same store and asserts the route lease
// ensures the route is published to the gateway stream exactly once.
func TestPushPublisherLeasePreventsDuplicatePublicationAcrossReplicas(t *testing.T) {
	t.Parallel()

	fixture := newPushPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validPushAcceptanceInput(fixture.now)))

	// Second replica: same store and stream, its own worker token and clock.
	// (No Telemetry here — only the fixture publisher's telemetry is asserted.)
	otherPublisher, err := NewPushPublisher(PushPublisherConfig{
		Store:               fixture.store,
		GatewayStream:       fixture.gatewayStream,
		GatewayStreamMaxLen: 1024,
		RouteLeaseTTL:       200 * time.Millisecond,
		RouteBackoffMin:     20 * time.Millisecond,
		RouteBackoffMax:     20 * time.Millisecond,
		PollInterval:        10 * time.Millisecond,
		BatchSize:           16,
		Clock:               newSteppingClock(fixture.now, time.Millisecond),
	}, testWorkerLogger())
	require.NoError(t, err)

	// Run both replicas concurrently so they race for the route lease.
	first := runPushPublisher(t, fixture.publisher)
	defer first.stop(t)
	second := runPushPublisher(t, otherPublisher)
	defer second.stop(t)

	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished
	}, time.Second, 10*time.Millisecond)

	// Exactly one entry proves the losing replica never published.
	messages, err := fixture.client.XRange(context.Background(), fixture.gatewayStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
}
|
||||
|
||||
// pushPublisherFixture bundles the collaborators shared by the push publisher
// tests: a miniredis-backed client/store pair, the publisher under test, and
// the recording telemetry sink.
type pushPublisherFixture struct {
	client        *redis.Client               // direct Redis access for stream/key assertions
	store         *redisstate.AcceptanceStore // durable route state backing the publisher
	publisher     *PushPublisher              // worker under test
	gatewayStream string                      // stream name the publisher writes events to
	now           time.Time                   // fixed base instant fed to the stepping clock
	clock         *steppingClock              // deterministic clock wired into the publisher
	telemetry     *recordingWorkerTelemetry   // captures telemetry emitted by the worker
}
|
||||
|
||||
// newPushPublisherFixture provisions a miniredis server, an acceptance store,
// a deterministic stepping clock, a recording telemetry sink, and a push
// publisher wired to all of them. The server and client are torn down via
// test cleanups.
func newPushPublisherFixture(t *testing.T) pushPublisherFixture {
	t.Helper()

	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{
		Addr:            server.Addr(),
		Protocol:        2, // NOTE(review): RESP2 — presumably for miniredis compatibility; confirm
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		assert.NoError(t, client.Close())
	})

	store, err := redisstate.NewAcceptanceStore(client, redisstate.AcceptanceConfig{
		RecordTTL:      24 * time.Hour,
		DeadLetterTTL:  72 * time.Hour,
		IdempotencyTTL: 7 * 24 * time.Hour,
	})
	require.NoError(t, err)

	// Fixed base instant; the stepping clock advances 1ms per Now call so
	// timestamps are strictly increasing yet fully deterministic.
	now := time.UnixMilli(1775121700000).UTC()
	clock := newSteppingClock(now, time.Millisecond)
	telemetry := &recordingWorkerTelemetry{}
	publisher, err := NewPushPublisher(PushPublisherConfig{
		Store:               store,
		GatewayStream:       "gateway:client-events",
		GatewayStreamMaxLen: 1024,
		RouteLeaseTTL:       200 * time.Millisecond,
		RouteBackoffMin:     20 * time.Millisecond,
		RouteBackoffMax:     20 * time.Millisecond, // min == max: constant 20ms backoff for fast tests
		PollInterval:        10 * time.Millisecond,
		BatchSize:           16,
		Telemetry:           telemetry,
		Clock:               clock,
	}, testWorkerLogger())
	require.NoError(t, err)

	return pushPublisherFixture{
		client:        client,
		store:         store,
		publisher:     publisher,
		gatewayStream: "gateway:client-events",
		now:           now,
		clock:         clock,
		telemetry:     telemetry,
	}
}
|
||||
|
||||
// validPushAcceptanceInput builds a minimal accepted notification for one
// recipient with one push route and one email route, plus the matching
// idempotency record. The fixed notification ID "1775121700000-0" is relied
// on by the test assertions.
func validPushAcceptanceInput(now time.Time) acceptintent.CreateAcceptanceInput {
	return acceptintent.CreateAcceptanceInput{
		Notification: acceptintent.NotificationRecord{
			NotificationID:     "1775121700000-0",
			NotificationType:   intentstream.NotificationTypeGameTurnReady,
			Producer:           intentstream.ProducerGameMaster,
			AudienceKind:       intentstream.AudienceKindUser,
			RecipientUserIDs:   []string{"user-1"},
			PayloadJSON:        `{"game_id":"game-123","game_name":"Nebula Clash","turn_number":54}`,
			IdempotencyKey:     "game-123:turn-54",
			RequestFingerprint: "sha256:deadbeef",
			RequestID:          "request-1",
			TraceID:            "trace-1",
			OccurredAt:         now,
			AcceptedAt:         now,
			UpdatedAt:          now,
		},
		Routes: []acceptintent.NotificationRoute{
			// Push route: the one the push publisher should process.
			{
				NotificationID: "1775121700000-0",
				RouteID:        "push:user:user-1",
				Channel:        intentstream.ChannelPush,
				RecipientRef:   "user:user-1",
				Status:         acceptintent.RouteStatusPending,
				AttemptCount:   0,
				MaxAttempts:    3,
				NextAttemptAt:  now,
				ResolvedEmail:  "pilot@example.com", // NOTE(review): resolved email on a push route appears unused by the push path — confirm
				ResolvedLocale: "en",
				CreatedAt:      now,
				UpdatedAt:      now,
			},
			// Email route: must stay pending, proving channel filtering works.
			{
				NotificationID: "1775121700000-0",
				RouteID:        "email:user:user-1",
				Channel:        intentstream.ChannelEmail,
				RecipientRef:   "user:user-1",
				Status:         acceptintent.RouteStatusPending,
				AttemptCount:   0,
				MaxAttempts:    7,
				NextAttemptAt:  now,
				ResolvedEmail:  "pilot@example.com",
				ResolvedLocale: "en",
				CreatedAt:      now,
				UpdatedAt:      now,
			},
		},
		Idempotency: acceptintent.IdempotencyRecord{
			Producer:           intentstream.ProducerGameMaster,
			IdempotencyKey:     "game-123:turn-54",
			NotificationID:     "1775121700000-0",
			RequestFingerprint: "sha256:deadbeef",
			CreatedAt:          now,
			ExpiresAt:          now.Add(7 * 24 * time.Hour),
		},
	}
}
|
||||
|
||||
// runningPushPublisher holds the handles needed to stop a publisher started
// by runPushPublisher and to observe its Run result.
type runningPushPublisher struct {
	cancel   context.CancelFunc // cancels the context driving Run
	resultCh chan error         // receives the single Run return value
}
|
||||
|
||||
func runPushPublisher(t *testing.T, publisher *PushPublisher) runningPushPublisher {
|
||||
t.Helper()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
resultCh := make(chan error, 1)
|
||||
go func() {
|
||||
resultCh <- publisher.Run(ctx)
|
||||
}()
|
||||
|
||||
return runningPushPublisher{
|
||||
cancel: cancel,
|
||||
resultCh: resultCh,
|
||||
}
|
||||
}
|
||||
|
||||
func (r runningPushPublisher) stop(t *testing.T) {
|
||||
t.Helper()
|
||||
|
||||
r.cancel()
|
||||
|
||||
select {
|
||||
case err := <-r.resultCh:
|
||||
require.ErrorIs(t, err, context.Canceled)
|
||||
case <-time.After(time.Second):
|
||||
require.FailNow(t, "push publisher did not stop")
|
||||
}
|
||||
}
|
||||
|
||||
// steppingClock is a deterministic clock whose Now advances by a fixed step
// on every call, giving tests strictly increasing, reproducible timestamps.
type steppingClock struct {
	mu      sync.Mutex    // guards current
	current time.Time     // next instant Now will return
	step    time.Duration // amount Now advances per call
}
|
||||
|
||||
func newSteppingClock(start time.Time, step time.Duration) *steppingClock {
|
||||
return &steppingClock{
|
||||
current: start.UTC().Truncate(time.Millisecond),
|
||||
step: step,
|
||||
}
|
||||
}
|
||||
|
||||
func (clock *steppingClock) Now() time.Time {
|
||||
clock.mu.Lock()
|
||||
defer clock.mu.Unlock()
|
||||
|
||||
now := clock.current
|
||||
clock.current = clock.current.Add(clock.step).UTC().Truncate(time.Millisecond)
|
||||
|
||||
return now
|
||||
}
|
||||
|
||||
// testWorkerLogger returns a logger that discards all output, keeping worker
// test logs quiet.
func testWorkerLogger() *slog.Logger {
	handler := slog.NewTextHandler(io.Discard, nil)

	return slog.New(handler)
}
|
||||
@@ -0,0 +1,184 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// recordingWorkerTelemetry is a thread-safe in-memory telemetry sink for
// worker tests: each Record* method appends a sample and each has* helper
// scans for a matching one.
type recordingWorkerTelemetry struct {
	mu sync.Mutex // guards every slice below

	intentOutcomes       []intentOutcomeTelemetryRecord   // from RecordIntentOutcome
	malformedIntents     []malformedIntentTelemetryRecord // from RecordMalformedIntent
	userEnrichment       []userEnrichmentTelemetryRecord  // from RecordUserEnrichmentAttempt
	routePublishAttempts []routePublishTelemetryRecord    // from RecordRoutePublishAttempt
	routeRetries         []routeTelemetryRecord           // from RecordRouteRetry
	routeDeadLetters     []routeDeadLetterTelemetryRecord // from RecordRouteDeadLetter
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) RecordIntentOutcome(_ context.Context, notificationType string, producer string, audienceKind string, outcome string) {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
telemetry.intentOutcomes = append(telemetry.intentOutcomes, intentOutcomeTelemetryRecord{
|
||||
notificationType: notificationType,
|
||||
producer: producer,
|
||||
audienceKind: audienceKind,
|
||||
outcome: outcome,
|
||||
})
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) RecordMalformedIntent(_ context.Context, failureCode string, notificationType string, producer string) {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
telemetry.malformedIntents = append(telemetry.malformedIntents, malformedIntentTelemetryRecord{
|
||||
failureCode: failureCode,
|
||||
notificationType: notificationType,
|
||||
producer: producer,
|
||||
})
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) RecordUserEnrichmentAttempt(_ context.Context, notificationType string, result string) {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
telemetry.userEnrichment = append(telemetry.userEnrichment, userEnrichmentTelemetryRecord{
|
||||
notificationType: notificationType,
|
||||
result: result,
|
||||
})
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) RecordRoutePublishAttempt(_ context.Context, channel string, notificationType string, result string, failureClassification string) {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
telemetry.routePublishAttempts = append(telemetry.routePublishAttempts, routePublishTelemetryRecord{
|
||||
channel: channel,
|
||||
notificationType: notificationType,
|
||||
result: result,
|
||||
failureClassification: failureClassification,
|
||||
})
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) RecordRouteRetry(_ context.Context, channel string, notificationType string) {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
telemetry.routeRetries = append(telemetry.routeRetries, routeTelemetryRecord{
|
||||
channel: channel,
|
||||
notificationType: notificationType,
|
||||
})
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) RecordRouteDeadLetter(_ context.Context, channel string, notificationType string, failureClassification string) {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
telemetry.routeDeadLetters = append(telemetry.routeDeadLetters, routeDeadLetterTelemetryRecord{
|
||||
channel: channel,
|
||||
notificationType: notificationType,
|
||||
failureClassification: failureClassification,
|
||||
})
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) hasIntentOutcome(outcome string) bool {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
for _, record := range telemetry.intentOutcomes {
|
||||
if record.outcome == outcome {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) hasMalformedIntent(failureCode string) bool {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
for _, record := range telemetry.malformedIntents {
|
||||
if record.failureCode == failureCode {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) hasRoutePublishAttempt(channel string, result string, failureClassification string) bool {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
for _, record := range telemetry.routePublishAttempts {
|
||||
if record.channel == channel && record.result == result && record.failureClassification == failureClassification {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) hasRouteRetry(channel string) bool {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
for _, record := range telemetry.routeRetries {
|
||||
if record.channel == channel {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (telemetry *recordingWorkerTelemetry) hasRouteDeadLetter(channel string, failureClassification string) bool {
|
||||
telemetry.mu.Lock()
|
||||
defer telemetry.mu.Unlock()
|
||||
|
||||
for _, record := range telemetry.routeDeadLetters {
|
||||
if record.channel == channel && record.failureClassification == failureClassification {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// intentOutcomeTelemetryRecord is one sample captured by RecordIntentOutcome.
type intentOutcomeTelemetryRecord struct {
	notificationType string
	producer         string
	audienceKind     string
	outcome          string
}

// malformedIntentTelemetryRecord is one sample captured by
// RecordMalformedIntent.
type malformedIntentTelemetryRecord struct {
	failureCode      string
	notificationType string
	producer         string
}

// userEnrichmentTelemetryRecord is one sample captured by
// RecordUserEnrichmentAttempt.
type userEnrichmentTelemetryRecord struct {
	notificationType string
	result           string
}

// routePublishTelemetryRecord is one sample captured by
// RecordRoutePublishAttempt.
type routePublishTelemetryRecord struct {
	channel               string
	notificationType      string
	result                string
	failureClassification string
}

// routeTelemetryRecord is one sample captured by RecordRouteRetry.
type routeTelemetryRecord struct {
	channel          string
	notificationType string
}

// routeDeadLetterTelemetryRecord is one sample captured by
// RecordRouteDeadLetter.
type routeDeadLetterTelemetryRecord struct {
	channel               string
	notificationType      string
	failureClassification string
}
|
||||
Reference in New Issue
Block a user