feat: notification service

This commit is contained in:
Ilia Denisov
2026-04-22 08:49:45 +02:00
committed by GitHub
parent 5b7593e6f6
commit 32dc29359a
135 changed files with 21828 additions and 130 deletions
+3
View File
@@ -0,0 +1,3 @@
// Package worker provides the long-lived background components used by the
// runnable Notification Service process.
package worker
@@ -0,0 +1,421 @@
package worker
import (
"context"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/notification/internal/adapters/redisstate"
"galaxy/notification/internal/api/intentstream"
"galaxy/notification/internal/logging"
"galaxy/notification/internal/service/acceptintent"
"galaxy/notification/internal/service/publishmail"
)
// Failure classifications recorded on failed or dead-lettered email routes
// and exported as low-cardinality telemetry labels.
const (
	// emailFailureClassificationPayloadEncoding marks routes whose
	// notification could not be encoded into a Mail Service command.
	emailFailureClassificationPayloadEncoding = "payload_encoding_failed"
	// emailFailureClassificationMailStreamWrite marks routes whose encoded
	// command could not be appended to the mail delivery stream.
	emailFailureClassificationMailStreamWrite = "mail_stream_publish_failed"
)
// EmailRouteStateStore describes the durable route-state operations required
// by EmailPublisher. It is declared at the consumer so the Redis-backed store
// in production and fakes in tests can both satisfy it.
type EmailRouteStateStore interface {
	// ListDueRoutes loads up to the given number of scheduled routes that are
	// due at or before the given time.
	ListDueRoutes(context.Context, time.Time, int64) ([]redisstate.ScheduledRoute, error)
	// TryAcquireRouteLease attempts to acquire one temporary route lease for
	// (notificationID, routeID) under a worker token with the given TTL. It
	// reports false when the lease is already held elsewhere.
	TryAcquireRouteLease(context.Context, string, string, string, time.Duration) (bool, error)
	// ReleaseRouteLease best-effort releases one temporary route lease.
	ReleaseRouteLease(context.Context, string, string, string) error
	// GetNotification loads one accepted notification; the bool reports
	// whether the record exists.
	GetNotification(context.Context, string) (acceptintent.NotificationRecord, bool, error)
	// GetRoute loads one accepted notification route; the bool reports
	// whether the route exists.
	GetRoute(context.Context, string, string) (acceptintent.NotificationRoute, bool, error)
	// CompleteRoutePublished records one successful publication.
	CompleteRoutePublished(context.Context, redisstate.CompleteRoutePublishedInput) error
	// CompleteRouteFailed records one retryable publication failure.
	CompleteRouteFailed(context.Context, redisstate.CompleteRouteFailedInput) error
	// CompleteRouteDeadLetter records one exhausted publication failure.
	CompleteRouteDeadLetter(context.Context, redisstate.CompleteRouteDeadLetterInput) error
}
// EmailCommandEncoder encodes one email-capable notification route into a
// Mail Service-compatible generic command.
type EmailCommandEncoder interface {
	// Encode converts one notification plus its route into one outbound
	// command; an error here is treated as a non-transient encoding failure.
	Encode(acceptintent.NotificationRecord, acceptintent.NotificationRoute) (publishmail.Command, error)
}
// EmailPublisherConfig stores the dependencies and policies used by
// EmailPublisher. Store, MailDeliveryCommandsStream, and the lease/backoff
// durations are required; the remaining fields receive defaults in
// NewEmailPublisher.
type EmailPublisherConfig struct {
	// Store owns the durable route-state transitions. Required.
	Store EmailRouteStateStore
	// MailDeliveryCommandsStream stores the outbound Mail Service command
	// stream name. Required, must be non-blank.
	MailDeliveryCommandsStream string
	// RouteLeaseTTL stores the temporary route-lease lifetime. Required,
	// must be positive.
	RouteLeaseTTL time.Duration
	// RouteBackoffMin stores the minimum retry backoff. Required, must be
	// positive and not exceed RouteBackoffMax.
	RouteBackoffMin time.Duration
	// RouteBackoffMax stores the maximum retry backoff. Required, must be
	// positive.
	RouteBackoffMax time.Duration
	// PollInterval stores how long the worker waits before the next due-route
	// scan when no progress was made. Defaults when zero or negative.
	PollInterval time.Duration
	// BatchSize stores the maximum number of due schedule members loaded per
	// scan. Defaults when zero or negative.
	BatchSize int64
	// Encoder stores the email command encoder. Defaults to
	// publishmail.Encoder when nil.
	Encoder EmailCommandEncoder
	// Telemetry records route publication counters. Optional; nil disables
	// telemetry.
	Telemetry RoutePublisherTelemetry
	// Clock provides wall-clock timestamps. Defaults to the system clock
	// when nil.
	Clock Clock
}
// EmailPublisher publishes due email routes into the Mail Service command
// stream with retry and dead-letter handling. Construct it with
// NewEmailPublisher; the zero value is not usable.
type EmailPublisher struct {
	store                      EmailRouteStateStore
	mailDeliveryCommandsStream string
	routeLeaseTTL              time.Duration
	routeBackoffMin            time.Duration
	routeBackoffMax            time.Duration
	pollInterval               time.Duration
	batchSize                  int64
	encoder                    EmailCommandEncoder
	telemetry                  RoutePublisherTelemetry
	clock                      Clock
	// workerToken identifies this replica when acquiring and releasing
	// route leases.
	workerToken string
	logger      *slog.Logger
}
// NewEmailPublisher validates cfg, applies defaults for the optional fields,
// and constructs the email publication worker. A nil logger falls back to
// slog.Default(). It returns an error when any required field is missing or
// when the backoff bounds are inconsistent.
func NewEmailPublisher(cfg EmailPublisherConfig, logger *slog.Logger) (*EmailPublisher, error) {
	// Required-field validation; each guard mirrors one config invariant.
	if cfg.Store == nil {
		return nil, errors.New("new email publisher: nil store")
	}
	if strings.TrimSpace(cfg.MailDeliveryCommandsStream) == "" {
		return nil, errors.New("new email publisher: mail delivery-commands stream must not be empty")
	}
	if cfg.RouteLeaseTTL <= 0 {
		return nil, errors.New("new email publisher: route lease ttl must be positive")
	}
	if cfg.RouteBackoffMin <= 0 {
		return nil, errors.New("new email publisher: route backoff min must be positive")
	}
	if cfg.RouteBackoffMax <= 0 {
		return nil, errors.New("new email publisher: route backoff max must be positive")
	}
	if cfg.RouteBackoffMin > cfg.RouteBackoffMax {
		return nil, errors.New("new email publisher: route backoff min must not exceed route backoff max")
	}

	// Optional fields: resolve defaults into locals instead of mutating cfg.
	pollInterval := cfg.PollInterval
	if pollInterval <= 0 {
		pollInterval = defaultPushPublisherPollInterval
	}
	batchSize := cfg.BatchSize
	if batchSize <= 0 {
		batchSize = defaultPushPublisherBatchSize
	}
	clock := cfg.Clock
	if clock == nil {
		clock = systemClock{}
	}
	encoder := cfg.Encoder
	if encoder == nil {
		encoder = publishmail.Encoder{}
	}
	if logger == nil {
		logger = slog.Default()
	}

	// Each replica gets its own lease token so it can identify its leases.
	token, err := newWorkerToken()
	if err != nil {
		return nil, fmt.Errorf("new email publisher: %w", err)
	}

	return &EmailPublisher{
		store:                      cfg.Store,
		mailDeliveryCommandsStream: cfg.MailDeliveryCommandsStream,
		routeLeaseTTL:              cfg.RouteLeaseTTL,
		routeBackoffMin:            cfg.RouteBackoffMin,
		routeBackoffMax:            cfg.RouteBackoffMax,
		pollInterval:               pollInterval,
		batchSize:                  batchSize,
		encoder:                    encoder,
		telemetry:                  cfg.Telemetry,
		clock:                      clock,
		workerToken:                token,
		logger:                     logger.With("component", "email_publisher", "stream", cfg.MailDeliveryCommandsStream),
	}, nil
}
// Run starts the email publication loop and blocks until ctx is canceled or
// an unexpected publication error occurs.
//
// Each iteration scans for due routes; when a scan made progress the loop
// continues immediately to drain the backlog, otherwise it waits for the poll
// interval. Cancellation-shaped errors observed while ctx is already done are
// treated as a clean stop.
func (publisher *EmailPublisher) Run(ctx context.Context) error {
	if ctx == nil {
		return errors.New("run email publisher: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	if publisher == nil {
		return errors.New("run email publisher: nil publisher")
	}
	publisher.logger.Info("email publisher started",
		"poll_interval", publisher.pollInterval.String(),
		"batch_size", publisher.batchSize,
	)
	for {
		progress, err := publisher.publishDueRoutes(ctx)
		switch {
		case err == nil && progress:
			// Work was done; rescan immediately rather than sleeping.
			continue
		case err == nil:
			// Idle scan: sleep for one poll interval or until canceled.
			if waitErr := waitWithContext(ctx, publisher.pollInterval); waitErr != nil {
				publisher.logger.Info("email publisher stopped")
				return waitErr
			}
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)):
			// The store surfaced our own cancellation; stop cleanly.
			publisher.logger.Info("email publisher stopped")
			return ctx.Err()
		default:
			return fmt.Errorf("run email publisher: %w", err)
		}
	}
}
// Shutdown stops the email publisher within ctx. The worker terminates via
// context cancellation plus a bounded polling interval, so there are no
// dedicated resources to release; this method only validates its arguments.
func (publisher *EmailPublisher) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown email publisher: nil context")
	}
	// A nil receiver has nothing to stop; both paths report success.
	return nil
}
// publishDueRoutes performs one due-route scan and attempts to publish every
// due email route found. It reports whether at least one route made progress
// during the scan.
func (publisher *EmailPublisher) publishDueRoutes(ctx context.Context) (bool, error) {
	scanTime := publisher.now()
	dueRoutes, err := publisher.store.ListDueRoutes(ctx, scanTime, publisher.batchSize)
	if err != nil {
		return false, err
	}
	madeProgress := false
	for _, candidate := range dueRoutes {
		// Routes for other channels are owned by their own workers.
		if !strings.HasPrefix(candidate.RouteID, "email:") {
			continue
		}
		handled, handleErr := publisher.publishRoute(ctx, scanTime, candidate)
		if handleErr != nil {
			return madeProgress, handleErr
		}
		if handled {
			madeProgress = true
		}
	}
	return madeProgress, nil
}
// publishRoute attempts to publish one due route. It reports true when this
// attempt reached a terminal outcome (published, rescheduled, or
// dead-lettered) and false when the route was skipped: lease contention,
// missing or non-email route, ineligible status, not yet due, or a concurrent
// state conflict.
func (publisher *EmailPublisher) publishRoute(ctx context.Context, now time.Time, dueRoute redisstate.ScheduledRoute) (bool, error) {
	// Serialize work on this route across replicas via a temporary lease.
	acquired, err := publisher.store.TryAcquireRouteLease(ctx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken, publisher.routeLeaseTTL)
	if err != nil {
		return false, fmt.Errorf("acquire route lease %q: %w", dueRoute.RouteID, err)
	}
	if !acquired {
		// Another replica holds the lease and will handle this route.
		return false, nil
	}
	defer func() {
		// Best-effort release on a fresh background context so the lease is
		// still freed even when ctx was already canceled.
		releaseCtx, cancel := context.WithTimeout(context.Background(), publisher.routeLeaseTTL)
		defer cancel()
		_ = publisher.store.ReleaseRouteLease(releaseCtx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken)
	}()
	notification, found, err := publisher.store.GetNotification(ctx, dueRoute.NotificationID)
	if err != nil {
		return false, fmt.Errorf("load notification %q: %w", dueRoute.NotificationID, err)
	}
	if !found {
		return false, fmt.Errorf("notification %q is missing for route %q", dueRoute.NotificationID, dueRoute.RouteID)
	}
	route, found, err := publisher.store.GetRoute(ctx, dueRoute.NotificationID, dueRoute.RouteID)
	if err != nil {
		return false, fmt.Errorf("load route %q: %w", dueRoute.RouteID, err)
	}
	if !found {
		return false, fmt.Errorf("route %q is missing for notification %q", dueRoute.RouteID, dueRoute.NotificationID)
	}
	// The schedule member prefix was already checked; re-verify against the
	// stored route so a mislabeled member cannot be published as email.
	if route.Channel != intentstream.ChannelEmail {
		return false, nil
	}
	// Only routes awaiting a first attempt or a retry are eligible.
	switch route.Status {
	case acceptintent.RouteStatusPending, acceptintent.RouteStatusFailed:
	default:
		return false, nil
	}
	if route.NextAttemptAt.After(now) {
		return false, nil
	}
	command, err := publisher.encoder.Encode(notification, route)
	if err != nil {
		// Encoding failures are classified and routed to retry/dead-letter.
		return publisher.recordFailure(ctx, notification, route, emailFailureClassificationPayloadEncoding, err.Error())
	}
	// The store both appends the command to the mail stream and records the
	// published state; the input carries the stream name and field values.
	err = publisher.store.CompleteRoutePublished(ctx, redisstate.CompleteRoutePublishedInput{
		ExpectedRoute: route,
		LeaseToken:    publisher.workerToken,
		PublishedAt:   publisher.now(),
		Stream:        publisher.mailDeliveryCommandsStream,
		StreamMaxLen:  0,
		StreamValues:  command.Values(),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "published", "")
		logArgs := logging.RouteAttrs(
			notification.NotificationID,
			notification.NotificationType,
			notification.Producer,
			notification.AudienceKind,
			notification.IdempotencyKey,
			notification.RequestID,
			notification.TraceID,
			route.RouteID,
			route.Channel,
		)
		logArgs = append(logArgs,
			"delivery_id", command.DeliveryID,
			"resolved_email", route.ResolvedEmail,
		)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		publisher.logger.Info("email route published", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		// The route state changed under us (another actor won); not an error.
		return false, nil
	default:
		return publisher.recordFailure(ctx, notification, route, emailFailureClassificationMailStreamWrite, err.Error())
	}
}
// recordFailure records one publication failure for route: routes whose next
// attempt number reaches MaxAttempts are dead-lettered, all others are
// rescheduled with a bounded backoff. It reports true when the state
// transition was recorded and false on a concurrent state conflict.
func (publisher *EmailPublisher) recordFailure(
	ctx context.Context,
	notification acceptintent.NotificationRecord,
	route acceptintent.NotificationRoute,
	classification string,
	message string,
) (bool, error) {
	failureAt := publisher.now()
	// This failure is attempt AttemptCount+1; the store increments the
	// persisted counter as part of the transition.
	attemptNumber := route.AttemptCount + 1
	logArgs := logging.RouteAttrs(
		notification.NotificationID,
		notification.NotificationType,
		notification.Producer,
		notification.AudienceKind,
		notification.IdempotencyKey,
		notification.RequestID,
		notification.TraceID,
		route.RouteID,
		route.Channel,
	)
	logArgs = append(logArgs,
		"resolved_email", route.ResolvedEmail,
		"failure_classification", classification,
		"failure_message", strings.TrimSpace(message),
		"attempt_number", attemptNumber,
		"max_attempts", route.MaxAttempts,
	)
	logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
	if attemptNumber >= route.MaxAttempts {
		// Attempt budget exhausted: move the route to the dead-letter state.
		err := publisher.store.CompleteRouteDeadLetter(ctx, redisstate.CompleteRouteDeadLetterInput{
			ExpectedRoute:         route,
			LeaseToken:            publisher.workerToken,
			DeadLetteredAt:        failureAt,
			FailureClassification: classification,
			FailureMessage:        strings.TrimSpace(message),
		})
		switch {
		case err == nil:
			publisher.recordPublishAttempt(ctx, notification, route, "dead_letter", classification)
			publisher.recordRouteDeadLetter(ctx, notification, route, classification)
			publisher.logger.Warn("email route dead-lettered", logArgs...)
			return true, nil
		case errors.Is(err, redisstate.ErrConflict):
			// Another actor changed the route state first; skip quietly.
			return false, nil
		default:
			return false, fmt.Errorf("dead-letter route %q: %w", route.RouteID, err)
		}
	}
	// Retryable failure: reschedule with backoff, normalized to the same
	// UTC/millisecond granularity used by now().
	nextAttemptAt := failureAt.Add(routeBackoffDelay(attemptNumber, publisher.routeBackoffMin, publisher.routeBackoffMax)).UTC().Truncate(time.Millisecond)
	err := publisher.store.CompleteRouteFailed(ctx, redisstate.CompleteRouteFailedInput{
		ExpectedRoute:         route,
		LeaseToken:            publisher.workerToken,
		FailedAt:              failureAt,
		NextAttemptAt:         nextAttemptAt,
		FailureClassification: classification,
		FailureMessage:        strings.TrimSpace(message),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "retry", classification)
		publisher.recordRouteRetry(ctx, notification, route)
		logArgs = append(logArgs, "next_attempt_at", nextAttemptAt)
		publisher.logger.Warn("email route failed and was rescheduled", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		return false, nil
	default:
		return false, fmt.Errorf("reschedule route %q: %w", route.RouteID, err)
	}
}
// now returns the configured clock's current time normalized to UTC at
// millisecond precision, the granularity used throughout the route state.
func (publisher *EmailPublisher) now() time.Time {
	current := publisher.clock.Now()
	return current.Truncate(time.Millisecond).UTC()
}
// recordPublishAttempt emits one publish-attempt counter labeled with the
// channel, notification type, result, and failure classification. It is a
// no-op when telemetry is not configured.
func (publisher *EmailPublisher) recordPublishAttempt(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, result string, classification string) {
	if publisher == nil {
		return
	}
	if sink := publisher.telemetry; sink != nil {
		sink.RecordRoutePublishAttempt(ctx, string(route.Channel), string(notification.NotificationType), result, classification)
	}
}
// recordRouteRetry emits one route-retry counter labeled with the channel
// and notification type. It is a no-op when telemetry is not configured.
func (publisher *EmailPublisher) recordRouteRetry(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute) {
	if publisher == nil {
		return
	}
	if sink := publisher.telemetry; sink != nil {
		sink.RecordRouteRetry(ctx, string(route.Channel), string(notification.NotificationType))
	}
}
// recordRouteDeadLetter emits one dead-letter counter labeled with the
// channel, notification type, and failure classification. It is a no-op when
// telemetry is not configured.
func (publisher *EmailPublisher) recordRouteDeadLetter(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, classification string) {
	if publisher == nil {
		return
	}
	if sink := publisher.telemetry; sink != nil {
		sink.RecordRouteDeadLetter(ctx, string(route.Channel), string(notification.NotificationType), classification)
	}
}
@@ -0,0 +1,232 @@
package worker
import (
"context"
"testing"
"time"
redisstate "galaxy/notification/internal/adapters/redisstate"
"galaxy/notification/internal/service/acceptintent"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/require"
)
// TestEmailPublisherPublishesDueEmailRouteAndLeavesPushRoutePending verifies
// that the worker publishes only the email route of a mixed-channel
// acceptance: the email route reaches the published state and exactly one
// command lands on the mail stream, while the sibling push route is left
// pending for its own worker.
func TestEmailPublisherPublishesDueEmailRouteAndLeavesPushRoutePending(t *testing.T) {
	t.Parallel()
	fixture := newEmailPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 0)))
	running := runEmailPublisher(t, fixture.publisher)
	defer running.stop(t)
	// Wait for the email route to transition to published.
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished
	}, time.Second, 10*time.Millisecond)
	// The push route of the same notification must be untouched.
	pushRoute, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, acceptintent.RouteStatusPending, pushRoute.Status)
	// Exactly one command with the expected envelope fields was published.
	messages, err := fixture.client.XRange(context.Background(), fixture.mailStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
	require.Equal(t, "1775121700000-0/email:user:user-1", messages[0].Values["delivery_id"])
	require.Equal(t, "notification", messages[0].Values["source"])
	require.Equal(t, "template", messages[0].Values["payload_mode"])
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "published", ""))
}
// TestEmailPublisherRetriesMailStreamPublicationFailures verifies the retry
// path: with the mail stream key blocked by a wrong-typed value the first
// attempt fails and is rescheduled, and after the obstacle is removed the
// second attempt publishes exactly one command.
func TestEmailPublisherRetriesMailStreamPublicationFailures(t *testing.T) {
	t.Parallel()
	fixture := newEmailPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 0)))
	// Occupying the stream key with a plain string makes XADD fail.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.mailStream, "wrong-type", 0).Err())
	running := runEmailPublisher(t, fixture.publisher)
	defer running.stop(t)
	// First attempt fails and the route is marked for retry.
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusFailed && route.AttemptCount == 1
	}, time.Second, 10*time.Millisecond)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "retry", emailFailureClassificationMailStreamWrite))
	require.True(t, fixture.telemetry.hasRouteRetry("email"))
	// Unblock the stream key; the rescheduled attempt should now succeed.
	require.NoError(t, fixture.client.Del(context.Background(), fixture.mailStream).Err())
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished && route.AttemptCount == 2
	}, 2*time.Second, 10*time.Millisecond)
	messages, err := fixture.client.XRange(context.Background(), fixture.mailStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "published", ""))
}
// TestEmailPublisherLeasePreventsDuplicatePublicationAcrossReplicas runs two
// publisher replicas against the same store and asserts that the route lease
// ensures the due email route is published exactly once.
func TestEmailPublisherLeasePreventsDuplicatePublicationAcrossReplicas(t *testing.T) {
	t.Parallel()
	fixture := newEmailPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 0)))
	// Second replica shares the store and stream but has its own worker token.
	otherPublisher, err := NewEmailPublisher(EmailPublisherConfig{
		Store:                      fixture.store,
		MailDeliveryCommandsStream: fixture.mailStream,
		RouteLeaseTTL:              200 * time.Millisecond,
		RouteBackoffMin:            20 * time.Millisecond,
		RouteBackoffMax:            20 * time.Millisecond,
		PollInterval:               10 * time.Millisecond,
		BatchSize:                  16,
		Clock:                      newSteppingClock(fixture.now, time.Millisecond),
	}, testWorkerLogger())
	require.NoError(t, err)
	first := runEmailPublisher(t, fixture.publisher)
	defer first.stop(t)
	second := runEmailPublisher(t, otherPublisher)
	defer second.stop(t)
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished
	}, time.Second, 10*time.Millisecond)
	// Despite two replicas racing, only one command may reach the stream.
	messages, err := fixture.client.XRange(context.Background(), fixture.mailStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
}
// TestEmailPublisherDeadLettersExhaustedRoute verifies that a route entering
// its final allowed attempt (6 of 7 already used) with a persistently failing
// mail stream is dead-lettered and that the dead-letter record carries the
// mail-stream-write classification.
func TestEmailPublisherDeadLettersExhaustedRoute(t *testing.T) {
	t.Parallel()
	fixture := newEmailPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validEmailAcceptanceInput(fixture.now, 6)))
	// Block the stream key so every publish attempt fails.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.mailStream, "wrong-type", 0).Err())
	running := runEmailPublisher(t, fixture.publisher)
	defer running.stop(t)
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusDeadLetter && route.AttemptCount == 7
	}, time.Second, 10*time.Millisecond)
	// The persisted dead-letter payload must record the classification.
	deadLetterPayload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.DeadLetter("1775121700000-0", "email:user:user-1")).Bytes()
	require.NoError(t, err)
	deadLetter, err := redisstate.UnmarshalDeadLetter(deadLetterPayload)
	require.NoError(t, err)
	require.Equal(t, emailFailureClassificationMailStreamWrite, deadLetter.FailureClassification)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("email", "dead_letter", emailFailureClassificationMailStreamWrite))
	require.True(t, fixture.telemetry.hasRouteDeadLetter("email", emailFailureClassificationMailStreamWrite))
}
// emailPublisherFixture bundles the miniredis-backed Redis client, the
// acceptance store, the publisher under test, and the recording telemetry
// sink shared by the email publisher tests.
type emailPublisherFixture struct {
	client     *redis.Client
	store      *redisstate.AcceptanceStore
	publisher  *EmailPublisher
	// mailStream is the outbound command stream name the fixture publishes to.
	mailStream string
	// now is the fixed base time matching the stream ids used in assertions.
	now   time.Time
	clock *steppingClock
	telemetry *recordingWorkerTelemetry
}
// newEmailPublisherFixture builds a miniredis-backed acceptance store and a
// fast-polling email publisher wired to a recording telemetry sink and a
// deterministic stepping clock. All resources are cleaned up with the test.
func newEmailPublisherFixture(t *testing.T) emailPublisherFixture {
	t.Helper()
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{
		Addr:            server.Addr(),
		Protocol:        2,
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		require.NoError(t, client.Close())
	})
	store, err := redisstate.NewAcceptanceStore(client, redisstate.AcceptanceConfig{
		RecordTTL:      24 * time.Hour,
		DeadLetterTTL:  72 * time.Hour,
		IdempotencyTTL: 7 * 24 * time.Hour,
	})
	require.NoError(t, err)
	// Fixed epoch matching the "1775121700000-0" stream ids used by tests.
	now := time.UnixMilli(1775121700000).UTC()
	clock := newSteppingClock(now, time.Millisecond)
	telemetry := &recordingWorkerTelemetry{}
	// Short lease/backoff/poll values keep the async assertions fast.
	publisher, err := NewEmailPublisher(EmailPublisherConfig{
		Store:                      store,
		MailDeliveryCommandsStream: "mail:delivery_commands",
		RouteLeaseTTL:              200 * time.Millisecond,
		RouteBackoffMin:            20 * time.Millisecond,
		RouteBackoffMax:            20 * time.Millisecond,
		PollInterval:               10 * time.Millisecond,
		BatchSize:                  16,
		Telemetry:                  telemetry,
		Clock:                      clock,
	}, testWorkerLogger())
	require.NoError(t, err)
	return emailPublisherFixture{
		client:     client,
		store:      store,
		publisher:  publisher,
		mailStream: "mail:delivery_commands",
		now:        now,
		clock:      clock,
		telemetry:  telemetry,
	}
}
// validEmailAcceptanceInput returns the shared valid acceptance fixture with
// the email route's attempt counter preset to emailAttemptCount and its
// attempt budget fixed at 7; all other routes are left unchanged.
func validEmailAcceptanceInput(now time.Time, emailAttemptCount int) acceptintent.CreateAcceptanceInput {
	input := validPushAcceptanceInput(now)
	for index, route := range input.Routes {
		if route.RouteID == "email:user:user-1" {
			input.Routes[index].AttemptCount = emailAttemptCount
			input.Routes[index].MaxAttempts = 7
		}
	}
	return input
}
// runningEmailPublisher holds the cancellation handle and result channel of
// one background Run invocation started by runEmailPublisher.
type runningEmailPublisher struct {
	cancel   context.CancelFunc
	// resultCh receives the single error returned by Run.
	resultCh chan error
}
// runEmailPublisher starts publisher.Run in a goroutine and returns a handle
// used to cancel it and wait for its result via stop.
func runEmailPublisher(t *testing.T, publisher *EmailPublisher) runningEmailPublisher {
	t.Helper()
	ctx, cancel := context.WithCancel(context.Background())
	// Buffered so the goroutine can exit even if stop is never called.
	resultCh := make(chan error, 1)
	go func() {
		resultCh <- publisher.Run(ctx)
	}()
	return runningEmailPublisher{
		cancel:   cancel,
		resultCh: resultCh,
	}
}
// stop cancels the running publisher and asserts it terminates within one
// second with a context.Canceled result.
func (r runningEmailPublisher) stop(t *testing.T) {
	t.Helper()
	r.cancel()
	select {
	case err := <-r.resultCh:
		require.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		require.FailNow(t, "email publisher did not stop")
	}
}
@@ -0,0 +1,331 @@
package worker
import (
"context"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/notification/internal/api/intentstream"
"galaxy/notification/internal/logging"
"galaxy/notification/internal/service/acceptintent"
"galaxy/notification/internal/service/malformedintent"
"github.com/redis/go-redis/v9"
)
// AcceptIntentUseCase accepts one normalized notification intent. It is the
// consumer-side seam over the acceptance use case.
type AcceptIntentUseCase interface {
	// Execute durably accepts one normalized notification intent and returns
	// the acceptance result.
	Execute(context.Context, acceptintent.AcceptInput) (acceptintent.Result, error)
}
// MalformedIntentRecorder stores one operator-visible malformed-intent record.
type MalformedIntentRecorder interface {
	// Record persists the entry idempotently, keyed by its stream entry id.
	Record(context.Context, malformedintent.Entry) error
}
// StreamOffsetStore stores the last durably processed entry id of one plain
// XREAD consumer, allowing it to resume after a restart.
type StreamOffsetStore interface {
	// Load returns the last processed entry id for the stream; the bool
	// reports whether an offset is stored at all.
	Load(context.Context, string) (string, bool, error)
	// Save stores the last processed entry id for the stream.
	Save(context.Context, string, string) error
}
// IntentConsumerTelemetry records low-cardinality stream-consumer events.
type IntentConsumerTelemetry interface {
	// RecordMalformedIntent records one malformed or rejected notification
	// intent, labeled with failure code, notification type, and producer.
	RecordMalformedIntent(context.Context, string, string, string)
}
// Clock provides the current wall-clock time. Workers depend on this seam so
// tests can substitute a deterministic clock.
type Clock interface {
	// Now returns the current time.
	Now() time.Time
}
type systemClock struct{}
func (systemClock) Now() time.Time {
return time.Now()
}
// IntentConsumerConfig stores the dependencies used by IntentConsumer.
// Client, Stream, BlockTimeout, Acceptor, MalformedRecorder, and OffsetStore
// are required; Telemetry and Clock are optional.
type IntentConsumerConfig struct {
	// Client stores the Redis client used for XREAD. Required.
	Client *redis.Client
	// Stream stores the Redis Stream name to consume. Required, non-blank.
	Stream string
	// BlockTimeout stores the blocking XREAD timeout. Required, positive.
	BlockTimeout time.Duration
	// Acceptor durably accepts valid notification intents. Required.
	Acceptor AcceptIntentUseCase
	// MalformedRecorder persists operator-visible malformed-intent entries.
	// Required.
	MalformedRecorder MalformedIntentRecorder
	// OffsetStore stores the last durably processed stream entry id.
	// Required.
	OffsetStore StreamOffsetStore
	// Telemetry records malformed-intent counters. Optional; nil disables.
	Telemetry IntentConsumerTelemetry
	// Clock provides wall-clock timestamps for malformed-intent records.
	// Defaults to the system clock when nil.
	Clock Clock
}
// IntentConsumer stores the Redis Streams consumer used for notification
// intent intake. Construct it with NewIntentConsumer; the zero value is not
// usable.
type IntentConsumer struct {
	client            *redis.Client
	stream            string
	blockTimeout      time.Duration
	acceptor          AcceptIntentUseCase
	malformedRecorder MalformedIntentRecorder
	offsetStore       StreamOffsetStore
	telemetry         IntentConsumerTelemetry
	clock             Clock
	logger            *slog.Logger
}
// NewIntentConsumer validates cfg, applies defaults for the optional fields,
// and constructs the notification-intent consumer. A nil logger falls back
// to slog.Default().
func NewIntentConsumer(cfg IntentConsumerConfig, logger *slog.Logger) (*IntentConsumer, error) {
	// Required-field validation; each guard mirrors one config invariant.
	if cfg.Client == nil {
		return nil, errors.New("new intent consumer: nil redis client")
	}
	if strings.TrimSpace(cfg.Stream) == "" {
		return nil, errors.New("new intent consumer: stream must not be empty")
	}
	if cfg.BlockTimeout <= 0 {
		return nil, errors.New("new intent consumer: block timeout must be positive")
	}
	if cfg.Acceptor == nil {
		return nil, errors.New("new intent consumer: nil acceptor")
	}
	if cfg.MalformedRecorder == nil {
		return nil, errors.New("new intent consumer: nil malformed recorder")
	}
	if cfg.OffsetStore == nil {
		return nil, errors.New("new intent consumer: nil offset store")
	}
	// Optional dependencies fall back to production defaults.
	clock := cfg.Clock
	if clock == nil {
		clock = systemClock{}
	}
	if logger == nil {
		logger = slog.Default()
	}
	return &IntentConsumer{
		client:            cfg.Client,
		stream:            cfg.Stream,
		blockTimeout:      cfg.BlockTimeout,
		acceptor:          cfg.Acceptor,
		malformedRecorder: cfg.MalformedRecorder,
		offsetStore:       cfg.OffsetStore,
		telemetry:         cfg.Telemetry,
		clock:             clock,
		logger:            logger.With("component", "intent_consumer", "stream", cfg.Stream),
	}, nil
}
// Run starts the intent consumer and blocks until ctx is canceled or Redis
// returns an unexpected error.
//
// The consumer is a single plain-XREAD reader: it resumes from the last
// durably saved entry id ("0-0" on first start, i.e. from the beginning of
// the stream), processes entries one at a time, and advances the stored
// offset only after an entry was fully handled — so a crash between handling
// and saving re-delivers the entry (at-least-once).
func (consumer *IntentConsumer) Run(ctx context.Context) error {
	if ctx == nil {
		return errors.New("run intent consumer: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	if consumer == nil || consumer.client == nil {
		return errors.New("run intent consumer: nil consumer")
	}
	lastID, found, err := consumer.offsetStore.Load(ctx, consumer.stream)
	if err != nil {
		return fmt.Errorf("run intent consumer: load stream offset: %w", err)
	}
	if !found {
		// "0-0" makes XREAD deliver the stream from its first entry.
		lastID = "0-0"
	}
	consumer.logger.Info("intent consumer started", "block_timeout", consumer.blockTimeout.String(), "start_entry_id", lastID)
	for {
		streams, err := consumer.client.XRead(ctx, &redis.XReadArgs{
			Streams: []string{consumer.stream, lastID},
			Count:   1,
			Block:   consumer.blockTimeout,
		}).Result()
		switch {
		case err == nil:
			for _, stream := range streams {
				for _, message := range stream.Messages {
					if err := consumer.handleMessage(ctx, message); err != nil {
						return err
					}
					// Save the offset only after successful handling so a
					// crash between the two steps re-delivers the entry.
					if err := consumer.offsetStore.Save(ctx, consumer.stream, message.ID); err != nil {
						return fmt.Errorf("run intent consumer: save stream offset: %w", err)
					}
					lastID = message.ID
				}
			}
		case errors.Is(err, redis.Nil):
			// The blocking XREAD timed out with no new entries; poll again.
			continue
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, redis.ErrClosed)):
			consumer.logger.Info("intent consumer stopped")
			return ctx.Err()
		default:
			// Includes cancellation-shaped errors seen while ctx is still
			// live: those indicate an unexpected client failure, not a stop.
			// (A previously separate case for them duplicated this branch.)
			return fmt.Errorf("run intent consumer: %w", err)
		}
	}
}
// handleMessage processes one stream entry: it decodes the intent, hands it
// to the acceptor, and converts decode failures plus the known business
// rejections into malformed-intent records (returning nil so the offset
// advances). Only infrastructure-level acceptor errors are returned, which
// stops the Run loop without advancing the offset.
func (consumer *IntentConsumer) handleMessage(ctx context.Context, message redis.XMessage) error {
	// Copy the raw fields so later mutation of message.Values cannot leak
	// into the malformed-intent record.
	rawFields := cloneRawFields(message.Values)
	intent, err := intentstream.DecodeIntent(rawFields)
	if err != nil {
		return consumer.recordMalformed(
			ctx,
			message.ID,
			rawFields,
			intentstream.ClassifyDecodeError(err),
			err,
		)
	}
	// The stream entry id doubles as the notification id.
	result, err := consumer.acceptor.Execute(ctx, acceptintent.AcceptInput{
		NotificationID: message.ID,
		Intent:         intent,
	})
	switch {
	case err == nil:
		logArgs := []any{
			"stream_entry_id", message.ID,
			"notification_id", message.ID,
		}
		logArgs = append(logArgs, logging.IntentAttrs(intent)...)
		logArgs = append(logArgs,
			"outcome", string(result.Outcome),
		)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		consumer.logger.Info("notification intent handled", logArgs...)
		return nil
	case errors.Is(err, acceptintent.ErrConflict):
		return consumer.recordMalformed(ctx, message.ID, rawFields, malformedintent.FailureCodeIdempotencyConflict, err)
	case errors.Is(err, acceptintent.ErrRecipientNotFound):
		return consumer.recordMalformed(ctx, message.ID, rawFields, malformedintent.FailureCodeRecipientNotFound, err)
	case errors.Is(err, acceptintent.ErrServiceUnavailable):
		// NOTE(review): currently identical to the default branch —
		// presumably kept separate for a future retry policy; confirm.
		return fmt.Errorf("handle intent %q: %w", message.ID, err)
	default:
		return fmt.Errorf("handle intent %q: %w", message.ID, err)
	}
}
// recordMalformed persists one operator-visible malformed-intent entry for
// the given stream entry, emits the telemetry counter, and logs a warning.
// It returns nil on success so the caller advances past the bad entry, and
// an error only when the record itself could not be stored.
func (consumer *IntentConsumer) recordMalformed(
	ctx context.Context,
	streamEntryID string,
	rawFields map[string]any,
	failureCode malformedintent.FailureCode,
	cause error,
) error {
	entry := malformedintent.Entry{
		StreamEntryID: streamEntryID,
		// Best-effort identity fields pulled from the raw payload; they may
		// be empty when the entry is too malformed to carry them.
		NotificationType: optionalRawString(rawFields, "notification_type"),
		Producer:         optionalRawString(rawFields, "producer"),
		IdempotencyKey:   optionalRawString(rawFields, "idempotency_key"),
		FailureCode:      failureCode,
		FailureMessage:   strings.TrimSpace(cause.Error()),
		// Clone again so the stored record cannot alias the caller's map.
		RawFields:  cloneRawFields(rawFields),
		RecordedAt: consumer.clock.Now().UTC().Truncate(time.Millisecond),
	}
	if err := consumer.malformedRecorder.Record(ctx, entry); err != nil {
		return fmt.Errorf("record malformed intent %q: %w", streamEntryID, err)
	}
	if consumer.telemetry != nil {
		consumer.telemetry.RecordMalformedIntent(ctx, string(failureCode), entry.NotificationType, entry.Producer)
	}
	logArgs := []any{
		"stream_entry_id", streamEntryID,
		"notification_type", entry.NotificationType,
		"producer", entry.Producer,
		"idempotency_key", entry.IdempotencyKey,
		"failure_code", string(entry.FailureCode),
		"failure_message", entry.FailureMessage,
	}
	// Prefer the producer-supplied trace id when present in the payload.
	if traceID := optionalRawString(rawFields, "trace_id"); traceID != "" {
		logArgs = append(logArgs, "trace_id", traceID)
	}
	logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
	consumer.logger.Warn("notification intent rejected", logArgs...)
	return nil
}
// cloneRawFields returns a deep copy of values. A nil input yields an empty,
// non-nil map so callers can store the result without nil checks.
func cloneRawFields(values map[string]any) map[string]any {
	cloned := make(map[string]any, len(values))
	for key, value := range values {
		cloned[key] = cloneRawValue(value)
	}
	return cloned
}

// cloneRawValue deep-copies nested maps and slices; every other value is
// returned as-is (scalars need no copying).
func cloneRawValue(value any) any {
	switch nested := value.(type) {
	case map[string]any:
		return cloneRawFields(nested)
	case []any:
		copied := make([]any, len(nested))
		for index := range nested {
			copied[index] = cloneRawValue(nested[index])
		}
		return copied
	}
	return value
}
// optionalRawString returns values[key] rendered as a string when the value
// is a string or a []byte; any other type — including a missing key — yields
// the empty string.
func optionalRawString(values map[string]any, key string) string {
	// A missing key produces nil, which falls through to the empty default.
	switch typed := values[key].(type) {
	case string:
		return typed
	case []byte:
		return string(typed)
	}
	return ""
}
// Shutdown stops the intent consumer within ctx. Run terminates via context
// cancellation plus a bounded XREAD block timeout, so there are no dedicated
// resources to release; this method only validates its arguments.
func (consumer *IntentConsumer) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown intent consumer: nil context")
	}
	// A nil receiver has nothing to stop; both paths report success.
	return nil
}
@@ -0,0 +1,422 @@
package worker
import (
"context"
"errors"
"io"
"log/slog"
"testing"
"time"
redisstate "galaxy/notification/internal/adapters/redisstate"
"galaxy/notification/internal/config"
"galaxy/notification/internal/service/acceptintent"
"galaxy/notification/internal/service/malformedintent"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestIntentConsumerStartsFromZeroOffsetWhenNoStoredOffsetExists verifies
// that a consumer with no persisted stream offset reads the stream from the
// beginning and accepts the first valid intent.
func TestIntentConsumerStartsFromZeroOffsetWhenNoStoredOffsetExists(t *testing.T) {
	t.Parallel()
	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	messageID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)
	// The consumer processes asynchronously; poll until the accepted
	// notification (keyed by the stream entry ID) becomes visible.
	require.Eventually(t, func() bool {
		_, found, err := fixture.acceptanceStore.GetNotification(context.Background(), messageID)
		return err == nil && found
	}, time.Second, 10*time.Millisecond)
}
// TestIntentConsumerContinuesFromSavedOffsetAfterRestart verifies that a
// stored stream offset makes the consumer skip entries at or before the
// offset: only the entry added after the saved offset is accepted.
func TestIntentConsumerContinuesFromSavedOffsetAfterRestart(t *testing.T) {
	t.Parallel()
	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	firstID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	// Simulate a previous run that already consumed firstID.
	require.NoError(t, fixture.offsetStore.Save(context.Background(), fixture.stream, firstID))
	secondID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":55,"game_name":"Nebula Clash","game_id":"game-123"}`)
	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)
	require.Eventually(t, func() bool {
		_, found, err := fixture.acceptanceStore.GetNotification(context.Background(), secondID)
		return err == nil && found
	}, time.Second, 10*time.Millisecond)
	// The entry covered by the saved offset must never have been processed.
	_, found, err := fixture.acceptanceStore.GetNotification(context.Background(), firstID)
	require.NoError(t, err)
	require.False(t, found)
}
// TestIntentConsumerRecordsIdempotencyConflictsAndAdvancesOffset verifies
// that two intents sharing one idempotency key lead to: first accepted,
// second recorded as an idempotency-conflict malformed intent, and the
// offset still advancing past both entries.
func TestIntentConsumerRecordsIdempotencyConflictsAndAdvancesOffset(t *testing.T) {
	t.Parallel()
	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	// Both entries carry the same idempotency key ("game-123:turn-ready").
	firstID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	secondID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":55,"game_name":"Nebula Clash","game_id":"game-123"}`)
	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)
	// Wait for the conflict record of the second entry to appear.
	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(secondID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == "idempotency_conflict"
	}, time.Second, 10*time.Millisecond)
	// The offset must have advanced past the conflicting entry.
	offset, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, secondID, offset)
	// Only the first entry produced an accepted notification.
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), firstID)
	require.NoError(t, err)
	require.True(t, found)
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), secondID)
	require.NoError(t, err)
	require.False(t, found)
}
// TestIntentConsumerShutdownInterruptsBlockingRead verifies that canceling
// the run context stops the consumer promptly even while it is blocked on an
// empty stream, and that Run reports context.Canceled.
func TestIntentConsumerShutdownInterruptsBlockingRead(t *testing.T) {
	t.Parallel()
	fixture := newIntentConsumerFixture(t, stubUserDirectory{})
	ctx, cancel := context.WithCancel(context.Background())
	resultCh := make(chan error, 1)
	go func() {
		resultCh <- fixture.consumer.Run(ctx)
	}()
	// Give the consumer time to enter its blocking read before canceling.
	time.Sleep(50 * time.Millisecond)
	cancel()
	select {
	case err := <-resultCh:
		require.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		require.FailNow(t, "intent consumer did not stop after shutdown")
	}
}
// TestIntentConsumerRecordsRecipientNotFoundAndAdvancesOffset verifies that
// an intent whose recipient is unknown to the user directory is recorded as a
// recipient-not-found malformed intent, is never accepted, and still advances
// the stream offset.
func TestIntentConsumerRecordsRecipientNotFoundAndAdvancesOffset(t *testing.T) {
	t.Parallel()
	// Empty directory: every lookup yields acceptintent.ErrRecipientNotFound.
	fixture := newIntentConsumerFixture(t, stubUserDirectory{})
	messageID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)
	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(messageID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == malformedintent.FailureCodeRecipientNotFound
	}, time.Second, 10*time.Millisecond)
	// Offset advances past the rejected entry so it is not reprocessed.
	offset, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, messageID, offset)
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), messageID)
	require.NoError(t, err)
	require.False(t, found)
}
// TestIntentConsumerRecordsMalformedIntentAndAdvancesOffset verifies that a
// stream entry missing its payload_json field is recorded as an
// invalid-payload malformed intent (keyed by its stream entry ID), is never
// accepted, and still advances the stream offset.
func TestIntentConsumerRecordsMalformedIntentAndAdvancesOffset(t *testing.T) {
	t.Parallel()
	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	// Add an intent by hand (instead of addValidIntent) so we can omit
	// payload_json and trigger payload validation failure.
	messageID, err := fixture.client.XAdd(context.Background(), &redis.XAddArgs{
		Stream: fixture.stream,
		Values: map[string]any{
			"notification_type":       "game.turn.ready",
			"producer":                "game_master",
			"audience_kind":           "user",
			"recipient_user_ids_json": `["user-1"]`,
			"idempotency_key":         "game-123:turn-ready",
			"occurred_at_ms":          "1775121700000",
		},
	}).Result()
	require.NoError(t, err)
	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)
	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(messageID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == malformedintent.FailureCodeInvalidPayload &&
			entry.StreamEntryID == messageID
	}, time.Second, 10*time.Millisecond)
	// The malformed entry must not block offset progress.
	offset, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, messageID, offset)
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), messageID)
	require.NoError(t, err)
	require.False(t, found)
}
// TestIntentConsumerRecordsTelemetryForOutcomesAndMalformedIntents verifies
// that processing an accepted intent, an exact duplicate, and an
// idempotency-conflicting intent emits the matching telemetry counters.
func TestIntentConsumerRecordsTelemetryForOutcomesAndMalformedIntents(t *testing.T) {
	t.Parallel()
	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		records: map[string]acceptintent.UserRecord{
			"user-1": {Email: "pilot@example.com", PreferredLanguage: "en"},
		},
	})
	// First entry: accepted. Second: identical payload → duplicate. Third:
	// same idempotency key but different payload → conflict.
	addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	conflictID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":55,"game_name":"Nebula Clash","game_id":"game-123"}`)
	running := runIntentConsumer(t, fixture.consumer)
	defer running.stop(t)
	// Wait until the conflicting entry has been fully processed.
	require.Eventually(t, func() bool {
		payload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.MalformedIntent(conflictID)).Bytes()
		if err != nil {
			return false
		}
		entry, err := redisstate.UnmarshalMalformedIntent(payload)
		if err != nil {
			return false
		}
		return entry.FailureCode == malformedintent.FailureCodeIdempotencyConflict
	}, time.Second, 10*time.Millisecond)
	require.Eventually(t, func() bool {
		return fixture.telemetry.hasIntentOutcome("accepted") &&
			fixture.telemetry.hasIntentOutcome("duplicate") &&
			fixture.telemetry.hasMalformedIntent("idempotency_conflict")
	}, time.Second, 10*time.Millisecond)
}
// TestIntentConsumerStopsWithoutAdvancingOffsetWhenUserDirectoryIsUnavailable
// verifies that an infrastructure failure (user directory error, as opposed
// to a recipient-not-found outcome) stops the consumer with that error and
// leaves the offset unsaved so the entry is retried on restart.
func TestIntentConsumerStopsWithoutAdvancingOffsetWhenUserDirectoryIsUnavailable(t *testing.T) {
	t.Parallel()
	fixture := newIntentConsumerFixture(t, stubUserDirectory{
		err: errors.New("user service unavailable"),
	})
	messageID := addValidIntent(t, fixture.client, fixture.stream, `{"turn_number":54,"game_name":"Nebula Clash","game_id":"game-123"}`)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	resultCh := make(chan error, 1)
	go func() {
		resultCh <- fixture.consumer.Run(ctx)
	}()
	// Run should terminate on its own (not via cancel) with the stub's error.
	var runErr error
	require.Eventually(t, func() bool {
		select {
		case runErr = <-resultCh:
			return true
		default:
			return false
		}
	}, time.Second, 10*time.Millisecond)
	require.Error(t, runErr)
	require.ErrorContains(t, runErr, "user service unavailable")
	// Neither offset nor acceptance state must have been written.
	_, found, err := fixture.offsetStore.Load(context.Background(), fixture.stream)
	require.NoError(t, err)
	require.False(t, found)
	_, found, err = fixture.acceptanceStore.GetNotification(context.Background(), messageID)
	require.NoError(t, err)
	require.False(t, found)
}
// intentConsumerFixture bundles the miniredis-backed stores, the consumer
// under test, and the recording telemetry shared by the intent-consumer tests.
type intentConsumerFixture struct {
	client          *redis.Client                  // client connected to the per-test miniredis
	stream          string                         // intent stream name the consumer reads
	acceptanceStore *redisstate.AcceptanceStore    // durable store for accepted notifications
	offsetStore     *redisstate.StreamOffsetStore  // persisted stream-offset store
	consumer        *IntentConsumer                // consumer under test
	telemetry       *recordingWorkerTelemetry      // captures emitted counters for assertions
}
// newIntentConsumerFixture wires a full intent-consumer stack (miniredis,
// acceptance/malformed/offset stores, acceptance service, consumer) around
// the supplied user directory. All resources are cleaned up via t.Cleanup.
func newIntentConsumerFixture(t *testing.T, userDirectory acceptintent.UserDirectory) intentConsumerFixture {
	t.Helper()
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{
		Addr:            server.Addr(),
		Protocol:        2,
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		assert.NoError(t, client.Close())
	})
	acceptanceStore, err := redisstate.NewAcceptanceStore(client, redisstate.AcceptanceConfig{
		RecordTTL:      24 * time.Hour,
		DeadLetterTTL:  72 * time.Hour,
		IdempotencyTTL: 7 * 24 * time.Hour,
	})
	require.NoError(t, err)
	malformedStore, err := redisstate.NewMalformedIntentStore(client, 72*time.Hour)
	require.NoError(t, err)
	offsetStore, err := redisstate.NewStreamOffsetStore(client)
	require.NoError(t, err)
	telemetry := &recordingWorkerTelemetry{}
	// Fixed clocks keep timestamps deterministic; the consumer clock is 1ms
	// after the service clock.
	service, err := acceptintent.New(acceptintent.Config{
		Store:           acceptanceStore,
		UserDirectory:   userDirectory,
		Clock:           fixedClock{now: time.UnixMilli(1775121700000).UTC()},
		Logger:          slog.New(slog.NewTextHandler(io.Discard, nil)),
		Telemetry:       telemetry,
		PushMaxAttempts: 3,
		EmailMaxAttempts: 7,
		IdempotencyTTL:  7 * 24 * time.Hour,
		AdminRouting:    config.AdminRoutingConfig{},
	})
	require.NoError(t, err)
	// A short block timeout keeps the blocking-read loop responsive in tests.
	consumer, err := NewIntentConsumer(IntentConsumerConfig{
		Client:            client,
		Stream:            "notification:intents",
		BlockTimeout:      25 * time.Millisecond,
		Acceptor:          service,
		MalformedRecorder: malformedStore,
		OffsetStore:       offsetStore,
		Telemetry:         telemetry,
		Clock:             fixedClock{now: time.UnixMilli(1775121700001).UTC()},
	}, slog.New(slog.NewTextHandler(io.Discard, nil)))
	require.NoError(t, err)
	return intentConsumerFixture{
		client:          client,
		stream:          "notification:intents",
		acceptanceStore: acceptanceStore,
		offsetStore:     offsetStore,
		consumer:        consumer,
		telemetry:       telemetry,
	}
}
// addValidIntent appends one well-formed game.turn.ready intent carrying the
// given payload JSON to the stream and returns its stream entry ID.
func addValidIntent(t *testing.T, client *redis.Client, stream string, payloadJSON string) string {
	t.Helper()
	fields := map[string]any{
		"notification_type":       "game.turn.ready",
		"producer":                "game_master",
		"audience_kind":           "user",
		"recipient_user_ids_json": `["user-1"]`,
		"idempotency_key":         "game-123:turn-ready",
		"occurred_at_ms":          "1775121700000",
		"payload_json":            payloadJSON,
	}
	entryID, err := client.XAdd(context.Background(), &redis.XAddArgs{
		Stream: stream,
		Values: fields,
	}).Result()
	require.NoError(t, err)
	return entryID
}
// runningIntentConsumer is a handle for a consumer started in a goroutine:
// cancel stops it and resultCh delivers the Run return value.
type runningIntentConsumer struct {
	cancel   context.CancelFunc
	resultCh chan error
}
// runIntentConsumer launches consumer.Run in a goroutine and returns a handle
// for stopping it and collecting its result.
func runIntentConsumer(t *testing.T, consumer *IntentConsumer) runningIntentConsumer {
	t.Helper()
	ctx, cancel := context.WithCancel(context.Background())
	resultCh := make(chan error, 1)
	go func() {
		resultCh <- consumer.Run(ctx)
	}()
	// Brief grace period so the consumer is actually running before the test
	// starts asserting on its behavior.
	time.Sleep(50 * time.Millisecond)
	return runningIntentConsumer{
		cancel:   cancel,
		resultCh: resultCh,
	}
}
// stop cancels the running consumer and requires a clean context.Canceled
// exit within one second.
func (r runningIntentConsumer) stop(t *testing.T) {
	t.Helper()
	r.cancel()
	select {
	case err := <-r.resultCh:
		require.ErrorIs(t, err, context.Canceled)
	case <-time.After(time.Second):
		require.FailNow(t, "intent consumer did not stop")
	}
}
type fixedClock struct {
now time.Time
}
func (clock fixedClock) Now() time.Time {
return clock.now
}
// stubUserDirectory is a canned acceptintent.UserDirectory: it fails every
// lookup when err is set, otherwise serves the records map and reports
// unknown users as recipient-not-found.
type stubUserDirectory struct {
	records map[string]acceptintent.UserRecord
	err     error
}

// GetUserByID returns the configured error, the stored record for userID, or
// acceptintent.ErrRecipientNotFound when the user is unknown.
func (directory stubUserDirectory) GetUserByID(_ context.Context, userID string) (acceptintent.UserRecord, error) {
	if directory.err != nil {
		return acceptintent.UserRecord{}, directory.err
	}
	record, known := directory.records[userID]
	if !known {
		return acceptintent.UserRecord{}, acceptintent.ErrRecipientNotFound
	}
	return record, nil
}
@@ -0,0 +1,499 @@
package worker
import (
"context"
"crypto/rand"
"encoding/hex"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/notification/internal/adapters/redisstate"
"galaxy/notification/internal/api/intentstream"
"galaxy/notification/internal/logging"
"galaxy/notification/internal/service/acceptintent"
"galaxy/notification/internal/service/publishpush"
)
const (
	// defaultPushPublisherPollInterval is the idle delay between due-route
	// scans when PushPublisherConfig.PollInterval is unset.
	defaultPushPublisherPollInterval = 100 * time.Millisecond
	// defaultPushPublisherBatchSize bounds one due-route scan when
	// PushPublisherConfig.BatchSize is unset.
	defaultPushPublisherBatchSize = 64
	// pushFailureClassificationPayloadEncoding labels failures where the
	// route could not be encoded into a Gateway event.
	pushFailureClassificationPayloadEncoding = "payload_encoding_failed"
	// pushFailureClassificationGatewayStreamWrite labels failures where the
	// write to the Gateway client-events stream did not succeed.
	pushFailureClassificationGatewayStreamWrite = "gateway_stream_publish_failed"
)
// PushRouteStateStore describes the durable route-state operations required by
// PushPublisher. Positional argument meanings below follow the publisher's
// call sites.
type PushRouteStateStore interface {
	// ListDueRoutes loads due scheduled routes: (ctx, now, limit).
	ListDueRoutes(context.Context, time.Time, int64) ([]redisstate.ScheduledRoute, error)
	// TryAcquireRouteLease attempts to acquire one temporary route lease:
	// (ctx, notificationID, routeID, leaseToken, ttl).
	TryAcquireRouteLease(context.Context, string, string, string, time.Duration) (bool, error)
	// ReleaseRouteLease best-effort releases one temporary route lease:
	// (ctx, notificationID, routeID, leaseToken).
	ReleaseRouteLease(context.Context, string, string, string) error
	// GetNotification loads one accepted notification by notification ID.
	GetNotification(context.Context, string) (acceptintent.NotificationRecord, bool, error)
	// GetRoute loads one accepted notification route:
	// (ctx, notificationID, routeID).
	GetRoute(context.Context, string, string) (acceptintent.NotificationRoute, bool, error)
	// CompleteRoutePublished records one successful publication.
	CompleteRoutePublished(context.Context, redisstate.CompleteRoutePublishedInput) error
	// CompleteRouteFailed records one retryable publication failure.
	CompleteRouteFailed(context.Context, redisstate.CompleteRouteFailedInput) error
	// CompleteRouteDeadLetter records one exhausted publication failure.
	CompleteRouteDeadLetter(context.Context, redisstate.CompleteRouteDeadLetterInput) error
}
// PushEventEncoder encodes one push-capable notification route into a
// Gateway-compatible client event.
type PushEventEncoder interface {
	// Encode converts notification plus route to one outbound event. An
	// encoding error is treated by the publisher as a route failure.
	Encode(acceptintent.NotificationRecord, acceptintent.NotificationRoute) (publishpush.Event, error)
}
// RoutePublisherTelemetry records low-cardinality route publication outcomes.
// Arguments follow the publisher's call sites: channel, notification type,
// then (where present) result and failure classification.
type RoutePublisherTelemetry interface {
	// RecordRoutePublishAttempt records one route publication attempt
	// outcome: (ctx, channel, notificationType, result, classification).
	RecordRoutePublishAttempt(context.Context, string, string, string, string)
	// RecordRouteRetry records one route retry scheduling event:
	// (ctx, channel, notificationType).
	RecordRouteRetry(context.Context, string, string)
	// RecordRouteDeadLetter records one route transition to dead_letter:
	// (ctx, channel, notificationType, classification).
	RecordRouteDeadLetter(context.Context, string, string, string)
}
// PushPublisherConfig stores the dependencies and policies used by
// PushPublisher. Store, GatewayStream, GatewayStreamMaxLen, RouteLeaseTTL,
// and the backoff bounds are mandatory; the remaining fields fall back to
// package defaults in NewPushPublisher.
type PushPublisherConfig struct {
	// Store owns the durable route-state transitions.
	Store PushRouteStateStore
	// GatewayStream stores the outbound Gateway client-events stream name.
	GatewayStream string
	// GatewayStreamMaxLen bounds GatewayStream with approximate trimming.
	GatewayStreamMaxLen int64
	// RouteLeaseTTL stores the temporary route-lease lifetime.
	RouteLeaseTTL time.Duration
	// RouteBackoffMin stores the minimum retry backoff.
	RouteBackoffMin time.Duration
	// RouteBackoffMax stores the maximum retry backoff. Must be at least
	// RouteBackoffMin.
	RouteBackoffMax time.Duration
	// PollInterval stores how long the worker waits before the next due-route
	// scan when no progress was made. Defaults when non-positive.
	PollInterval time.Duration
	// BatchSize stores the maximum number of due schedule members loaded per
	// scan. Defaults when non-positive.
	BatchSize int64
	// Encoder stores the push payload encoder. Defaults to
	// publishpush.Encoder{} when nil.
	Encoder PushEventEncoder
	// Telemetry records route publication counters. Optional; nil disables
	// telemetry.
	Telemetry RoutePublisherTelemetry
	// Clock provides wall-clock timestamps. Defaults to the system clock
	// when nil.
	Clock Clock
}
// PushPublisher publishes due push routes into the Gateway client-events
// stream with retry and dead-letter handling. Fields mirror the validated
// and defaulted values of PushPublisherConfig.
type PushPublisher struct {
	store               PushRouteStateStore
	gatewayStream       string
	gatewayStreamMaxLen int64
	routeLeaseTTL       time.Duration
	routeBackoffMin     time.Duration
	routeBackoffMax     time.Duration
	pollInterval        time.Duration
	batchSize           int64
	encoder             PushEventEncoder
	telemetry           RoutePublisherTelemetry
	clock               Clock
	// workerToken is a random per-instance lease token that distinguishes
	// this replica's route leases from other replicas'.
	workerToken string
	logger      *slog.Logger
}
// NewPushPublisher constructs the push publication worker.
//
// Mandatory settings (store, gateway stream name and bound, lease TTL, and a
// consistent backoff window) are validated up front; optional settings fall
// back to package defaults. Each publisher gets a fresh random worker token
// used for route leasing.
func NewPushPublisher(cfg PushPublisherConfig, logger *slog.Logger) (*PushPublisher, error) {
	if cfg.Store == nil {
		return nil, errors.New("new push publisher: nil store")
	}
	if strings.TrimSpace(cfg.GatewayStream) == "" {
		return nil, errors.New("new push publisher: gateway stream must not be empty")
	}
	if cfg.GatewayStreamMaxLen <= 0 {
		return nil, errors.New("new push publisher: gateway stream max len must be positive")
	}
	if cfg.RouteLeaseTTL <= 0 {
		return nil, errors.New("new push publisher: route lease ttl must be positive")
	}
	if cfg.RouteBackoffMin <= 0 {
		return nil, errors.New("new push publisher: route backoff min must be positive")
	}
	if cfg.RouteBackoffMax <= 0 {
		return nil, errors.New("new push publisher: route backoff max must be positive")
	}
	if cfg.RouteBackoffMin > cfg.RouteBackoffMax {
		return nil, errors.New("new push publisher: route backoff min must not exceed route backoff max")
	}
	pollInterval := cfg.PollInterval
	if pollInterval <= 0 {
		pollInterval = defaultPushPublisherPollInterval
	}
	batchSize := cfg.BatchSize
	if batchSize <= 0 {
		batchSize = defaultPushPublisherBatchSize
	}
	clock := cfg.Clock
	if clock == nil {
		clock = systemClock{}
	}
	encoder := cfg.Encoder
	if encoder == nil {
		encoder = publishpush.Encoder{}
	}
	if logger == nil {
		logger = slog.Default()
	}
	token, err := newWorkerToken()
	if err != nil {
		return nil, fmt.Errorf("new push publisher: %w", err)
	}
	return &PushPublisher{
		store:               cfg.Store,
		gatewayStream:       cfg.GatewayStream,
		gatewayStreamMaxLen: cfg.GatewayStreamMaxLen,
		routeLeaseTTL:       cfg.RouteLeaseTTL,
		routeBackoffMin:     cfg.RouteBackoffMin,
		routeBackoffMax:     cfg.RouteBackoffMax,
		pollInterval:        pollInterval,
		batchSize:           batchSize,
		encoder:             encoder,
		telemetry:           cfg.Telemetry,
		clock:               clock,
		workerToken:         token,
		logger:              logger.With("component", "push_publisher", "stream", cfg.GatewayStream),
	}, nil
}
// Run starts the push publication loop and blocks until ctx is canceled or an
// unexpected publication error occurs.
func (publisher *PushPublisher) Run(ctx context.Context) error {
	if ctx == nil {
		return errors.New("run push publisher: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	if publisher == nil {
		return errors.New("run push publisher: nil publisher")
	}
	publisher.logger.Info("push publisher started",
		"poll_interval", publisher.pollInterval.String(),
		"batch_size", publisher.batchSize,
	)
	for {
		progress, err := publisher.publishDueRoutes(ctx)
		switch {
		case err == nil && progress:
			// Work was done; scan again immediately in case more routes
			// became due.
			continue
		case err == nil:
			// Idle scan: sleep one poll interval, exiting early if the run
			// context ends while waiting.
			if waitErr := waitWithContext(ctx, publisher.pollInterval); waitErr != nil {
				publisher.logger.Info("push publisher stopped")
				return waitErr
			}
		case ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)):
			// The scan failed only because the run context ended; report a
			// clean shutdown instead of an operational error.
			publisher.logger.Info("push publisher stopped")
			return ctx.Err()
		default:
			return fmt.Errorf("run push publisher: %w", err)
		}
	}
}
// Shutdown stops the push publisher within ctx. The worker relies on context
// cancellation and a bounded polling interval, so it has no dedicated
// resources to release here; only the ctx argument is validated.
func (publisher *PushPublisher) Shutdown(ctx context.Context) error {
	switch {
	case ctx == nil:
		return errors.New("shutdown push publisher: nil context")
	default:
		// Nothing to release — a nil receiver is also a successful no-op.
		return nil
	}
}
// publishDueRoutes scans one batch of due scheduled routes, publishing every
// push-channel route ("push:" route-ID prefix) and skipping the rest. It
// reports whether any route made durable progress during the scan.
func (publisher *PushPublisher) publishDueRoutes(ctx context.Context) (bool, error) {
	scanAt := publisher.now()
	candidates, err := publisher.store.ListDueRoutes(ctx, scanAt, publisher.batchSize)
	if err != nil {
		return false, err
	}
	madeProgress := false
	for _, candidate := range candidates {
		// The schedule is shared across channels; this worker handles only
		// push routes.
		if !strings.HasPrefix(candidate.RouteID, "push:") {
			continue
		}
		handled, handleErr := publisher.publishRoute(ctx, scanAt, candidate)
		if handleErr != nil {
			return madeProgress, handleErr
		}
		if handled {
			madeProgress = true
		}
	}
	return madeProgress, nil
}
// publishRoute attempts to publish one due scheduled route. It returns true
// only when durable progress was made (published, rescheduled, or
// dead-lettered); lease misses, stale schedule entries, and concurrent-update
// conflicts return (false, nil) so the caller simply moves on.
func (publisher *PushPublisher) publishRoute(ctx context.Context, now time.Time, dueRoute redisstate.ScheduledRoute) (bool, error) {
	// Claim a short-lived exclusive lease so concurrent replicas do not
	// publish the same route twice.
	acquired, err := publisher.store.TryAcquireRouteLease(ctx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken, publisher.routeLeaseTTL)
	if err != nil {
		return false, fmt.Errorf("acquire route lease %q: %w", dueRoute.RouteID, err)
	}
	if !acquired {
		return false, nil
	}
	defer func() {
		// Best-effort release on a fresh context so the lease is freed even
		// when ctx has already been canceled.
		releaseCtx, cancel := context.WithTimeout(context.Background(), publisher.routeLeaseTTL)
		defer cancel()
		_ = publisher.store.ReleaseRouteLease(releaseCtx, dueRoute.NotificationID, dueRoute.RouteID, publisher.workerToken)
	}()
	notification, found, err := publisher.store.GetNotification(ctx, dueRoute.NotificationID)
	if err != nil {
		return false, fmt.Errorf("load notification %q: %w", dueRoute.NotificationID, err)
	}
	if !found {
		return false, fmt.Errorf("notification %q is missing for route %q", dueRoute.NotificationID, dueRoute.RouteID)
	}
	route, found, err := publisher.store.GetRoute(ctx, dueRoute.NotificationID, dueRoute.RouteID)
	if err != nil {
		return false, fmt.Errorf("load route %q: %w", dueRoute.RouteID, err)
	}
	if !found {
		return false, fmt.Errorf("route %q is missing for notification %q", dueRoute.RouteID, dueRoute.NotificationID)
	}
	// Skip routes that are no longer actionable: wrong channel, already in a
	// terminal/in-flight status, or rescheduled into the future.
	if route.Channel != intentstream.ChannelPush {
		return false, nil
	}
	switch route.Status {
	case acceptintent.RouteStatusPending, acceptintent.RouteStatusFailed:
	default:
		return false, nil
	}
	if route.NextAttemptAt.After(now) {
		return false, nil
	}
	event, err := publisher.encoder.Encode(notification, route)
	if err != nil {
		// Encoding failures consume an attempt like any other failure.
		return publisher.recordFailure(ctx, notification, route, pushFailureClassificationPayloadEncoding, err.Error())
	}
	// The store writes the event to the Gateway stream and flips the route
	// state in one durable transition guarded by the lease token.
	err = publisher.store.CompleteRoutePublished(ctx, redisstate.CompleteRoutePublishedInput{
		ExpectedRoute: route,
		LeaseToken:    publisher.workerToken,
		PublishedAt:   publisher.now(),
		Stream:        publisher.gatewayStream,
		StreamMaxLen:  publisher.gatewayStreamMaxLen,
		StreamValues:  eventValues(event),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "published", "")
		logArgs := logging.RouteAttrs(
			notification.NotificationID,
			notification.NotificationType,
			notification.Producer,
			notification.AudienceKind,
			notification.IdempotencyKey,
			notification.RequestID,
			notification.TraceID,
			route.RouteID,
			route.Channel,
		)
		logArgs = append(logArgs,
			"event_id", event.EventID,
			"user_id", event.UserID,
		)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		publisher.logger.Info("push route published", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		// Another actor changed the route meanwhile; treat as no progress.
		return false, nil
	default:
		return publisher.recordFailure(ctx, notification, route, pushFailureClassificationGatewayStreamWrite, err.Error())
	}
}
// recordFailure records one publication failure for the route: when the
// attempt budget is exhausted the route is dead-lettered, otherwise it is
// rescheduled with exponential backoff. Returns true when the state
// transition was applied, (false, nil) on a concurrent-update conflict.
func (publisher *PushPublisher) recordFailure(
	ctx context.Context,
	notification acceptintent.NotificationRecord,
	route acceptintent.NotificationRoute,
	classification string,
	message string,
) (bool, error) {
	failureAt := publisher.now()
	// The current failure counts as one more attempt on top of the stored
	// count.
	attemptNumber := route.AttemptCount + 1
	logArgs := logging.RouteAttrs(
		notification.NotificationID,
		notification.NotificationType,
		notification.Producer,
		notification.AudienceKind,
		notification.IdempotencyKey,
		notification.RequestID,
		notification.TraceID,
		route.RouteID,
		route.Channel,
	)
	logArgs = append(logArgs,
		"failure_classification", classification,
		"failure_message", strings.TrimSpace(message),
		"attempt_number", attemptNumber,
		"max_attempts", route.MaxAttempts,
	)
	logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
	if attemptNumber >= route.MaxAttempts {
		// Attempt budget exhausted: move the route to dead_letter.
		err := publisher.store.CompleteRouteDeadLetter(ctx, redisstate.CompleteRouteDeadLetterInput{
			ExpectedRoute:         route,
			LeaseToken:            publisher.workerToken,
			DeadLetteredAt:        failureAt,
			FailureClassification: classification,
			FailureMessage:        strings.TrimSpace(message),
		})
		switch {
		case err == nil:
			publisher.recordPublishAttempt(ctx, notification, route, "dead_letter", classification)
			publisher.recordRouteDeadLetter(ctx, notification, route, classification)
			publisher.logger.Warn("push route dead-lettered", logArgs...)
			return true, nil
		case errors.Is(err, redisstate.ErrConflict):
			// Another actor changed the route meanwhile; no progress here.
			return false, nil
		default:
			return false, fmt.Errorf("dead-letter route %q: %w", route.RouteID, err)
		}
	}
	// Retryable: reschedule with exponential backoff, normalized to UTC with
	// millisecond precision like all other stored timestamps.
	nextAttemptAt := failureAt.Add(routeBackoffDelay(attemptNumber, publisher.routeBackoffMin, publisher.routeBackoffMax)).UTC().Truncate(time.Millisecond)
	err := publisher.store.CompleteRouteFailed(ctx, redisstate.CompleteRouteFailedInput{
		ExpectedRoute:         route,
		LeaseToken:            publisher.workerToken,
		FailedAt:              failureAt,
		NextAttemptAt:         nextAttemptAt,
		FailureClassification: classification,
		FailureMessage:        strings.TrimSpace(message),
	})
	switch {
	case err == nil:
		publisher.recordPublishAttempt(ctx, notification, route, "retry", classification)
		publisher.recordRouteRetry(ctx, notification, route)
		logArgs = append(logArgs, "next_attempt_at", nextAttemptAt)
		publisher.logger.Warn("push route failed and was rescheduled", logArgs...)
		return true, nil
	case errors.Is(err, redisstate.ErrConflict):
		return false, nil
	default:
		return false, fmt.Errorf("reschedule route %q: %w", route.RouteID, err)
	}
}
// eventValues flattens event into the field map written to the Gateway
// client-events stream. The payload bytes are copied so the stream write
// cannot alias the encoder's buffer; request/trace IDs are included only when
// present.
func eventValues(event publishpush.Event) map[string]any {
	fields := map[string]any{
		"event_id":      event.EventID,
		"event_type":    event.EventType,
		"user_id":       event.UserID,
		"payload_bytes": append([]byte(nil), event.PayloadBytes...),
	}
	if requestID := event.RequestID; requestID != "" {
		fields["request_id"] = requestID
	}
	if traceID := event.TraceID; traceID != "" {
		fields["trace_id"] = traceID
	}
	return fields
}
func routeBackoffDelay(attemptNumber int, minBackoff time.Duration, maxBackoff time.Duration) time.Duration {
delay := minBackoff
for step := 1; step < attemptNumber; step++ {
if delay >= maxBackoff/2 {
return maxBackoff
}
delay *= 2
}
if delay < minBackoff {
return minBackoff
}
if delay > maxBackoff {
return maxBackoff
}
return delay
}
func waitWithContext(ctx context.Context, delay time.Duration) error {
timer := time.NewTimer(delay)
defer timer.Stop()
select {
case <-ctx.Done():
return ctx.Err()
case <-timer.C:
return nil
}
}
func newWorkerToken() (string, error) {
buffer := make([]byte, 16)
if _, err := rand.Read(buffer); err != nil {
return "", fmt.Errorf("generate worker token: %w", err)
}
return hex.EncodeToString(buffer), nil
}
// now returns the injected clock's current time normalized to UTC with
// millisecond precision, the granularity used for all stored timestamps.
func (publisher *PushPublisher) now() time.Time {
	return publisher.clock.Now().UTC().Truncate(time.Millisecond)
}
// recordPublishAttempt emits one publish-attempt counter; a no-op when
// telemetry is not configured.
func (publisher *PushPublisher) recordPublishAttempt(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, result string, classification string) {
	if publisher == nil || publisher.telemetry == nil {
		return
	}
	publisher.telemetry.RecordRoutePublishAttempt(ctx, string(route.Channel), string(notification.NotificationType), result, classification)
}
// recordRouteRetry emits one retry-scheduled counter; a no-op when telemetry
// is not configured.
func (publisher *PushPublisher) recordRouteRetry(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute) {
	if publisher == nil || publisher.telemetry == nil {
		return
	}
	publisher.telemetry.RecordRouteRetry(ctx, string(route.Channel), string(notification.NotificationType))
}
// recordRouteDeadLetter emits one dead-letter counter; a no-op when telemetry
// is not configured.
func (publisher *PushPublisher) recordRouteDeadLetter(ctx context.Context, notification acceptintent.NotificationRecord, route acceptintent.NotificationRoute, classification string) {
	if publisher == nil || publisher.telemetry == nil {
		return
	}
	publisher.telemetry.RecordRouteDeadLetter(ctx, string(route.Channel), string(notification.NotificationType), classification)
}
@@ -0,0 +1,318 @@
package worker
import (
"context"
"io"
"log/slog"
"sync"
"testing"
"time"
redisstate "galaxy/notification/internal/adapters/redisstate"
"galaxy/notification/internal/api/intentstream"
"galaxy/notification/internal/service/acceptintent"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestPushPublisherPublishesDuePushRouteAndLeavesEmailRoutePending verifies
// that the publisher publishes the due push route into the gateway stream
// (with the expected event fields and telemetry) while leaving the sibling
// email route untouched.
func TestPushPublisherPublishesDuePushRouteAndLeavesEmailRoutePending(t *testing.T) {
	t.Parallel()
	fixture := newPushPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validPushAcceptanceInput(fixture.now)))
	running := runPushPublisher(t, fixture.publisher)
	defer running.stop(t)
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished
	}, time.Second, 10*time.Millisecond)
	// The email route shares the schedule but belongs to a different worker.
	emailRoute, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "email:user:user-1")
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, acceptintent.RouteStatusPending, emailRoute.Status)
	// Exactly one event must have reached the gateway stream.
	messages, err := fixture.client.XRange(context.Background(), fixture.gatewayStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
	require.Equal(t, "user-1", messages[0].Values["user_id"])
	require.Equal(t, "game.turn.ready", messages[0].Values["event_type"])
	require.Equal(t, "1775121700000-0/push:user:user-1", messages[0].Values["event_id"])
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "published", ""))
}
// TestPushPublisherRetriesGatewayStreamPublicationFailures verifies the
// retry path: a gateway stream write failure (forced by occupying the stream
// key with a string) marks the route failed, and once the obstacle is removed
// the next attempt publishes exactly one event.
func TestPushPublisherRetriesGatewayStreamPublicationFailures(t *testing.T) {
	t.Parallel()
	fixture := newPushPublisherFixture(t)
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), validPushAcceptanceInput(fixture.now)))
	// Setting a plain string at the stream key makes XADD fail with a
	// wrong-type error.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.gatewayStream, "wrong-type", 0).Err())
	running := runPushPublisher(t, fixture.publisher)
	defer running.stop(t)
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusFailed && route.AttemptCount == 1
	}, time.Second, 10*time.Millisecond)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "retry", pushFailureClassificationGatewayStreamWrite))
	require.True(t, fixture.telemetry.hasRouteRetry("push"))
	// Clear the obstacle so the rescheduled attempt can succeed.
	require.NoError(t, fixture.client.Del(context.Background(), fixture.gatewayStream).Err())
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished && route.AttemptCount == 2
	}, 2*time.Second, 10*time.Millisecond)
	messages, err := fixture.client.XRange(context.Background(), fixture.gatewayStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, messages, 1)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "published", ""))
}
// TestPushPublisherDeadLettersExhaustedRoute verifies that a route already at
// its penultimate attempt is moved to dead_letter (with a persisted
// dead-letter record and telemetry) when its final attempt also fails.
func TestPushPublisherDeadLettersExhaustedRoute(t *testing.T) {
	t.Parallel()
	fixture := newPushPublisherFixture(t)
	input := validPushAcceptanceInput(fixture.now)
	// Seed the push route with 2 of 3 attempts already consumed.
	for index := range input.Routes {
		if input.Routes[index].RouteID == "push:user:user-1" {
			input.Routes[index].AttemptCount = 2
			input.Routes[index].MaxAttempts = 3
		}
	}
	require.NoError(t, fixture.store.CreateAcceptance(context.Background(), input))
	// Occupy the stream key with a string so every XADD fails.
	require.NoError(t, fixture.client.Set(context.Background(), fixture.gatewayStream, "wrong-type", 0).Err())
	running := runPushPublisher(t, fixture.publisher)
	defer running.stop(t)
	require.Eventually(t, func() bool {
		route, found, err := fixture.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusDeadLetter && route.AttemptCount == 3
	}, time.Second, 10*time.Millisecond)
	// The dead-letter record must carry the stream-write classification.
	deadLetterPayload, err := fixture.client.Get(context.Background(), redisstate.Keyspace{}.DeadLetter("1775121700000-0", "push:user:user-1")).Bytes()
	require.NoError(t, err)
	deadLetter, err := redisstate.UnmarshalDeadLetter(deadLetterPayload)
	require.NoError(t, err)
	require.Equal(t, pushFailureClassificationGatewayStreamWrite, deadLetter.FailureClassification)
	require.True(t, fixture.telemetry.hasRoutePublishAttempt("push", "dead_letter", pushFailureClassificationGatewayStreamWrite))
	require.True(t, fixture.telemetry.hasRouteDeadLetter("push", pushFailureClassificationGatewayStreamWrite))
}
// TestPushPublisherLeasePreventsDuplicatePublicationAcrossReplicas runs two
// publisher replicas against the same store and asserts that the route lease
// results in exactly one entry on the shared gateway stream.
func TestPushPublisherLeasePreventsDuplicatePublicationAcrossReplicas(t *testing.T) {
	t.Parallel()
	fx := newPushPublisherFixture(t)
	require.NoError(t, fx.store.CreateAcceptance(context.Background(), validPushAcceptanceInput(fx.now)))
	// The second replica shares the store and stream but gets its own clock.
	replica, err := NewPushPublisher(PushPublisherConfig{
		Store:               fx.store,
		GatewayStream:       fx.gatewayStream,
		GatewayStreamMaxLen: 1024,
		RouteLeaseTTL:       200 * time.Millisecond,
		RouteBackoffMin:     20 * time.Millisecond,
		RouteBackoffMax:     20 * time.Millisecond,
		PollInterval:        10 * time.Millisecond,
		BatchSize:           16,
		Clock:               newSteppingClock(fx.now, time.Millisecond),
	}, testWorkerLogger())
	require.NoError(t, err)
	first := runPushPublisher(t, fx.publisher)
	defer first.stop(t)
	second := runPushPublisher(t, replica)
	defer second.stop(t)
	require.Eventually(t, func() bool {
		route, found, err := fx.store.GetRoute(context.Background(), "1775121700000-0", "push:user:user-1")
		return err == nil && found && route.Status == acceptintent.RouteStatusPublished
	}, time.Second, 10*time.Millisecond)
	entries, err := fx.client.XRange(context.Background(), fx.gatewayStream, "-", "+").Result()
	require.NoError(t, err)
	require.Len(t, entries, 1)
}
// pushPublisherFixture bundles the collaborators shared by the push-publisher
// tests: a miniredis-backed client, the acceptance store, the publisher under
// test, and the recording telemetry double.
type pushPublisherFixture struct {
	// client talks directly to the miniredis server for assertions.
	client *redis.Client
	// store holds the durable notification/route state the publisher reads.
	store *redisstate.AcceptanceStore
	// publisher is the worker under test.
	publisher *PushPublisher
	// gatewayStream is the Redis stream key the publisher writes to.
	gatewayStream string
	// now is the fixed start instant shared with the stepping clock.
	now time.Time
	// clock is the deterministic clock injected into the publisher.
	clock *steppingClock
	// telemetry records the metrics emitted by the publisher.
	telemetry *recordingWorkerTelemetry
}
// newPushPublisherFixture provisions a miniredis server, an acceptance store,
// a deterministic stepping clock, and a PushPublisher wired to a recording
// telemetry double. Resources are released through t.Cleanup.
func newPushPublisherFixture(t *testing.T) pushPublisherFixture {
	t.Helper()
	// Single source of truth for the stream key used by publisher and tests.
	const gatewayStream = "gateway:client-events"
	server := miniredis.RunT(t)
	client := redis.NewClient(&redis.Options{
		Addr:            server.Addr(),
		Protocol:        2,
		DisableIdentity: true,
	})
	t.Cleanup(func() {
		assert.NoError(t, client.Close())
	})
	store, err := redisstate.NewAcceptanceStore(client, redisstate.AcceptanceConfig{
		RecordTTL:      24 * time.Hour,
		DeadLetterTTL:  72 * time.Hour,
		IdempotencyTTL: 7 * 24 * time.Hour,
	})
	require.NoError(t, err)
	start := time.UnixMilli(1775121700000).UTC()
	clock := newSteppingClock(start, time.Millisecond)
	telemetry := &recordingWorkerTelemetry{}
	publisher, err := NewPushPublisher(PushPublisherConfig{
		Store:               store,
		GatewayStream:       gatewayStream,
		GatewayStreamMaxLen: 1024,
		RouteLeaseTTL:       200 * time.Millisecond,
		RouteBackoffMin:     20 * time.Millisecond,
		RouteBackoffMax:     20 * time.Millisecond,
		PollInterval:        10 * time.Millisecond,
		BatchSize:           16,
		Telemetry:           telemetry,
		Clock:               clock,
	}, testWorkerLogger())
	require.NoError(t, err)
	return pushPublisherFixture{
		client:        client,
		store:         store,
		publisher:     publisher,
		gatewayStream: gatewayStream,
		now:           start,
		clock:         clock,
		telemetry:     telemetry,
	}
}
// validPushAcceptanceInput builds a complete, internally consistent acceptance
// for one game-turn notification addressed to user-1, fanned out to a push
// route and an email route. All timestamps derive from now so tests driven by
// the stepping clock stay deterministic.
func validPushAcceptanceInput(now time.Time) acceptintent.CreateAcceptanceInput {
	const (
		notificationID = "1775121700000-0"
		idempotencyKey = "game-123:turn-54"
		fingerprint    = "sha256:deadbeef"
	)
	notification := acceptintent.NotificationRecord{
		NotificationID:     notificationID,
		NotificationType:   intentstream.NotificationTypeGameTurnReady,
		Producer:           intentstream.ProducerGameMaster,
		AudienceKind:       intentstream.AudienceKindUser,
		RecipientUserIDs:   []string{"user-1"},
		PayloadJSON:        `{"game_id":"game-123","game_name":"Nebula Clash","turn_number":54}`,
		IdempotencyKey:     idempotencyKey,
		RequestFingerprint: fingerprint,
		RequestID:          "request-1",
		TraceID:            "trace-1",
		OccurredAt:         now,
		AcceptedAt:         now,
		UpdatedAt:          now,
	}
	pushRoute := acceptintent.NotificationRoute{
		NotificationID: notificationID,
		RouteID:        "push:user:user-1",
		Channel:        intentstream.ChannelPush,
		RecipientRef:   "user:user-1",
		Status:         acceptintent.RouteStatusPending,
		AttemptCount:   0,
		MaxAttempts:    3,
		NextAttemptAt:  now,
		ResolvedEmail:  "pilot@example.com",
		ResolvedLocale: "en",
		CreatedAt:      now,
		UpdatedAt:      now,
	}
	emailRoute := acceptintent.NotificationRoute{
		NotificationID: notificationID,
		RouteID:        "email:user:user-1",
		Channel:        intentstream.ChannelEmail,
		RecipientRef:   "user:user-1",
		Status:         acceptintent.RouteStatusPending,
		AttemptCount:   0,
		MaxAttempts:    7,
		NextAttemptAt:  now,
		ResolvedEmail:  "pilot@example.com",
		ResolvedLocale: "en",
		CreatedAt:      now,
		UpdatedAt:      now,
	}
	return acceptintent.CreateAcceptanceInput{
		Notification: notification,
		Routes:       []acceptintent.NotificationRoute{pushRoute, emailRoute},
		Idempotency: acceptintent.IdempotencyRecord{
			Producer:           intentstream.ProducerGameMaster,
			IdempotencyKey:     idempotencyKey,
			NotificationID:     notificationID,
			RequestFingerprint: fingerprint,
			CreatedAt:          now,
			ExpiresAt:          now.Add(7 * 24 * time.Hour),
		},
	}
}
// runningPushPublisher tracks one publisher goroutine started by
// runPushPublisher so tests can stop it and inspect its exit error.
type runningPushPublisher struct {
	// cancel cancels the context the publisher's Run loop observes.
	cancel context.CancelFunc
	// resultCh receives the single error returned by Run.
	resultCh chan error
}
// runPushPublisher starts publisher.Run on its own goroutine and returns a
// handle that can cancel the run and await its exit error.
func runPushPublisher(t *testing.T, publisher *PushPublisher) runningPushPublisher {
	t.Helper()
	ctx, cancel := context.WithCancel(context.Background())
	errCh := make(chan error, 1)
	go func() { errCh <- publisher.Run(ctx) }()
	return runningPushPublisher{cancel: cancel, resultCh: errCh}
}
// stop cancels the publisher and requires that Run exits with
// context.Canceled within one second.
func (r runningPushPublisher) stop(t *testing.T) {
	t.Helper()
	r.cancel()
	timeout := time.After(time.Second)
	select {
	case <-timeout:
		require.FailNow(t, "push publisher did not stop")
	case err := <-r.resultCh:
		require.ErrorIs(t, err, context.Canceled)
	}
}
type steppingClock struct {
mu sync.Mutex
current time.Time
step time.Duration
}
func newSteppingClock(start time.Time, step time.Duration) *steppingClock {
return &steppingClock{
current: start.UTC().Truncate(time.Millisecond),
step: step,
}
}
func (clock *steppingClock) Now() time.Time {
clock.mu.Lock()
defer clock.mu.Unlock()
now := clock.current
clock.current = clock.current.Add(clock.step).UTC().Truncate(time.Millisecond)
return now
}
// testWorkerLogger returns a logger that discards all output, keeping worker
// tests quiet.
func testWorkerLogger() *slog.Logger {
	handler := slog.NewTextHandler(io.Discard, nil)
	return slog.New(handler)
}
@@ -0,0 +1,184 @@
package worker
import (
"context"
"sync"
)
type recordingWorkerTelemetry struct {
mu sync.Mutex
intentOutcomes []intentOutcomeTelemetryRecord
malformedIntents []malformedIntentTelemetryRecord
userEnrichment []userEnrichmentTelemetryRecord
routePublishAttempts []routePublishTelemetryRecord
routeRetries []routeTelemetryRecord
routeDeadLetters []routeDeadLetterTelemetryRecord
}
func (telemetry *recordingWorkerTelemetry) RecordIntentOutcome(_ context.Context, notificationType string, producer string, audienceKind string, outcome string) {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
telemetry.intentOutcomes = append(telemetry.intentOutcomes, intentOutcomeTelemetryRecord{
notificationType: notificationType,
producer: producer,
audienceKind: audienceKind,
outcome: outcome,
})
}
func (telemetry *recordingWorkerTelemetry) RecordMalformedIntent(_ context.Context, failureCode string, notificationType string, producer string) {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
telemetry.malformedIntents = append(telemetry.malformedIntents, malformedIntentTelemetryRecord{
failureCode: failureCode,
notificationType: notificationType,
producer: producer,
})
}
func (telemetry *recordingWorkerTelemetry) RecordUserEnrichmentAttempt(_ context.Context, notificationType string, result string) {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
telemetry.userEnrichment = append(telemetry.userEnrichment, userEnrichmentTelemetryRecord{
notificationType: notificationType,
result: result,
})
}
func (telemetry *recordingWorkerTelemetry) RecordRoutePublishAttempt(_ context.Context, channel string, notificationType string, result string, failureClassification string) {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
telemetry.routePublishAttempts = append(telemetry.routePublishAttempts, routePublishTelemetryRecord{
channel: channel,
notificationType: notificationType,
result: result,
failureClassification: failureClassification,
})
}
func (telemetry *recordingWorkerTelemetry) RecordRouteRetry(_ context.Context, channel string, notificationType string) {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
telemetry.routeRetries = append(telemetry.routeRetries, routeTelemetryRecord{
channel: channel,
notificationType: notificationType,
})
}
func (telemetry *recordingWorkerTelemetry) RecordRouteDeadLetter(_ context.Context, channel string, notificationType string, failureClassification string) {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
telemetry.routeDeadLetters = append(telemetry.routeDeadLetters, routeDeadLetterTelemetryRecord{
channel: channel,
notificationType: notificationType,
failureClassification: failureClassification,
})
}
func (telemetry *recordingWorkerTelemetry) hasIntentOutcome(outcome string) bool {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
for _, record := range telemetry.intentOutcomes {
if record.outcome == outcome {
return true
}
}
return false
}
func (telemetry *recordingWorkerTelemetry) hasMalformedIntent(failureCode string) bool {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
for _, record := range telemetry.malformedIntents {
if record.failureCode == failureCode {
return true
}
}
return false
}
func (telemetry *recordingWorkerTelemetry) hasRoutePublishAttempt(channel string, result string, failureClassification string) bool {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
for _, record := range telemetry.routePublishAttempts {
if record.channel == channel && record.result == result && record.failureClassification == failureClassification {
return true
}
}
return false
}
func (telemetry *recordingWorkerTelemetry) hasRouteRetry(channel string) bool {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
for _, record := range telemetry.routeRetries {
if record.channel == channel {
return true
}
}
return false
}
func (telemetry *recordingWorkerTelemetry) hasRouteDeadLetter(channel string, failureClassification string) bool {
telemetry.mu.Lock()
defer telemetry.mu.Unlock()
for _, record := range telemetry.routeDeadLetters {
if record.channel == channel && record.failureClassification == failureClassification {
return true
}
}
return false
}
type intentOutcomeTelemetryRecord struct {
notificationType string
producer string
audienceKind string
outcome string
}
type malformedIntentTelemetryRecord struct {
failureCode string
notificationType string
producer string
}
type userEnrichmentTelemetryRecord struct {
notificationType string
result string
}
type routePublishTelemetryRecord struct {
channel string
notificationType string
result string
failureClassification string
}
type routeTelemetryRecord struct {
channel string
notificationType string
}
type routeDeadLetterTelemetryRecord struct {
channel string
notificationType string
failureClassification string
}