package redisstate

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"sort"
	"strconv"
	"time"

	"galaxy/notification/internal/service/acceptintent"
	"galaxy/notification/internal/telemetry"

	"github.com/redis/go-redis/v9"
)

// releaseRouteLeaseScript deletes the key named by KEYS[1] only when its
// current value equals ARGV[1]. It returns the DEL result on a match and 0
// otherwise, so a lease held under a different token is left untouched.
var releaseRouteLeaseScript = redis.NewScript(` if redis.call("GET", KEYS[1]) == ARGV[1] then return redis.call("DEL", KEYS[1]) end return 0 `)

// completePublishedRouteScript returns 0 without writing anything when the
// value stored at KEYS[1] differs from ARGV[1] or the value stored at KEYS[2]
// differs from ARGV[2]. Otherwise it:
//
//   - appends one entry to the stream named by ARGV[3], built from ARGV[6]
//     field/value pairs read from ARGV[7] onward, using approximate MAXLEN
//     trimming with ARGV[4] when that number is positive;
//   - overwrites KEYS[1] with ARGV[5] while keeping its TTL;
//   - removes the member KEYS[1] from the sorted set KEYS[3];
//   - deletes KEYS[2];
//
// and returns 1.
var completePublishedRouteScript = redis.NewScript(` if redis.call("GET", KEYS[1]) ~= ARGV[1] then return 0 end if redis.call("GET", KEYS[2]) ~= ARGV[2] then return 0 end local field_count = tonumber(ARGV[6]) local values = {} local index = 7 for _ = 1, field_count do table.insert(values, ARGV[index]) table.insert(values, ARGV[index + 1]) index = index + 2 end if tonumber(ARGV[4]) > 0 then redis.call("XADD", ARGV[3], "MAXLEN", "~", ARGV[4], "*", unpack(values)) else redis.call("XADD", ARGV[3], "*", unpack(values)) end redis.call("SET", KEYS[1], ARGV[5], "KEEPTTL") redis.call("ZREM", KEYS[3], KEYS[1]) redis.call("DEL", KEYS[2]) return 1 `)

// ScheduledRoute stores one due route reference loaded from
// `notification:route_schedule`.
type ScheduledRoute struct {
	// RouteKey stores the full Redis route key scheduled for processing.
	RouteKey string

	// NotificationID stores the owning notification identifier.
	NotificationID string

	// RouteID stores the scheduled route identifier.
	RouteID string
}

// CompleteRoutePublishedInput stores the data required to mark one route as
// published while atomically appending one outbound stream entry.
type CompleteRoutePublishedInput struct {
	// ExpectedRoute stores the current route state previously loaded by the
	// caller.
	ExpectedRoute acceptintent.NotificationRoute

	// LeaseToken stores the route-lease owner token that must still be held.
	LeaseToken string

	// PublishedAt stores when the publication attempt succeeded.
	PublishedAt time.Time

	// Stream stores the outbound Redis Stream name.
	Stream string

	// StreamMaxLen bounds Stream with approximate trimming when positive.
	// Zero disables trimming.
	StreamMaxLen int64

	// StreamValues stores the exact Redis Stream fields appended to Stream.
	StreamValues map[string]any
}

// CompleteRouteFailedInput stores the data required to record one retryable
// publication failure.
type CompleteRouteFailedInput struct {
	// ExpectedRoute stores the current route state previously loaded by the
	// caller.
	ExpectedRoute acceptintent.NotificationRoute

	// LeaseToken stores the route-lease owner token that must still be held.
	LeaseToken string

	// FailedAt stores when the publication attempt failed.
	FailedAt time.Time

	// NextAttemptAt stores the next scheduled retry time.
	NextAttemptAt time.Time

	// FailureClassification stores the classified publication failure kind.
	FailureClassification string

	// FailureMessage stores the detailed publication failure text.
	FailureMessage string
}

// CompleteRouteDeadLetterInput stores the data required to record one
// exhausted publication failure.
type CompleteRouteDeadLetterInput struct {
	// ExpectedRoute stores the current route state previously loaded by the
	// caller.
	ExpectedRoute acceptintent.NotificationRoute

	// LeaseToken stores the route-lease owner token that must still be held.
	LeaseToken string

	// DeadLetteredAt stores when the route exhausted its retry budget.
	DeadLetteredAt time.Time

	// FailureClassification stores the classified terminal failure kind.
	FailureClassification string

	// FailureMessage stores the detailed terminal failure text.
	FailureMessage string

	// RecoveryHint stores the optional operator-facing recovery guidance.
	RecoveryHint string
}

// ListDueRoutes loads up to limit scheduled routes whose next-attempt score is
// due at or before now. It returns an error when the store or ctx is nil, when
// now fails timestamp validation, when limit is not positive, when the Redis
// read fails, or when a scheduled member cannot be parsed as a route key.
func (store *AcceptanceStore) ListDueRoutes(ctx context.Context, now time.Time, limit int64) ([]ScheduledRoute, error) { if store == nil || store.client == nil { return nil, errors.New("list due routes: nil store") } if ctx == nil { return nil, errors.New("list due routes: nil context") } if err := validateRouteStateTimestamp("list due routes now", now); err != nil { return nil, err } if limit <= 0 { return nil, errors.New("list due routes: limit must be positive") } members, err := store.client.ZRangeByScore(ctx, store.keys.RouteSchedule(), &redis.ZRangeBy{ Min: "-inf", Max: strconv.FormatInt(now.UnixMilli(), 10), Count: limit, }).Result() if err != nil { return nil, fmt.Errorf("list due routes: %w", err) } routes := make([]ScheduledRoute, 0, len(members)) for _, member := range members { notificationID, routeID, err := store.keys.ParseRoute(member) if err != nil { return nil, fmt.Errorf("list due routes: %w", err) } routes = append(routes, ScheduledRoute{ RouteKey: member, NotificationID: notificationID, RouteID: routeID, }) } return routes, nil } // ReadRouteScheduleSnapshot returns the current depth of the durable route // schedule together with its oldest scheduled timestamp when one exists. 
func (store *AcceptanceStore) ReadRouteScheduleSnapshot(ctx context.Context) (telemetry.RouteScheduleSnapshot, error) { if store == nil || store.client == nil { return telemetry.RouteScheduleSnapshot{}, errors.New("read route schedule snapshot: nil store") } if ctx == nil { return telemetry.RouteScheduleSnapshot{}, errors.New("read route schedule snapshot: nil context") } depth, err := store.client.ZCard(ctx, store.keys.RouteSchedule()).Result() if err != nil { return telemetry.RouteScheduleSnapshot{}, fmt.Errorf("read route schedule snapshot: depth: %w", err) } snapshot := telemetry.RouteScheduleSnapshot{ Depth: depth, } if depth == 0 { return snapshot, nil } values, err := store.client.ZRangeWithScores(ctx, store.keys.RouteSchedule(), 0, 0).Result() if err != nil { return telemetry.RouteScheduleSnapshot{}, fmt.Errorf("read route schedule snapshot: oldest scheduled entry: %w", err) } if len(values) == 0 { return snapshot, nil } oldestScheduledFor := time.UnixMilli(int64(values[0].Score)).UTC() snapshot.OldestScheduledFor = &oldestScheduledFor return snapshot, nil } // TryAcquireRouteLease attempts to acquire one temporary route lease owned by // token for ttl. 
func (store *AcceptanceStore) TryAcquireRouteLease(ctx context.Context, notificationID string, routeID string, token string, ttl time.Duration) (bool, error) { if store == nil || store.client == nil { return false, errors.New("try acquire route lease: nil store") } if ctx == nil { return false, errors.New("try acquire route lease: nil context") } if notificationID == "" { return false, errors.New("try acquire route lease: notification id must not be empty") } if routeID == "" { return false, errors.New("try acquire route lease: route id must not be empty") } if token == "" { return false, errors.New("try acquire route lease: token must not be empty") } if ttl <= 0 { return false, errors.New("try acquire route lease: ttl must be positive") } acquired, err := store.client.SetNX(ctx, store.keys.RouteLease(notificationID, routeID), token, ttl).Result() if err != nil { return false, fmt.Errorf("try acquire route lease: %w", err) } return acquired, nil } // ReleaseRouteLease releases one temporary route lease only when token still // matches the stored owner value. func (store *AcceptanceStore) ReleaseRouteLease(ctx context.Context, notificationID string, routeID string, token string) error { if store == nil || store.client == nil { return errors.New("release route lease: nil store") } if ctx == nil { return errors.New("release route lease: nil context") } if notificationID == "" { return errors.New("release route lease: notification id must not be empty") } if routeID == "" { return errors.New("release route lease: route id must not be empty") } if token == "" { return errors.New("release route lease: token must not be empty") } if err := releaseRouteLeaseScript.Run( ctx, store.client, []string{store.keys.RouteLease(notificationID, routeID)}, token, ).Err(); err != nil { return fmt.Errorf("release route lease: %w", err) } return nil } // CompleteRoutePublished atomically appends one outbound stream entry and // marks the corresponding route as published. 
func (store *AcceptanceStore) CompleteRoutePublished(ctx context.Context, input CompleteRoutePublishedInput) error { if store == nil || store.client == nil { return errors.New("complete route published: nil store") } if ctx == nil { return errors.New("complete route published: nil context") } if err := input.Validate(); err != nil { return fmt.Errorf("complete route published: %w", err) } updatedRoute := input.ExpectedRoute updatedRoute.Status = acceptintent.RouteStatusPublished updatedRoute.AttemptCount++ updatedRoute.NextAttemptAt = time.Time{} updatedRoute.LastErrorClassification = "" updatedRoute.LastErrorMessage = "" updatedRoute.LastErrorAt = time.Time{} updatedRoute.UpdatedAt = input.PublishedAt updatedRoute.PublishedAt = input.PublishedAt updatedRoute.DeadLetteredAt = time.Time{} payload, err := MarshalRoute(updatedRoute) if err != nil { return fmt.Errorf("complete route published: %w", err) } expectedPayload, err := MarshalRoute(input.ExpectedRoute) if err != nil { return fmt.Errorf("complete route published: %w", err) } streamArgs, err := flattenStreamValues(input.StreamValues) if err != nil { return fmt.Errorf("complete route published: %w", err) } result, err := completePublishedRouteScript.Run( ctx, store.client, []string{ store.keys.Route(updatedRoute.NotificationID, updatedRoute.RouteID), store.keys.RouteLease(updatedRoute.NotificationID, updatedRoute.RouteID), store.keys.RouteSchedule(), }, append([]any{ string(expectedPayload), input.LeaseToken, input.Stream, input.StreamMaxLen, string(payload), len(streamArgs) / 2, }, streamArgs...)..., ).Int() switch { case errors.Is(err, redis.Nil): return ErrConflict case err != nil: return err case result != 1: return ErrConflict default: return nil } } // CompleteRouteFailed atomically records one retryable publication failure and // reschedules the route. 
func (store *AcceptanceStore) CompleteRouteFailed(ctx context.Context, input CompleteRouteFailedInput) error { if store == nil || store.client == nil { return errors.New("complete route failed: nil store") } if ctx == nil { return errors.New("complete route failed: nil context") } if err := input.Validate(); err != nil { return fmt.Errorf("complete route failed: %w", err) } updatedRoute := input.ExpectedRoute updatedRoute.Status = acceptintent.RouteStatusFailed updatedRoute.AttemptCount++ updatedRoute.NextAttemptAt = input.NextAttemptAt updatedRoute.LastErrorClassification = input.FailureClassification updatedRoute.LastErrorMessage = input.FailureMessage updatedRoute.LastErrorAt = input.FailedAt updatedRoute.UpdatedAt = input.FailedAt payload, err := MarshalRoute(updatedRoute) if err != nil { return fmt.Errorf("complete route failed: %w", err) } return store.completeRouteMutation(ctx, input.ExpectedRoute, input.LeaseToken, func(pipe redis.Pipeliner) error { pipe.SetArgs(ctx, store.keys.Route(updatedRoute.NotificationID, updatedRoute.RouteID), payload, redis.SetArgs{KeepTTL: true}) pipe.ZAdd(ctx, store.keys.RouteSchedule(), redis.Z{ Score: float64(input.NextAttemptAt.UnixMilli()), Member: store.keys.Route(updatedRoute.NotificationID, updatedRoute.RouteID), }) pipe.Del(ctx, store.keys.RouteLease(updatedRoute.NotificationID, updatedRoute.RouteID)) return nil }) } // CompleteRouteDeadLetter atomically records one exhausted publication // failure, stores the dead-letter entry, and removes the route from the // retry schedule. 
func (store *AcceptanceStore) CompleteRouteDeadLetter(ctx context.Context, input CompleteRouteDeadLetterInput) error { if store == nil || store.client == nil { return errors.New("complete route dead letter: nil store") } if ctx == nil { return errors.New("complete route dead letter: nil context") } if err := input.Validate(); err != nil { return fmt.Errorf("complete route dead letter: %w", err) } updatedRoute := input.ExpectedRoute updatedRoute.Status = acceptintent.RouteStatusDeadLetter updatedRoute.AttemptCount++ updatedRoute.NextAttemptAt = time.Time{} updatedRoute.LastErrorClassification = input.FailureClassification updatedRoute.LastErrorMessage = input.FailureMessage updatedRoute.LastErrorAt = input.DeadLetteredAt updatedRoute.UpdatedAt = input.DeadLetteredAt updatedRoute.DeadLetteredAt = input.DeadLetteredAt if updatedRoute.AttemptCount < updatedRoute.MaxAttempts { return fmt.Errorf( "complete route dead letter: final attempt count %d is below max attempts %d", updatedRoute.AttemptCount, updatedRoute.MaxAttempts, ) } routePayload, err := MarshalRoute(updatedRoute) if err != nil { return fmt.Errorf("complete route dead letter: %w", err) } deadLetterPayload, err := MarshalDeadLetter(DeadLetterEntry{ NotificationID: updatedRoute.NotificationID, RouteID: updatedRoute.RouteID, Channel: updatedRoute.Channel, RecipientRef: updatedRoute.RecipientRef, FinalAttemptCount: updatedRoute.AttemptCount, MaxAttempts: updatedRoute.MaxAttempts, FailureClassification: input.FailureClassification, FailureMessage: input.FailureMessage, CreatedAt: input.DeadLetteredAt, RecoveryHint: input.RecoveryHint, }) if err != nil { return fmt.Errorf("complete route dead letter: %w", err) } return store.completeRouteMutation(ctx, input.ExpectedRoute, input.LeaseToken, func(pipe redis.Pipeliner) error { pipe.SetArgs(ctx, store.keys.Route(updatedRoute.NotificationID, updatedRoute.RouteID), routePayload, redis.SetArgs{KeepTTL: true}) pipe.Set(ctx, 
store.keys.DeadLetter(updatedRoute.NotificationID, updatedRoute.RouteID), deadLetterPayload, store.cfg.DeadLetterTTL) pipe.ZRem(ctx, store.keys.RouteSchedule(), store.keys.Route(updatedRoute.NotificationID, updatedRoute.RouteID)) pipe.Del(ctx, store.keys.RouteLease(updatedRoute.NotificationID, updatedRoute.RouteID)) return nil }) } func (store *AcceptanceStore) completeRouteMutation( ctx context.Context, expectedRoute acceptintent.NotificationRoute, leaseToken string, mutate func(redis.Pipeliner) error, ) error { routeKey := store.keys.Route(expectedRoute.NotificationID, expectedRoute.RouteID) leaseKey := store.keys.RouteLease(expectedRoute.NotificationID, expectedRoute.RouteID) watchErr := store.client.Watch(ctx, func(tx *redis.Tx) error { currentRoute, err := loadWatchedRoute(ctx, tx, routeKey) switch { case errors.Is(err, redis.Nil): return ErrConflict case err != nil: return err } if err := ensureRoutesEqual(expectedRoute, currentRoute); err != nil { return err } leaseValue, err := tx.Get(ctx, leaseKey).Result() switch { case errors.Is(err, redis.Nil): return ErrConflict case err != nil: return err case leaseValue != leaseToken: return ErrConflict } _, err = tx.TxPipelined(ctx, func(pipe redis.Pipeliner) error { return mutate(pipe) }) return err }, routeKey, leaseKey) switch { case errors.Is(watchErr, ErrConflict), errors.Is(watchErr, redis.TxFailedErr): return ErrConflict case watchErr != nil: return watchErr default: return nil } } func loadWatchedRoute(ctx context.Context, tx *redis.Tx, routeKey string) (acceptintent.NotificationRoute, error) { payload, err := tx.Get(ctx, routeKey).Bytes() if err != nil { return acceptintent.NotificationRoute{}, err } return UnmarshalRoute(payload) } func ensureRoutesEqual(expected acceptintent.NotificationRoute, actual acceptintent.NotificationRoute) error { expectedPayload, err := MarshalRoute(expected) if err != nil { return fmt.Errorf("marshal expected route: %w", err) } actualPayload, err := MarshalRoute(actual) if err 
!= nil { return fmt.Errorf("marshal current route: %w", err) } if !bytes.Equal(expectedPayload, actualPayload) { return ErrConflict } return nil } func validateCompletionRoute(route acceptintent.NotificationRoute) error { if err := route.Validate(); err != nil { return err } switch route.Status { case acceptintent.RouteStatusPending, acceptintent.RouteStatusFailed: return nil default: return fmt.Errorf("route status %q is not completable", route.Status) } } func validateStreamValues(values map[string]any) error { if len(values) == 0 { return fmt.Errorf("stream values must not be empty") } for key, raw := range values { if key == "" { return fmt.Errorf("stream values key must not be empty") } switch typed := raw.(type) { case string: if typed == "" { return fmt.Errorf("stream values %q must not be empty", key) } case []byte: if len(typed) == 0 { return fmt.Errorf("stream values %q must not be empty", key) } default: return fmt.Errorf("stream values %q must be string or []byte", key) } } return nil } func flattenStreamValues(values map[string]any) ([]any, error) { keys := make([]string, 0, len(values)) for key := range values { keys = append(keys, key) } sort.Strings(keys) args := make([]any, 0, len(values)*2) for _, key := range keys { args = append(args, key, values[key]) } return args, nil } func validateRouteStateTimestamp(name string, value time.Time) error { if value.IsZero() { return fmt.Errorf("%s must not be zero", name) } if !value.Equal(value.UTC()) { return fmt.Errorf("%s must be UTC", name) } if !value.Equal(value.Truncate(time.Millisecond)) { return fmt.Errorf("%s must use millisecond precision", name) } return nil } // Validate reports whether route contains a complete due-route reference. 
func (route ScheduledRoute) Validate() error { if route.RouteKey == "" { return fmt.Errorf("scheduled route key must not be empty") } if route.NotificationID == "" { return fmt.Errorf("scheduled route notification id must not be empty") } if route.RouteID == "" { return fmt.Errorf("scheduled route route id must not be empty") } return nil } // Validate reports whether input contains a complete published-route // transition. func (input CompleteRoutePublishedInput) Validate() error { if err := validateCompletionRoute(input.ExpectedRoute); err != nil { return err } if input.LeaseToken == "" { return fmt.Errorf("lease token must not be empty") } if err := validateRouteStateTimestamp("published at", input.PublishedAt); err != nil { return err } if input.Stream == "" { return fmt.Errorf("stream must not be empty") } if input.StreamMaxLen < 0 { return fmt.Errorf("stream max len must not be negative") } if err := validateStreamValues(input.StreamValues); err != nil { return err } return nil } // Validate reports whether input contains a complete retryable failure // transition. func (input CompleteRouteFailedInput) Validate() error { if err := validateCompletionRoute(input.ExpectedRoute); err != nil { return err } if input.LeaseToken == "" { return fmt.Errorf("lease token must not be empty") } if err := validateRouteStateTimestamp("failed at", input.FailedAt); err != nil { return err } if err := validateRouteStateTimestamp("next attempt at", input.NextAttemptAt); err != nil { return err } if input.FailureClassification == "" { return fmt.Errorf("failure classification must not be empty") } if input.FailureMessage == "" { return fmt.Errorf("failure message must not be empty") } return nil } // Validate reports whether input contains a complete dead-letter transition. 
func (input CompleteRouteDeadLetterInput) Validate() error { if err := validateCompletionRoute(input.ExpectedRoute); err != nil { return err } if input.LeaseToken == "" { return fmt.Errorf("lease token must not be empty") } if err := validateRouteStateTimestamp("dead lettered at", input.DeadLetteredAt); err != nil { return err } if input.FailureClassification == "" { return fmt.Errorf("failure classification must not be empty") } if input.FailureMessage == "" { return fmt.Errorf("failure message must not be empty") } return nil }