// Source file: galaxy-game/notification/internal/adapters/redisstate/route_state_store.go
// Snapshot: 2026-04-22 08:49:45 +02:00 (658 lines, 20 KiB, Go)

package redisstate
import (
"bytes"
"context"
"errors"
"fmt"
"sort"
"strconv"
"time"
"galaxy/notification/internal/service/acceptintent"
"galaxy/notification/internal/telemetry"
"github.com/redis/go-redis/v9"
)
// releaseRouteLeaseScript deletes a lease key only while it still holds the
// caller's token.
//
// KEYS[1] is the lease key and ARGV[1] is the owner token. When the stored
// value equals the token the script returns the result of DEL on that key;
// otherwise it leaves the key untouched and returns 0.
var releaseRouteLeaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
end
return 0
`)
// completePublishedRouteScript performs the published-route transition in one
// script invocation.
//
// Arguments, as supplied by CompleteRoutePublished:
//
//	KEYS[1]  route key
//	KEYS[2]  route lease key
//	KEYS[3]  route schedule sorted set
//	ARGV[1]  expected current route payload
//	ARGV[2]  lease token
//	ARGV[3]  outbound stream name
//	ARGV[4]  stream max length; values above zero enable "MAXLEN ~" trimming
//	ARGV[5]  updated route payload
//	ARGV[6]  number of stream field/value pairs
//	ARGV[7+] the field/value pairs, flattened
//
// The script returns 0 without writing anything when the stored route payload
// differs from ARGV[1] or the stored lease value differs from ARGV[2].
// Otherwise it appends the stream entry, overwrites the route payload while
// keeping its TTL, removes the route key from the schedule, deletes the lease,
// and returns 1.
//
// NOTE(review): the stream is addressed through ARGV[3] rather than KEYS;
// confirm this is acceptable for the deployment topology, since Redis expects
// every key a script touches to be declared in KEYS.
var completePublishedRouteScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) ~= ARGV[1] then
return 0
end
if redis.call("GET", KEYS[2]) ~= ARGV[2] then
return 0
end
local field_count = tonumber(ARGV[6])
local values = {}
local index = 7
for _ = 1, field_count do
table.insert(values, ARGV[index])
table.insert(values, ARGV[index + 1])
index = index + 2
end
if tonumber(ARGV[4]) > 0 then
redis.call("XADD", ARGV[3], "MAXLEN", "~", ARGV[4], "*", unpack(values))
else
redis.call("XADD", ARGV[3], "*", unpack(values))
end
redis.call("SET", KEYS[1], ARGV[5], "KEEPTTL")
redis.call("ZREM", KEYS[3], KEYS[1])
redis.call("DEL", KEYS[2])
return 1
`)
// ScheduledRoute stores one due route reference loaded from
// `notification:route_schedule`.
//
// ListDueRoutes produces these values: RouteKey is the raw schedule member and
// the two identifiers are obtained by parsing that member.
type ScheduledRoute struct {
// RouteKey stores the full Redis route key scheduled for processing.
// Validate rejects an empty value.
RouteKey string
// NotificationID stores the owning notification identifier. Validate rejects
// an empty value.
NotificationID string
// RouteID stores the scheduled route identifier. Validate rejects an empty
// value.
RouteID string
}
// CompleteRoutePublishedInput stores the data required to mark one route as
// published while atomically appending one outbound stream entry.
//
// Validate must succeed before CompleteRoutePublished touches Redis.
type CompleteRoutePublishedInput struct {
// ExpectedRoute stores the current route state previously loaded by the
// caller. Its status must be pending or failed, and its marshaled form must
// still match the stored payload for the transition to apply.
ExpectedRoute acceptintent.NotificationRoute
// LeaseToken stores the route-lease owner token that must still be held.
// Validate rejects an empty value.
LeaseToken string
// PublishedAt stores when the publication attempt succeeded. Validate rejects
// a zero value or sub-millisecond precision. It becomes both UpdatedAt and
// PublishedAt on the stored route.
PublishedAt time.Time
// Stream stores the outbound Redis Stream name. Validate rejects an empty
// value.
Stream string
// StreamMaxLen bounds Stream with approximate trimming when positive. Zero
// disables trimming. Validate rejects negative values.
StreamMaxLen int64
// StreamValues stores the exact Redis Stream fields appended to Stream. The
// map must be non-empty, keys must be non-empty, and every value must be a
// non-empty string or []byte. Fields are appended in sorted key order.
StreamValues map[string]any
}
// CompleteRouteFailedInput stores the data required to record one retryable
// publication failure.
//
// Validate must succeed before CompleteRouteFailed touches Redis.
type CompleteRouteFailedInput struct {
// ExpectedRoute stores the current route state previously loaded by the
// caller. Its status must be pending or failed, and it must still match the
// stored route for the transition to apply.
ExpectedRoute acceptintent.NotificationRoute
// LeaseToken stores the route-lease owner token that must still be held.
// Validate rejects an empty value.
LeaseToken string
// FailedAt stores when the publication attempt failed. Validate rejects a
// zero value or sub-millisecond precision. It becomes LastErrorAt and
// UpdatedAt on the stored route.
FailedAt time.Time
// NextAttemptAt stores the next scheduled retry time. Validate rejects a zero
// value or sub-millisecond precision. Its Unix-millisecond value becomes the
// route's score in the schedule.
NextAttemptAt time.Time
// FailureClassification stores the classified publication failure kind.
// Validate rejects an empty value.
FailureClassification string
// FailureMessage stores the detailed publication failure text. Validate
// rejects an empty value.
FailureMessage string
}
// CompleteRouteDeadLetterInput stores the data required to record one
// exhausted publication failure.
//
// Validate must succeed before CompleteRouteDeadLetter touches Redis.
type CompleteRouteDeadLetterInput struct {
// ExpectedRoute stores the current route state previously loaded by the
// caller. Its status must be pending or failed, and its attempt count plus
// one must reach MaxAttempts for the dead-letter transition to be accepted.
ExpectedRoute acceptintent.NotificationRoute
// LeaseToken stores the route-lease owner token that must still be held.
// Validate rejects an empty value.
LeaseToken string
// DeadLetteredAt stores when the route exhausted its retry budget. Validate
// rejects a zero value or sub-millisecond precision. It is also used as the
// creation time of the dead-letter entry.
DeadLetteredAt time.Time
// FailureClassification stores the classified terminal failure kind.
// Validate rejects an empty value.
FailureClassification string
// FailureMessage stores the detailed terminal failure text. Validate rejects
// an empty value.
FailureMessage string
// RecoveryHint stores the optional operator-facing recovery guidance. It is
// not checked by Validate and is copied into the dead-letter entry as is.
RecoveryHint string
}
// ListDueRoutes returns at most limit entries of the route schedule whose
// score is not greater than now expressed in Unix milliseconds.
//
// Every returned member is parsed back into its notification and route
// identifiers; a member that cannot be parsed fails the whole call. now must
// pass validateRouteStateTimestamp and limit must be positive.
func (store *AcceptanceStore) ListDueRoutes(ctx context.Context, now time.Time, limit int64) ([]ScheduledRoute, error) {
	switch {
	case store == nil || store.client == nil:
		return nil, errors.New("list due routes: nil store")
	case ctx == nil:
		return nil, errors.New("list due routes: nil context")
	}
	if err := validateRouteStateTimestamp("list due routes now", now); err != nil {
		return nil, err
	}
	if limit <= 0 {
		return nil, errors.New("list due routes: limit must be positive")
	}

	// Scores are Unix milliseconds, so everything up to now is due.
	dueRange := &redis.ZRangeBy{
		Min:   "-inf",
		Max:   strconv.FormatInt(now.UnixMilli(), 10),
		Count: limit,
	}
	routeKeys, err := store.client.ZRangeByScore(ctx, store.keys.RouteSchedule(), dueRange).Result()
	if err != nil {
		return nil, fmt.Errorf("list due routes: %w", err)
	}

	due := make([]ScheduledRoute, len(routeKeys))
	for i, routeKey := range routeKeys {
		notificationID, routeID, parseErr := store.keys.ParseRoute(routeKey)
		if parseErr != nil {
			return nil, fmt.Errorf("list due routes: %w", parseErr)
		}
		due[i] = ScheduledRoute{
			RouteKey:       routeKey,
			NotificationID: notificationID,
			RouteID:        routeID,
		}
	}
	return due, nil
}
// ReadRouteScheduleSnapshot reports how many routes are currently scheduled
// and, when the schedule is not empty, the time encoded in the lowest score.
//
// The depth and the head entry are read with two separate commands, so the
// head may already be gone when it is fetched; in that case only the depth is
// reported.
func (store *AcceptanceStore) ReadRouteScheduleSnapshot(ctx context.Context) (telemetry.RouteScheduleSnapshot, error) {
	var none telemetry.RouteScheduleSnapshot
	switch {
	case store == nil || store.client == nil:
		return none, errors.New("read route schedule snapshot: nil store")
	case ctx == nil:
		return none, errors.New("read route schedule snapshot: nil context")
	}

	scheduleKey := store.keys.RouteSchedule()
	depth, err := store.client.ZCard(ctx, scheduleKey).Result()
	if err != nil {
		return none, fmt.Errorf("read route schedule snapshot: depth: %w", err)
	}

	snapshot := telemetry.RouteScheduleSnapshot{Depth: depth}
	if depth == 0 {
		return snapshot, nil
	}

	head, err := store.client.ZRangeWithScores(ctx, scheduleKey, 0, 0).Result()
	if err != nil {
		return none, fmt.Errorf("read route schedule snapshot: oldest scheduled entry: %w", err)
	}
	if len(head) > 0 {
		// Scores hold Unix milliseconds.
		oldest := time.UnixMilli(int64(head[0].Score)).UTC()
		snapshot.OldestScheduledFor = &oldest
	}
	return snapshot, nil
}
// TryAcquireRouteLease tries to create the lease key for one route with token
// as its value and ttl as its expiry, using SET NX semantics.
//
// It reports true when the key was created and false when it already existed.
// All identifiers and the token must be non-empty and ttl must be positive.
func (store *AcceptanceStore) TryAcquireRouteLease(ctx context.Context, notificationID string, routeID string, token string, ttl time.Duration) (bool, error) {
	switch {
	case store == nil || store.client == nil:
		return false, errors.New("try acquire route lease: nil store")
	case ctx == nil:
		return false, errors.New("try acquire route lease: nil context")
	case notificationID == "":
		return false, errors.New("try acquire route lease: notification id must not be empty")
	case routeID == "":
		return false, errors.New("try acquire route lease: route id must not be empty")
	case token == "":
		return false, errors.New("try acquire route lease: token must not be empty")
	case ttl <= 0:
		return false, errors.New("try acquire route lease: ttl must be positive")
	}

	leaseKey := store.keys.RouteLease(notificationID, routeID)
	held, err := store.client.SetNX(ctx, leaseKey, token, ttl).Result()
	if err != nil {
		return false, fmt.Errorf("try acquire route lease: %w", err)
	}
	return held, nil
}
// ReleaseRouteLease runs releaseRouteLeaseScript against the lease key of one
// route, which deletes the key only when it still holds token.
//
// A lease owned by somebody else, or one that no longer exists, is not an
// error: only script execution failures are reported.
func (store *AcceptanceStore) ReleaseRouteLease(ctx context.Context, notificationID string, routeID string, token string) error {
	switch {
	case store == nil || store.client == nil:
		return errors.New("release route lease: nil store")
	case ctx == nil:
		return errors.New("release route lease: nil context")
	case notificationID == "":
		return errors.New("release route lease: notification id must not be empty")
	case routeID == "":
		return errors.New("release route lease: route id must not be empty")
	case token == "":
		return errors.New("release route lease: token must not be empty")
	}

	leaseKeys := []string{store.keys.RouteLease(notificationID, routeID)}
	err := releaseRouteLeaseScript.Run(ctx, store.client, leaseKeys, token).Err()
	if err != nil {
		return fmt.Errorf("release route lease: %w", err)
	}
	return nil
}
// CompleteRoutePublished atomically appends one outbound stream entry and
// marks the corresponding route as published.
//
// The transition runs inside completePublishedRouteScript, which applies its
// writes only while the stored route payload still equals the marshaled
// input.ExpectedRoute and the lease key still holds input.LeaseToken.
//
// It returns ErrConflict when the script reports that a guard failed (result
// other than 1) or when the reply is redis.Nil. Validation, marshaling, and
// script execution errors are returned wrapped with the
// "complete route published" prefix, so errors.Is and errors.As still see the
// underlying cause.
func (store *AcceptanceStore) CompleteRoutePublished(ctx context.Context, input CompleteRoutePublishedInput) error {
	if store == nil || store.client == nil {
		return errors.New("complete route published: nil store")
	}
	if ctx == nil {
		return errors.New("complete route published: nil context")
	}
	if err := input.Validate(); err != nil {
		return fmt.Errorf("complete route published: %w", err)
	}

	// Build the post-publication route: one more attempt, no pending retry,
	// and every failure marker cleared.
	updatedRoute := input.ExpectedRoute
	updatedRoute.Status = acceptintent.RouteStatusPublished
	updatedRoute.AttemptCount++
	updatedRoute.NextAttemptAt = time.Time{}
	updatedRoute.LastErrorClassification = ""
	updatedRoute.LastErrorMessage = ""
	updatedRoute.LastErrorAt = time.Time{}
	updatedRoute.UpdatedAt = input.PublishedAt
	updatedRoute.PublishedAt = input.PublishedAt
	updatedRoute.DeadLetteredAt = time.Time{}

	payload, err := MarshalRoute(updatedRoute)
	if err != nil {
		return fmt.Errorf("complete route published: %w", err)
	}
	// The script compares this payload with the stored one byte for byte.
	expectedPayload, err := MarshalRoute(input.ExpectedRoute)
	if err != nil {
		return fmt.Errorf("complete route published: %w", err)
	}
	streamArgs, err := flattenStreamValues(input.StreamValues)
	if err != nil {
		return fmt.Errorf("complete route published: %w", err)
	}

	result, err := completePublishedRouteScript.Run(
		ctx,
		store.client,
		[]string{
			store.keys.Route(updatedRoute.NotificationID, updatedRoute.RouteID),
			store.keys.RouteLease(updatedRoute.NotificationID, updatedRoute.RouteID),
			store.keys.RouteSchedule(),
		},
		append([]any{
			string(expectedPayload),
			input.LeaseToken,
			input.Stream,
			input.StreamMaxLen,
			string(payload),
			len(streamArgs) / 2,
		}, streamArgs...)...,
	).Int()
	switch {
	case errors.Is(err, redis.Nil):
		return ErrConflict
	case err != nil:
		// Add the operation prefix, matching every other error path above.
		return fmt.Errorf("complete route published: %w", err)
	case result != 1:
		return ErrConflict
	default:
		return nil
	}
}
// CompleteRouteFailed records one retryable publication failure: it stores the
// failed route state, moves the route to input.NextAttemptAt in the schedule,
// and drops the lease, all inside the guarded transaction run by
// completeRouteMutation.
//
// The result of completeRouteMutation is returned as is, including
// ErrConflict.
func (store *AcceptanceStore) CompleteRouteFailed(ctx context.Context, input CompleteRouteFailedInput) error {
	switch {
	case store == nil || store.client == nil:
		return errors.New("complete route failed: nil store")
	case ctx == nil:
		return errors.New("complete route failed: nil context")
	}
	if err := input.Validate(); err != nil {
		return fmt.Errorf("complete route failed: %w", err)
	}

	failed := input.ExpectedRoute
	failed.AttemptCount++
	failed.Status = acceptintent.RouteStatusFailed
	failed.NextAttemptAt = input.NextAttemptAt
	failed.LastErrorClassification = input.FailureClassification
	failed.LastErrorMessage = input.FailureMessage
	failed.LastErrorAt = input.FailedAt
	failed.UpdatedAt = input.FailedAt

	payload, err := MarshalRoute(failed)
	if err != nil {
		return fmt.Errorf("complete route failed: %w", err)
	}

	routeKey := store.keys.Route(failed.NotificationID, failed.RouteID)
	leaseKey := store.keys.RouteLease(failed.NotificationID, failed.RouteID)
	scheduleKey := store.keys.RouteSchedule()

	reschedule := func(pipe redis.Pipeliner) error {
		// Overwrite the route while keeping whatever TTL it already has.
		pipe.SetArgs(ctx, routeKey, payload, redis.SetArgs{KeepTTL: true})
		pipe.ZAdd(ctx, scheduleKey, redis.Z{
			Score:  float64(input.NextAttemptAt.UnixMilli()),
			Member: routeKey,
		})
		pipe.Del(ctx, leaseKey)
		return nil
	}
	return store.completeRouteMutation(ctx, input.ExpectedRoute, input.LeaseToken, reschedule)
}
// CompleteRouteDeadLetter records one exhausted publication failure: it stores
// the dead-lettered route state, writes the dead-letter entry with the
// configured TTL, removes the route from the retry schedule, and drops the
// lease, all inside the guarded transaction run by completeRouteMutation.
//
// The call is rejected when the attempt count after this attempt is still
// below the route's MaxAttempts. The result of completeRouteMutation is
// returned as is, including ErrConflict.
func (store *AcceptanceStore) CompleteRouteDeadLetter(ctx context.Context, input CompleteRouteDeadLetterInput) error {
	switch {
	case store == nil || store.client == nil:
		return errors.New("complete route dead letter: nil store")
	case ctx == nil:
		return errors.New("complete route dead letter: nil context")
	}
	if err := input.Validate(); err != nil {
		return fmt.Errorf("complete route dead letter: %w", err)
	}

	terminal := input.ExpectedRoute
	terminal.AttemptCount++
	// Dead-lettering is only legal once the retry budget is used up.
	if terminal.AttemptCount < terminal.MaxAttempts {
		return fmt.Errorf(
			"complete route dead letter: final attempt count %d is below max attempts %d",
			terminal.AttemptCount,
			terminal.MaxAttempts,
		)
	}
	terminal.Status = acceptintent.RouteStatusDeadLetter
	terminal.NextAttemptAt = time.Time{}
	terminal.LastErrorClassification = input.FailureClassification
	terminal.LastErrorMessage = input.FailureMessage
	terminal.LastErrorAt = input.DeadLetteredAt
	terminal.UpdatedAt = input.DeadLetteredAt
	terminal.DeadLetteredAt = input.DeadLetteredAt

	routePayload, err := MarshalRoute(terminal)
	if err != nil {
		return fmt.Errorf("complete route dead letter: %w", err)
	}

	entry := DeadLetterEntry{
		NotificationID:        terminal.NotificationID,
		RouteID:               terminal.RouteID,
		Channel:               terminal.Channel,
		RecipientRef:          terminal.RecipientRef,
		FinalAttemptCount:     terminal.AttemptCount,
		MaxAttempts:           terminal.MaxAttempts,
		FailureClassification: input.FailureClassification,
		FailureMessage:        input.FailureMessage,
		CreatedAt:             input.DeadLetteredAt,
		RecoveryHint:          input.RecoveryHint,
	}
	deadLetterPayload, err := MarshalDeadLetter(entry)
	if err != nil {
		return fmt.Errorf("complete route dead letter: %w", err)
	}

	routeKey := store.keys.Route(terminal.NotificationID, terminal.RouteID)
	leaseKey := store.keys.RouteLease(terminal.NotificationID, terminal.RouteID)
	deadLetterKey := store.keys.DeadLetter(terminal.NotificationID, terminal.RouteID)
	scheduleKey := store.keys.RouteSchedule()

	bury := func(pipe redis.Pipeliner) error {
		// Overwrite the route while keeping whatever TTL it already has.
		pipe.SetArgs(ctx, routeKey, routePayload, redis.SetArgs{KeepTTL: true})
		pipe.Set(ctx, deadLetterKey, deadLetterPayload, store.cfg.DeadLetterTTL)
		pipe.ZRem(ctx, scheduleKey, routeKey)
		pipe.Del(ctx, leaseKey)
		return nil
	}
	return store.completeRouteMutation(ctx, input.ExpectedRoute, input.LeaseToken, bury)
}
// completeRouteMutation runs mutate inside a MULTI/EXEC pipeline guarded by a
// WATCH on the route key and the route lease key.
//
// Before queuing the writes it checks that the stored route still marshals to
// the same bytes as expectedRoute and that the lease key still holds
// leaseToken. It returns ErrConflict when the route or lease key is missing,
// when either comparison fails, or when go-redis reports redis.TxFailedErr.
// Every other error is returned unchanged.
func (store *AcceptanceStore) completeRouteMutation(
	ctx context.Context,
	expectedRoute acceptintent.NotificationRoute,
	leaseToken string,
	mutate func(redis.Pipeliner) error,
) error {
	routeKey := store.keys.Route(expectedRoute.NotificationID, expectedRoute.RouteID)
	leaseKey := store.keys.RouteLease(expectedRoute.NotificationID, expectedRoute.RouteID)

	guarded := func(tx *redis.Tx) error {
		stored, err := loadWatchedRoute(ctx, tx, routeKey)
		if errors.Is(err, redis.Nil) {
			return ErrConflict
		}
		if err != nil {
			return err
		}
		if err := ensureRoutesEqual(expectedRoute, stored); err != nil {
			return err
		}

		owner, err := tx.Get(ctx, leaseKey).Result()
		if errors.Is(err, redis.Nil) {
			return ErrConflict
		}
		if err != nil {
			return err
		}
		if owner != leaseToken {
			return ErrConflict
		}

		_, err = tx.TxPipelined(ctx, mutate)
		return err
	}

	err := store.client.Watch(ctx, guarded, routeKey, leaseKey)
	if err == nil {
		return nil
	}
	if errors.Is(err, ErrConflict) || errors.Is(err, redis.TxFailedErr) {
		return ErrConflict
	}
	return err
}
// loadWatchedRoute reads routeKey through tx and decodes the payload with
// UnmarshalRoute. A read error, including redis.Nil for a missing key, is
// returned unchanged together with a zero route.
func loadWatchedRoute(ctx context.Context, tx *redis.Tx, routeKey string) (acceptintent.NotificationRoute, error) {
	raw, err := tx.Get(ctx, routeKey).Bytes()
	if err == nil {
		return UnmarshalRoute(raw)
	}
	return acceptintent.NotificationRoute{}, err
}
// ensureRoutesEqual compares two routes by their marshaled form and returns
// ErrConflict when the bytes differ. Marshaling failures are wrapped and
// returned.
func ensureRoutesEqual(expected acceptintent.NotificationRoute, actual acceptintent.NotificationRoute) error {
	want, err := MarshalRoute(expected)
	if err != nil {
		return fmt.Errorf("marshal expected route: %w", err)
	}
	got, err := MarshalRoute(actual)
	if err != nil {
		return fmt.Errorf("marshal current route: %w", err)
	}
	if bytes.Equal(want, got) {
		return nil
	}
	return ErrConflict
}
// validateCompletionRoute accepts a route that passes its own Validate and is
// currently pending or failed; any other status is reported as not
// completable.
func validateCompletionRoute(route acceptintent.NotificationRoute) error {
	if err := route.Validate(); err != nil {
		return err
	}
	if route.Status == acceptintent.RouteStatusPending || route.Status == acceptintent.RouteStatusFailed {
		return nil
	}
	return fmt.Errorf("route status %q is not completable", route.Status)
}
// validateStreamValues checks that values can be appended to a stream: the map
// must not be empty, no key may be empty, and every value must be a non-empty
// string or []byte.
func validateStreamValues(values map[string]any) error {
	if len(values) == 0 {
		return fmt.Errorf("stream values must not be empty")
	}
	for field, value := range values {
		if field == "" {
			return fmt.Errorf("stream values key must not be empty")
		}

		var size int
		switch v := value.(type) {
		case string:
			size = len(v)
		case []byte:
			size = len(v)
		default:
			return fmt.Errorf("stream values %q must be string or []byte", field)
		}
		if size == 0 {
			return fmt.Errorf("stream values %q must not be empty", field)
		}
	}
	return nil
}
// flattenStreamValues turns values into an alternating field, value argument
// list ordered by field name, so the same map always produces the same
// sequence. The returned error is always nil.
func flattenStreamValues(values map[string]any) ([]any, error) {
	fields := make([]string, len(values))
	next := 0
	for field := range values {
		fields[next] = field
		next++
	}
	sort.Strings(fields)

	flat := make([]any, 2*len(fields))
	for pos, field := range fields {
		flat[2*pos] = field
		flat[2*pos+1] = values[field]
	}
	return flat, nil
}
// validateRouteStateTimestamp checks that value is usable as a stored route
// timestamp. name is used as the subject of the returned error message.
//
// It rejects, in this order, a zero time, a time whose zone offset from UTC is
// not zero, and a time carrying more than millisecond precision.
func validateRouteStateTimestamp(name string, value time.Time) error {
	if value.IsZero() {
		return fmt.Errorf("%s must not be zero", name)
	}
	// time.Time.Equal compares instants and ignores the location, so comparing
	// value with value.UTC() can never fail. Inspect the zone offset instead.
	if _, offset := value.Zone(); offset != 0 {
		return fmt.Errorf("%s must be UTC", name)
	}
	if !value.Equal(value.Truncate(time.Millisecond)) {
		return fmt.Errorf("%s must use millisecond precision", name)
	}
	return nil
}
// Validate checks that the route key, the notification identifier, and the
// route identifier are all present, reporting the first one that is empty.
func (route ScheduledRoute) Validate() error {
	switch {
	case route.RouteKey == "":
		return fmt.Errorf("scheduled route key must not be empty")
	case route.NotificationID == "":
		return fmt.Errorf("scheduled route notification id must not be empty")
	case route.RouteID == "":
		return fmt.Errorf("scheduled route route id must not be empty")
	}
	return nil
}
// Validate checks every field needed for a published-route transition and
// returns the first problem found: the expected route must be completable, the
// lease token and stream name must be set, the publication time must pass
// validateRouteStateTimestamp, the stream length bound must not be negative,
// and the stream values must pass validateStreamValues.
func (input CompleteRoutePublishedInput) Validate() error {
	if err := validateCompletionRoute(input.ExpectedRoute); err != nil {
		return err
	}
	if input.LeaseToken == "" {
		return fmt.Errorf("lease token must not be empty")
	}
	if err := validateRouteStateTimestamp("published at", input.PublishedAt); err != nil {
		return err
	}
	switch {
	case input.Stream == "":
		return fmt.Errorf("stream must not be empty")
	case input.StreamMaxLen < 0:
		return fmt.Errorf("stream max len must not be negative")
	}
	return validateStreamValues(input.StreamValues)
}
// Validate checks every field needed for a retryable failure transition and
// returns the first problem found: the expected route must be completable, the
// lease token must be set, both timestamps must pass
// validateRouteStateTimestamp, and the failure classification and message must
// be present.
func (input CompleteRouteFailedInput) Validate() error {
	if err := validateCompletionRoute(input.ExpectedRoute); err != nil {
		return err
	}
	if input.LeaseToken == "" {
		return fmt.Errorf("lease token must not be empty")
	}
	if err := validateRouteStateTimestamp("failed at", input.FailedAt); err != nil {
		return err
	}
	if err := validateRouteStateTimestamp("next attempt at", input.NextAttemptAt); err != nil {
		return err
	}
	switch {
	case input.FailureClassification == "":
		return fmt.Errorf("failure classification must not be empty")
	case input.FailureMessage == "":
		return fmt.Errorf("failure message must not be empty")
	}
	return nil
}
// Validate checks every field needed for a dead-letter transition and returns
// the first problem found: the expected route must be completable, the lease
// token must be set, the dead-letter time must pass
// validateRouteStateTimestamp, and the failure classification and message must
// be present. RecoveryHint is optional and is not checked.
func (input CompleteRouteDeadLetterInput) Validate() error {
	if err := validateCompletionRoute(input.ExpectedRoute); err != nil {
		return err
	}
	if input.LeaseToken == "" {
		return fmt.Errorf("lease token must not be empty")
	}
	if err := validateRouteStateTimestamp("dead lettered at", input.DeadLetteredAt); err != nil {
		return err
	}
	switch {
	case input.FailureClassification == "":
		return fmt.Errorf("failure classification must not be empty")
	case input.FailureMessage == "":
		return fmt.Errorf("failure message must not be empty")
	}
	return nil
}