package worker

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"time"

	"galaxy/mail/internal/domain/attempt"
	"galaxy/mail/internal/domain/common"
	deliverydomain "galaxy/mail/internal/domain/delivery"
	"galaxy/mail/internal/logging"
	"galaxy/mail/internal/service/executeattempt"
)

const (
	// defaultSchedulePollInterval is the due-attempt polling cadence used when
	// SchedulerConfig.PollInterval is not positive.
	defaultSchedulePollInterval = 250 * time.Millisecond
	// defaultRecoveryInterval is the stale-claim sweep cadence used when
	// SchedulerConfig.RecoveryInterval is not positive.
	defaultRecoveryInterval = 30 * time.Second
	// defaultRecoveryGrace is the slack added on top of the attempt timeout
	// before an in-progress claim counts as expired, used when
	// SchedulerConfig.RecoveryGrace is not positive.
	defaultRecoveryGrace = 30 * time.Second
)

// AttemptExecutionStore describes the durable state operations used by the
// attempt scheduler.
type AttemptExecutionStore interface {
	// NextDueDeliveryIDs returns up to limit due delivery identifiers.
	NextDueDeliveryIDs(context.Context, time.Time, int64) ([]common.DeliveryID, error)
	// SendingDeliveryIDs returns every delivery currently indexed as sending.
	SendingDeliveryIDs(context.Context) ([]common.DeliveryID, error)
	// LoadWorkItem loads the current delivery and active attempt for deliveryID.
	// The boolean result reports whether the work item exists.
	LoadWorkItem(context.Context, common.DeliveryID) (executeattempt.WorkItem, bool, error)
	// ClaimDueAttempt atomically claims the due scheduled attempt for
	// deliveryID. The boolean result reports whether a claim was obtained.
	ClaimDueAttempt(context.Context, common.DeliveryID, time.Time) (executeattempt.WorkItem, bool, error)
	// RemoveScheduledDelivery removes deliveryID from the attempt schedule set.
	RemoveScheduledDelivery(context.Context, common.DeliveryID) error
}

// AttemptPreparationService prepares queued template deliveries and recovers
// stale claimed attempts.
type AttemptPreparationService interface {
	// Prepare renders one queued template delivery when needed and reports
	// whether the scheduler may continue to claim the attempt.
	Prepare(context.Context, executeattempt.WorkItem) (bool, error)
	// RecoverExpired marks one stale in-progress attempt as timed out.
	RecoverExpired(context.Context, executeattempt.WorkItem) error
}

// SchedulerTelemetry records low-cardinality scheduler-side delivery
// transitions.
type SchedulerTelemetry interface {
	// RecordDeliveryStatusTransition records one durable delivery status
	// transition; the string arguments are the delivery status and source.
	RecordDeliveryStatusTransition(context.Context, string, string)
}

// SchedulerConfig stores the dependencies used by Scheduler.
type SchedulerConfig struct {
	// Store owns the durable scheduled and in-progress attempt state.
	// Required.
	Store AttemptExecutionStore
	// Service prepares queued template deliveries and recovers stale claims.
	// Required.
	Service AttemptPreparationService
	// WorkQueue stores the claimed attempt handoff channel consumed by the
	// attempt worker pool. Required.
	WorkQueue chan<- executeattempt.WorkItem
	// Clock provides the scheduler wall clock. Required.
	Clock Clock
	// AttemptTimeout stores the provider execution budget used to derive claim
	// recovery deadlines. Must be positive.
	AttemptTimeout time.Duration
	// Telemetry records scheduler-side delivery transitions. May be nil, in
	// which case transitions are not recorded.
	Telemetry SchedulerTelemetry
	// PollInterval overrides the default due-attempt polling interval when
	// positive.
	PollInterval time.Duration
	// RecoveryInterval overrides the default stale-claim recovery interval when
	// positive.
	RecoveryInterval time.Duration
	// RecoveryGrace overrides the default stale-claim grace window when
	// positive.
	RecoveryGrace time.Duration
}

// Scheduler polls due attempts, optionally renders queued template
// deliveries, atomically claims runnable work, and recovers stale in-progress
// ownership.
type Scheduler struct {
	store     AttemptExecutionStore          // durable attempt state
	service   AttemptPreparationService      // preparation and recovery logic
	workQueue chan<- executeattempt.WorkItem // handoff to the worker pool
	clock     Clock                          // wall clock source
	// attemptTimeout plus recoveryGrace defines how old an in-progress claim
	// must be before the recovery sweep times it out.
	attemptTimeout   time.Duration
	telemetry        SchedulerTelemetry // may be nil
	pollInterval     time.Duration      // due-attempt polling cadence
	recoveryInterval time.Duration      // stale-claim sweep cadence
	recoveryGrace    time.Duration      // slack beyond attemptTimeout
	logger           *slog.Logger       // pre-annotated with scheduler config
}

// NewScheduler constructs one attempt scheduler.
func NewScheduler(cfg SchedulerConfig, logger *slog.Logger) (*Scheduler, error) { switch { case cfg.Store == nil: return nil, errors.New("new scheduler: nil attempt execution store") case cfg.Service == nil: return nil, errors.New("new scheduler: nil attempt preparation service") case cfg.WorkQueue == nil: return nil, errors.New("new scheduler: nil work queue") case cfg.Clock == nil: return nil, errors.New("new scheduler: nil clock") case cfg.AttemptTimeout <= 0: return nil, errors.New("new scheduler: non-positive attempt timeout") } if logger == nil { logger = slog.Default() } pollInterval := cfg.PollInterval if pollInterval <= 0 { pollInterval = defaultSchedulePollInterval } recoveryInterval := cfg.RecoveryInterval if recoveryInterval <= 0 { recoveryInterval = defaultRecoveryInterval } recoveryGrace := cfg.RecoveryGrace if recoveryGrace <= 0 { recoveryGrace = defaultRecoveryGrace } return &Scheduler{ store: cfg.Store, service: cfg.Service, workQueue: cfg.WorkQueue, clock: cfg.Clock, attemptTimeout: cfg.AttemptTimeout, telemetry: cfg.Telemetry, pollInterval: pollInterval, recoveryInterval: recoveryInterval, recoveryGrace: recoveryGrace, logger: logger.With( "component", "scheduler", "poll_interval", pollInterval.String(), "recovery_interval", recoveryInterval.String(), "recovery_grace", recoveryGrace.String(), ), }, nil } // Run starts the scheduler loop and blocks until ctx is canceled or one // durable state operation fails. 
func (scheduler *Scheduler) Run(ctx context.Context) error { if ctx == nil { return errors.New("run scheduler: nil context") } if err := ctx.Err(); err != nil { return err } if scheduler == nil { return errors.New("run scheduler: nil scheduler") } scheduler.logger.Info("scheduler started") defer scheduler.logger.Info("scheduler stopped") if err := scheduler.recoverExpired(ctx); err != nil { return err } pollTicker := time.NewTicker(scheduler.pollInterval) defer pollTicker.Stop() recoveryTicker := time.NewTicker(scheduler.recoveryInterval) defer recoveryTicker.Stop() for { select { case <-ctx.Done(): return ctx.Err() case <-pollTicker.C: if err := scheduler.dispatchDueAttempts(ctx); err != nil { return err } case <-recoveryTicker.C: if err := scheduler.recoverExpired(ctx); err != nil { return err } } } } // Shutdown stops the scheduler within ctx. The scheduler does not own // additional resources beyond its run loop. func (scheduler *Scheduler) Shutdown(ctx context.Context) error { if ctx == nil { return errors.New("shutdown scheduler: nil context") } if scheduler == nil { return nil } return nil } func (scheduler *Scheduler) dispatchDueAttempts(ctx context.Context) error { for { now := scheduler.clock.Now().UTC().Truncate(time.Millisecond) deliveryIDs, err := scheduler.store.NextDueDeliveryIDs(ctx, now, 1) if err != nil { return fmt.Errorf("dispatch due attempts: %w", err) } if len(deliveryIDs) == 0 { return nil } if err := scheduler.dispatchOne(ctx, deliveryIDs[0], now); err != nil { return err } } } func (scheduler *Scheduler) dispatchOne(ctx context.Context, deliveryID common.DeliveryID, now time.Time) error { workItem, found, err := scheduler.store.LoadWorkItem(ctx, deliveryID) if err != nil { return fmt.Errorf("dispatch due delivery %q: load work item: %w", deliveryID, err) } if !found { if err := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); err != nil { return fmt.Errorf("dispatch due delivery %q: remove stale schedule: %w", deliveryID, err) } 
return nil } if !isSchedulable(workItem) { if err := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); err != nil { return fmt.Errorf("dispatch due delivery %q: remove unschedulable entry: %w", deliveryID, err) } return nil } ready, err := scheduler.service.Prepare(ctx, workItem) if err != nil { return fmt.Errorf("dispatch due delivery %q: prepare attempt: %w", deliveryID, err) } if !ready { return nil } claimed, found, err := scheduler.store.ClaimDueAttempt(ctx, deliveryID, now) if err != nil { return fmt.Errorf("dispatch due delivery %q: claim attempt: %w", deliveryID, err) } if !found { return nil } scheduler.recordStatusTransition(ctx, claimed.Delivery) select { case <-ctx.Done(): return ctx.Err() case scheduler.workQueue <- claimed: logArgs := logging.DeliveryAttemptAttrs(claimed.Delivery, claimed.Attempt) logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...) scheduler.logger.Debug("attempt claimed", logArgs...) return nil } } func (scheduler *Scheduler) recoverExpired(ctx context.Context) error { now := scheduler.clock.Now().UTC().Truncate(time.Millisecond) deadline := now.Add(-(scheduler.attemptTimeout + scheduler.recoveryGrace)) deliveryIDs, err := scheduler.store.SendingDeliveryIDs(ctx) if err != nil { return fmt.Errorf("recover expired attempts: %w", err) } for _, deliveryID := range deliveryIDs { workItem, found, err := scheduler.store.LoadWorkItem(ctx, deliveryID) if err != nil { return fmt.Errorf("recover expired delivery %q: load work item: %w", deliveryID, err) } if !found || !isRecoverable(workItem) || workItem.Attempt.StartedAt == nil { continue } if workItem.Attempt.StartedAt.After(deadline) { continue } if err := scheduler.service.RecoverExpired(ctx, workItem); err != nil { return fmt.Errorf("recover expired delivery %q: %w", deliveryID, err) } logArgs := logging.DeliveryAttemptAttrs(workItem.Delivery, workItem.Attempt) logArgs = append(logArgs, "started_at", workItem.Attempt.StartedAt) logArgs = append(logArgs, 
logging.TraceAttrsFromContext(ctx)...) scheduler.logger.Warn("attempt claim expired", logArgs...) } return nil } func (scheduler *Scheduler) recordStatusTransition(ctx context.Context, record deliverydomain.Delivery) { if scheduler == nil || scheduler.telemetry == nil { return } scheduler.telemetry.RecordDeliveryStatusTransition(ctx, string(record.Status), string(record.Source)) } func isSchedulable(item executeattempt.WorkItem) bool { if item.Delivery.AttemptCount != item.Attempt.AttemptNo { return false } switch item.Delivery.Status { case deliverydomain.StatusQueued, deliverydomain.StatusRendered: default: return false } return item.Attempt.Status == attempt.StatusScheduled } func isRecoverable(item executeattempt.WorkItem) bool { if item.Delivery.AttemptCount != item.Attempt.AttemptNo { return false } if item.Delivery.Status != deliverydomain.StatusSending { return false } return item.Attempt.Status == attempt.StatusInProgress }