package worker

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"time"

	"galaxy/mail/internal/domain/attempt"
	"galaxy/mail/internal/domain/common"
	deliverydomain "galaxy/mail/internal/domain/delivery"
	"galaxy/mail/internal/logging"
	"galaxy/mail/internal/service/executeattempt"
)

const (
	defaultSchedulePollInterval = 250 * time.Millisecond
	defaultRecoveryInterval     = 30 * time.Second
	defaultRecoveryGrace        = 30 * time.Second
)

// AttemptExecutionStore describes the durable state operations used by the
// attempt scheduler.
type AttemptExecutionStore interface {
	// NextDueDeliveryIDs returns up to limit due delivery identifiers.
	NextDueDeliveryIDs(context.Context, time.Time, int64) ([]common.DeliveryID, error)

	// SendingDeliveryIDs returns every delivery currently indexed as sending.
	SendingDeliveryIDs(context.Context) ([]common.DeliveryID, error)

	// LoadWorkItem loads the current delivery and active attempt for deliveryID.
	LoadWorkItem(context.Context, common.DeliveryID) (executeattempt.WorkItem, bool, error)

	// ClaimDueAttempt atomically claims the due scheduled attempt for
	// deliveryID.
	ClaimDueAttempt(context.Context, common.DeliveryID, time.Time) (executeattempt.WorkItem, bool, error)

	// RemoveScheduledDelivery removes deliveryID from the attempt schedule set.
	RemoveScheduledDelivery(context.Context, common.DeliveryID) error
}

// AttemptPreparationService prepares queued template deliveries and recovers
// stale claimed attempts.
type AttemptPreparationService interface {
	// Prepare renders one queued template delivery when needed and reports
	// whether the scheduler may continue to claim the attempt.
	Prepare(context.Context, executeattempt.WorkItem) (bool, error)

	// RecoverExpired marks one stale in-progress attempt as timed out.
	RecoverExpired(context.Context, executeattempt.WorkItem) error
}

// SchedulerTelemetry records low-cardinality scheduler-side delivery
// transitions.
type SchedulerTelemetry interface {
	// RecordDeliveryStatusTransition records one durable delivery status
	// transition.
	RecordDeliveryStatusTransition(context.Context, string, string)
}

// SchedulerConfig stores the dependencies used by Scheduler.
type SchedulerConfig struct {
	// Store owns the durable scheduled and in-progress attempt state.
	Store AttemptExecutionStore

	// Service prepares queued template deliveries and recovers stale claims.
	Service AttemptPreparationService

	// WorkQueue stores the claimed attempt handoff channel consumed by the
	// attempt worker pool.
	WorkQueue chan<- executeattempt.WorkItem

	// Clock provides the scheduler wall clock.
	Clock Clock

	// AttemptTimeout stores the provider execution budget used to derive claim
	// recovery deadlines.
	AttemptTimeout time.Duration

	// Telemetry records scheduler-side delivery transitions. It may be nil,
	// in which case transitions are not recorded.
	Telemetry SchedulerTelemetry

	// PollInterval overrides the default due-attempt polling interval when
	// positive.
	PollInterval time.Duration

	// RecoveryInterval overrides the default stale-claim recovery interval
	// when positive.
	RecoveryInterval time.Duration

	// RecoveryGrace overrides the default stale-claim grace window when
	// positive.
	RecoveryGrace time.Duration
}

// Scheduler polls due attempts, optionally renders queued template
// deliveries, atomically claims runnable work, and recovers stale
// in-progress ownership.
type Scheduler struct {
	store            AttemptExecutionStore
	service          AttemptPreparationService
	workQueue        chan<- executeattempt.WorkItem
	clock            Clock
	attemptTimeout   time.Duration
	telemetry        SchedulerTelemetry
	pollInterval     time.Duration
	recoveryInterval time.Duration
	recoveryGrace    time.Duration
	logger           *slog.Logger
}

// NewScheduler constructs one attempt scheduler.
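//
// A minimal wiring sketch (store, service, clock, the queue size, and the
// 10s timeout are illustrative caller-supplied placeholders, not package
// defaults):
//
//	queue := make(chan executeattempt.WorkItem, 64)
//	scheduler, err := NewScheduler(SchedulerConfig{
//		Store:          store,
//		Service:        service,
//		WorkQueue:      queue,
//		Clock:          clock,
//		AttemptTimeout: 10 * time.Second,
//	}, slog.Default())
//	if err != nil {
//		// handle invalid configuration
//	}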
func NewScheduler(cfg SchedulerConfig, logger *slog.Logger) (*Scheduler, error) {
	switch {
	case cfg.Store == nil:
		return nil, errors.New("new scheduler: nil attempt execution store")
	case cfg.Service == nil:
		return nil, errors.New("new scheduler: nil attempt preparation service")
	case cfg.WorkQueue == nil:
		return nil, errors.New("new scheduler: nil work queue")
	case cfg.Clock == nil:
		return nil, errors.New("new scheduler: nil clock")
	case cfg.AttemptTimeout <= 0:
		return nil, errors.New("new scheduler: non-positive attempt timeout")
	}
	if logger == nil {
		logger = slog.Default()
	}

	pollInterval := cfg.PollInterval
	if pollInterval <= 0 {
		pollInterval = defaultSchedulePollInterval
	}

	recoveryInterval := cfg.RecoveryInterval
	if recoveryInterval <= 0 {
		recoveryInterval = defaultRecoveryInterval
	}

	recoveryGrace := cfg.RecoveryGrace
	if recoveryGrace <= 0 {
		recoveryGrace = defaultRecoveryGrace
	}

	return &Scheduler{
		store:            cfg.Store,
		service:          cfg.Service,
		workQueue:        cfg.WorkQueue,
		clock:            cfg.Clock,
		attemptTimeout:   cfg.AttemptTimeout,
		telemetry:        cfg.Telemetry,
		pollInterval:     pollInterval,
		recoveryInterval: recoveryInterval,
		recoveryGrace:    recoveryGrace,
		logger: logger.With(
			"component", "scheduler",
			"poll_interval", pollInterval.String(),
			"recovery_interval", recoveryInterval.String(),
			"recovery_grace", recoveryGrace.String(),
		),
	}, nil
}

// Run starts the scheduler loop and blocks until ctx is canceled or one
// durable state operation fails.
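//
// A typical launch sketch (the golang.org/x/sync/errgroup package is an
// assumed caller-side choice, not a dependency of this package):
//
//	group, groupCtx := errgroup.WithContext(ctx)
//	group.Go(func() error { return scheduler.Run(groupCtx) })
//	// ... start workers draining the work queue ...
//	if err := group.Wait(); err != nil && !errors.Is(err, context.Canceled) {
//		// the scheduler hit a durable state failure
//	}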
func (scheduler *Scheduler) Run(ctx context.Context) error {
	if scheduler == nil {
		return errors.New("run scheduler: nil scheduler")
	}
	if ctx == nil {
		return errors.New("run scheduler: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}

	scheduler.logger.Info("scheduler started")
	defer scheduler.logger.Info("scheduler stopped")

	if err := scheduler.recoverExpired(ctx); err != nil {
		return err
	}

	pollTicker := time.NewTicker(scheduler.pollInterval)
	defer pollTicker.Stop()

	recoveryTicker := time.NewTicker(scheduler.recoveryInterval)
	defer recoveryTicker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-pollTicker.C:
			if err := scheduler.dispatchDueAttempts(ctx); err != nil {
				return err
			}
		case <-recoveryTicker.C:
			if err := scheduler.recoverExpired(ctx); err != nil {
				return err
			}
		}
	}
}

// Shutdown stops the scheduler within ctx. The scheduler owns no resources
// beyond its run loop, which exits when the context passed to Run is
// canceled, so Shutdown has nothing to release and returns immediately.
func (scheduler *Scheduler) Shutdown(ctx context.Context) error {
	if ctx == nil {
		return errors.New("shutdown scheduler: nil context")
	}
	if scheduler == nil {
		return nil
	}

	return nil
}
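
// dispatchDueAttempts drains the due schedule one delivery at a time,
// returning once no due entries remain or a durable state operation fails.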
func (scheduler *Scheduler) dispatchDueAttempts(ctx context.Context) error {
	for {
		now := scheduler.clock.Now().UTC().Truncate(time.Millisecond)
		deliveryIDs, err := scheduler.store.NextDueDeliveryIDs(ctx, now, 1)
		if err != nil {
			return fmt.Errorf("dispatch due attempts: %w", err)
		}
		if len(deliveryIDs) == 0 {
			return nil
		}

		if err := scheduler.dispatchOne(ctx, deliveryIDs[0], now); err != nil {
			return err
		}
	}
}
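
// dispatchOne loads, validates, prepares, and claims the due attempt for
// deliveryID, then hands the claimed work item to the worker pool. Missing
// or unschedulable deliveries are removed from the schedule instead.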
func (scheduler *Scheduler) dispatchOne(ctx context.Context, deliveryID common.DeliveryID, now time.Time) error {
	workItem, found, err := scheduler.store.LoadWorkItem(ctx, deliveryID)
	if err != nil {
		return fmt.Errorf("dispatch due delivery %q: load work item: %w", deliveryID, err)
	}
	if !found {
		if err := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); err != nil {
			return fmt.Errorf("dispatch due delivery %q: remove stale schedule: %w", deliveryID, err)
		}
		return nil
	}
	if !isSchedulable(workItem) {
		if err := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); err != nil {
			return fmt.Errorf("dispatch due delivery %q: remove unschedulable entry: %w", deliveryID, err)
		}
		return nil
	}

	ready, err := scheduler.service.Prepare(ctx, workItem)
	if err != nil {
		return fmt.Errorf("dispatch due delivery %q: prepare attempt: %w", deliveryID, err)
	}
	if !ready {
		return nil
	}

	claimed, found, err := scheduler.store.ClaimDueAttempt(ctx, deliveryID, now)
	if err != nil {
		return fmt.Errorf("dispatch due delivery %q: claim attempt: %w", deliveryID, err)
	}
	if !found {
		return nil
	}
	scheduler.recordStatusTransition(ctx, claimed.Delivery)

	select {
	case <-ctx.Done():
		return ctx.Err()
	case scheduler.workQueue <- claimed:
		logArgs := logging.DeliveryAttemptAttrs(claimed.Delivery, claimed.Attempt)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		scheduler.logger.Debug("attempt claimed", logArgs...)
		return nil
	}
}
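
// recoverExpired times out attempts whose claim is older than the attempt
// timeout plus the recovery grace window. For example, with a (hypothetical)
// 10s AttemptTimeout and the default 30s grace, an attempt started more than
// 40 seconds before now is handed to the service for recovery.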
func (scheduler *Scheduler) recoverExpired(ctx context.Context) error {
	now := scheduler.clock.Now().UTC().Truncate(time.Millisecond)
	deadline := now.Add(-(scheduler.attemptTimeout + scheduler.recoveryGrace))

	deliveryIDs, err := scheduler.store.SendingDeliveryIDs(ctx)
	if err != nil {
		return fmt.Errorf("recover expired attempts: %w", err)
	}

	for _, deliveryID := range deliveryIDs {
		workItem, found, err := scheduler.store.LoadWorkItem(ctx, deliveryID)
		if err != nil {
			return fmt.Errorf("recover expired delivery %q: load work item: %w", deliveryID, err)
		}
		if !found || !isRecoverable(workItem) || workItem.Attempt.StartedAt == nil {
			continue
		}
		if workItem.Attempt.StartedAt.After(deadline) {
			continue
		}

		if err := scheduler.service.RecoverExpired(ctx, workItem); err != nil {
			return fmt.Errorf("recover expired delivery %q: %w", deliveryID, err)
		}

		logArgs := logging.DeliveryAttemptAttrs(workItem.Delivery, workItem.Attempt)
		logArgs = append(logArgs, "started_at", workItem.Attempt.StartedAt)
		logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
		scheduler.logger.Warn("attempt claim expired", logArgs...)
	}

	return nil
}
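
// recordStatusTransition forwards one delivery status transition to the
// configured telemetry sink, if any.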
func (scheduler *Scheduler) recordStatusTransition(ctx context.Context, record deliverydomain.Delivery) {
	if scheduler == nil || scheduler.telemetry == nil {
		return
	}

	scheduler.telemetry.RecordDeliveryStatusTransition(ctx, string(record.Status), string(record.Source))
}
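
// isSchedulable reports whether the work item still matches its schedule
// entry: the attempt is the delivery's current one and still scheduled, and
// the delivery is queued or rendered.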
func isSchedulable(item executeattempt.WorkItem) bool {
	if item.Delivery.AttemptCount != item.Attempt.AttemptNo {
		return false
	}
	switch item.Delivery.Status {
	case deliverydomain.StatusQueued, deliverydomain.StatusRendered:
	default:
		return false
	}

	return item.Attempt.Status == attempt.StatusScheduled
}
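
// isRecoverable reports whether the work item represents a live claim: the
// attempt is the delivery's current one and in progress while the delivery
// is sending.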
func isRecoverable(item executeattempt.WorkItem) bool {
	if item.Delivery.AttemptCount != item.Attempt.AttemptNo {
		return false
	}
	if item.Delivery.Status != deliverydomain.StatusSending {
		return false
	}

	return item.Attempt.Status == attempt.StatusInProgress
}