Files
galaxy-game/mail/internal/worker/scheduler.go
T
2026-04-17 18:39:16 +02:00

348 lines
10 KiB
Go

package worker
import (
"context"
"errors"
"fmt"
"log/slog"
"time"
"galaxy/mail/internal/domain/attempt"
"galaxy/mail/internal/domain/common"
deliverydomain "galaxy/mail/internal/domain/delivery"
"galaxy/mail/internal/logging"
"galaxy/mail/internal/service/executeattempt"
)
const (
	// defaultSchedulePollInterval is the due-attempt polling cadence used
	// when SchedulerConfig.PollInterval is unset or non-positive.
	defaultSchedulePollInterval = 250 * time.Millisecond
	// defaultRecoveryInterval is the stale-claim recovery cadence used when
	// SchedulerConfig.RecoveryInterval is unset or non-positive.
	defaultRecoveryInterval = 30 * time.Second
	// defaultRecoveryGrace is the extra window added to the attempt timeout
	// before an in-progress claim is treated as stale, used when
	// SchedulerConfig.RecoveryGrace is unset or non-positive.
	defaultRecoveryGrace = 30 * time.Second
)
// AttemptExecutionStore describes the durable state operations used by the
// attempt scheduler.
// AttemptExecutionStore describes the durable state operations used by the
// attempt scheduler.
type AttemptExecutionStore interface {
	// NextDueDeliveryIDs returns up to limit delivery identifiers that are
	// due as of now.
	NextDueDeliveryIDs(ctx context.Context, now time.Time, limit int64) ([]common.DeliveryID, error)
	// SendingDeliveryIDs returns every delivery currently indexed as sending.
	SendingDeliveryIDs(ctx context.Context) ([]common.DeliveryID, error)
	// LoadWorkItem loads the current delivery and active attempt for
	// deliveryID. The boolean reports whether the delivery was found.
	LoadWorkItem(ctx context.Context, deliveryID common.DeliveryID) (executeattempt.WorkItem, bool, error)
	// ClaimDueAttempt atomically claims the due scheduled attempt for
	// deliveryID as of now. The boolean reports whether a claim succeeded.
	ClaimDueAttempt(ctx context.Context, deliveryID common.DeliveryID, now time.Time) (executeattempt.WorkItem, bool, error)
	// RemoveScheduledDelivery removes deliveryID from the attempt schedule set.
	RemoveScheduledDelivery(ctx context.Context, deliveryID common.DeliveryID) error
}
// AttemptPreparationService prepares queued template deliveries and recovers
// stale claimed attempts.
// AttemptPreparationService prepares queued template deliveries and recovers
// stale claimed attempts.
type AttemptPreparationService interface {
	// Prepare renders one queued template delivery when needed and reports
	// whether the scheduler may continue to claim the attempt.
	Prepare(ctx context.Context, item executeattempt.WorkItem) (bool, error)
	// RecoverExpired marks one stale in-progress attempt as timed out.
	RecoverExpired(ctx context.Context, item executeattempt.WorkItem) error
}
// SchedulerTelemetry records low-cardinality scheduler-side delivery
// transitions.
// SchedulerTelemetry records low-cardinality scheduler-side delivery
// transitions.
type SchedulerTelemetry interface {
	// RecordDeliveryStatusTransition records one durable delivery status
	// transition for the given delivery status and source.
	RecordDeliveryStatusTransition(ctx context.Context, status string, source string)
}
// SchedulerConfig stores the dependencies used by Scheduler.
// SchedulerConfig stores the dependencies used by Scheduler.
type SchedulerConfig struct {
	// Store owns the durable scheduled and in-progress attempt state.
	// Required.
	Store AttemptExecutionStore
	// Service prepares queued template deliveries and recovers stale claims.
	// Required.
	Service AttemptPreparationService
	// WorkQueue stores the claimed attempt handoff channel consumed by the
	// attempt worker pool. Required.
	WorkQueue chan<- executeattempt.WorkItem
	// Clock provides the scheduler wall clock. Required.
	Clock Clock
	// AttemptTimeout stores the provider execution budget used to derive claim
	// recovery deadlines. Must be positive.
	AttemptTimeout time.Duration
	// Telemetry records scheduler-side delivery transitions. Optional; nil
	// disables telemetry.
	Telemetry SchedulerTelemetry
	// PollInterval overrides the default due-attempt polling interval when
	// positive.
	PollInterval time.Duration
	// RecoveryInterval overrides the default stale-claim recovery interval when
	// positive.
	RecoveryInterval time.Duration
	// RecoveryGrace overrides the default stale-claim grace window when
	// positive.
	RecoveryGrace time.Duration
}
// Scheduler polls due attempts, optionally renders queued template
// deliveries, atomically claims runnable work, and recovers stale in-progress
// ownership.
// Scheduler polls due attempts, optionally renders queued template
// deliveries, atomically claims runnable work, and recovers stale in-progress
// ownership.
type Scheduler struct {
	// store owns the durable scheduled and in-progress attempt state.
	store AttemptExecutionStore
	// service prepares queued deliveries and recovers stale claims.
	service AttemptPreparationService
	// workQueue hands claimed attempts to the attempt worker pool.
	workQueue chan<- executeattempt.WorkItem
	// clock provides the scheduler wall clock.
	clock Clock
	// attemptTimeout is the provider execution budget used to derive claim
	// recovery deadlines.
	attemptTimeout time.Duration
	// telemetry records delivery transitions; may be nil.
	telemetry SchedulerTelemetry
	// pollInterval is the due-attempt polling cadence (always positive after
	// NewScheduler).
	pollInterval time.Duration
	// recoveryInterval is the stale-claim recovery cadence.
	recoveryInterval time.Duration
	// recoveryGrace extends the attempt timeout before a claim is stale.
	recoveryGrace time.Duration
	// logger carries the component and interval attributes set in NewScheduler.
	logger *slog.Logger
}
// NewScheduler constructs one attempt scheduler.
// NewScheduler constructs one attempt scheduler. It validates the required
// configuration fields, substitutes package defaults for unset intervals, and
// returns an error when any required dependency is missing.
func NewScheduler(cfg SchedulerConfig, logger *slog.Logger) (*Scheduler, error) {
	if cfg.Store == nil {
		return nil, errors.New("new scheduler: nil attempt execution store")
	}
	if cfg.Service == nil {
		return nil, errors.New("new scheduler: nil attempt preparation service")
	}
	if cfg.WorkQueue == nil {
		return nil, errors.New("new scheduler: nil work queue")
	}
	if cfg.Clock == nil {
		return nil, errors.New("new scheduler: nil clock")
	}
	if cfg.AttemptTimeout <= 0 {
		return nil, errors.New("new scheduler: non-positive attempt timeout")
	}
	if logger == nil {
		logger = slog.Default()
	}
	// Substitute the package default for any unset (non-positive) interval.
	orDefault := func(value, fallback time.Duration) time.Duration {
		if value > 0 {
			return value
		}
		return fallback
	}
	pollEvery := orDefault(cfg.PollInterval, defaultSchedulePollInterval)
	recoverEvery := orDefault(cfg.RecoveryInterval, defaultRecoveryInterval)
	grace := orDefault(cfg.RecoveryGrace, defaultRecoveryGrace)
	return &Scheduler{
		store:            cfg.Store,
		service:          cfg.Service,
		workQueue:        cfg.WorkQueue,
		clock:            cfg.Clock,
		attemptTimeout:   cfg.AttemptTimeout,
		telemetry:        cfg.Telemetry,
		pollInterval:     pollEvery,
		recoveryInterval: recoverEvery,
		recoveryGrace:    grace,
		logger: logger.With(
			"component", "scheduler",
			"poll_interval", pollEvery.String(),
			"recovery_interval", recoverEvery.String(),
			"recovery_grace", grace.String(),
		),
	}, nil
}
// Run starts the scheduler loop and blocks until ctx is canceled or one
// durable state operation fails.
// Run starts the scheduler loop and blocks until ctx is canceled or one
// durable state operation fails. It recovers stale claims once up front, then
// alternates between due-attempt dispatch and periodic stale-claim recovery.
func (scheduler *Scheduler) Run(ctx context.Context) error {
	// Guard the receiver before anything else: previously this check ran
	// after the ctx checks, which was fragile ordering for a method that is
	// about to dereference its receiver.
	if scheduler == nil {
		return errors.New("run scheduler: nil scheduler")
	}
	if ctx == nil {
		return errors.New("run scheduler: nil context")
	}
	if err := ctx.Err(); err != nil {
		return err
	}
	scheduler.logger.Info("scheduler started")
	defer scheduler.logger.Info("scheduler stopped")
	// Recover immediately so claims left stale by a previous process do not
	// have to wait a full recovery interval.
	if err := scheduler.recoverExpired(ctx); err != nil {
		return err
	}
	pollTicker := time.NewTicker(scheduler.pollInterval)
	defer pollTicker.Stop()
	recoveryTicker := time.NewTicker(scheduler.recoveryInterval)
	defer recoveryTicker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-pollTicker.C:
			if err := scheduler.dispatchDueAttempts(ctx); err != nil {
				return err
			}
		case <-recoveryTicker.C:
			if err := scheduler.recoverExpired(ctx); err != nil {
				return err
			}
		}
	}
}
// Shutdown stops the scheduler within ctx. The scheduler does not own
// additional resources beyond its run loop.
// Shutdown stops the scheduler within ctx. The scheduler does not own
// additional resources beyond its run loop, so there is nothing to release.
func (scheduler *Scheduler) Shutdown(ctx context.Context) error {
	switch {
	case ctx == nil:
		return errors.New("shutdown scheduler: nil context")
	case scheduler == nil:
		return nil
	default:
		return nil
	}
}
// dispatchDueAttempts drains due deliveries one at a time until the schedule
// reports no more work, dispatching each through dispatchOne.
func (scheduler *Scheduler) dispatchDueAttempts(ctx context.Context) error {
	for {
		// Re-read the clock each iteration so long drains stay accurate.
		tick := scheduler.clock.Now().UTC().Truncate(time.Millisecond)
		due, err := scheduler.store.NextDueDeliveryIDs(ctx, tick, 1)
		if err != nil {
			return fmt.Errorf("dispatch due attempts: %w", err)
		}
		if len(due) == 0 {
			return nil
		}
		if dispatchErr := scheduler.dispatchOne(ctx, due[0], tick); dispatchErr != nil {
			return dispatchErr
		}
	}
}
// dispatchOne loads, prepares, claims, and enqueues the due attempt for one
// delivery. Missing or unschedulable deliveries are pruned from the schedule
// set; a lost claim race is not an error.
func (scheduler *Scheduler) dispatchOne(ctx context.Context, deliveryID common.DeliveryID, now time.Time) error {
	item, exists, err := scheduler.store.LoadWorkItem(ctx, deliveryID)
	if err != nil {
		return fmt.Errorf("dispatch due delivery %q: load work item: %w", deliveryID, err)
	}
	if !exists {
		// The schedule index references a delivery that no longer exists.
		if removeErr := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); removeErr != nil {
			return fmt.Errorf("dispatch due delivery %q: remove stale schedule: %w", deliveryID, removeErr)
		}
		return nil
	}
	if !isSchedulable(item) {
		// Status/attempt invariants no longer allow scheduling; prune it.
		if removeErr := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); removeErr != nil {
			return fmt.Errorf("dispatch due delivery %q: remove unschedulable entry: %w", deliveryID, removeErr)
		}
		return nil
	}
	ready, err := scheduler.service.Prepare(ctx, item)
	if err != nil {
		return fmt.Errorf("dispatch due delivery %q: prepare attempt: %w", deliveryID, err)
	}
	if !ready {
		return nil
	}
	claimed, ok, err := scheduler.store.ClaimDueAttempt(ctx, deliveryID, now)
	if err != nil {
		return fmt.Errorf("dispatch due delivery %q: claim attempt: %w", deliveryID, err)
	}
	if !ok {
		// Another owner won the claim; nothing to dispatch.
		return nil
	}
	scheduler.recordStatusTransition(ctx, claimed.Delivery)
	// Hand off to the worker pool, but never block past cancellation.
	select {
	case <-ctx.Done():
		return ctx.Err()
	case scheduler.workQueue <- claimed:
	}
	attrs := logging.DeliveryAttemptAttrs(claimed.Delivery, claimed.Attempt)
	attrs = append(attrs, logging.TraceAttrsFromContext(ctx)...)
	scheduler.logger.Debug("attempt claimed", attrs...)
	return nil
}
// recoverExpired scans every delivery indexed as sending and times out any
// in-progress attempt that started before the recovery cutoff (attempt
// timeout plus grace window before now).
func (scheduler *Scheduler) recoverExpired(ctx context.Context) error {
	now := scheduler.clock.Now().UTC().Truncate(time.Millisecond)
	cutoff := now.Add(-(scheduler.attemptTimeout + scheduler.recoveryGrace))
	candidates, err := scheduler.store.SendingDeliveryIDs(ctx)
	if err != nil {
		return fmt.Errorf("recover expired attempts: %w", err)
	}
	for _, id := range candidates {
		item, exists, loadErr := scheduler.store.LoadWorkItem(ctx, id)
		if loadErr != nil {
			return fmt.Errorf("recover expired delivery %q: load work item: %w", id, loadErr)
		}
		if !exists || !isRecoverable(item) {
			continue
		}
		startedAt := item.Attempt.StartedAt
		// Only attempts with a known start time older than the cutoff are
		// considered stale.
		if startedAt == nil || startedAt.After(cutoff) {
			continue
		}
		if recoverErr := scheduler.service.RecoverExpired(ctx, item); recoverErr != nil {
			return fmt.Errorf("recover expired delivery %q: %w", id, recoverErr)
		}
		attrs := logging.DeliveryAttemptAttrs(item.Delivery, item.Attempt)
		attrs = append(attrs, "started_at", startedAt)
		attrs = append(attrs, logging.TraceAttrsFromContext(ctx)...)
		scheduler.logger.Warn("attempt claim expired", attrs...)
	}
	return nil
}
// recordStatusTransition forwards one delivery status transition to the
// configured telemetry sink; it is a no-op when telemetry is absent.
func (scheduler *Scheduler) recordStatusTransition(ctx context.Context, record deliverydomain.Delivery) {
	if scheduler == nil {
		return
	}
	sink := scheduler.telemetry
	if sink == nil {
		return
	}
	sink.RecordDeliveryStatusTransition(ctx, string(record.Status), string(record.Source))
}
// isSchedulable reports whether item's attempt may be claimed: the active
// attempt is the delivery's current attempt, the attempt is scheduled, and
// the delivery is queued or rendered.
func isSchedulable(item executeattempt.WorkItem) bool {
	if item.Attempt.Status != attempt.StatusScheduled {
		return false
	}
	if item.Delivery.AttemptCount != item.Attempt.AttemptNo {
		return false
	}
	return item.Delivery.Status == deliverydomain.StatusQueued ||
		item.Delivery.Status == deliverydomain.StatusRendered
}
// isRecoverable reports whether item represents a claim that may be timed
// out: the active attempt is the delivery's current attempt, the delivery is
// sending, and the attempt is in progress.
func isRecoverable(item executeattempt.WorkItem) bool {
	return item.Delivery.AttemptCount == item.Attempt.AttemptNo &&
		item.Delivery.Status == deliverydomain.StatusSending &&
		item.Attempt.Status == attempt.StatusInProgress
}