feat: mail service
This commit is contained in:
@@ -0,0 +1,347 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"galaxy/mail/internal/domain/attempt"
|
||||
"galaxy/mail/internal/domain/common"
|
||||
deliverydomain "galaxy/mail/internal/domain/delivery"
|
||||
"galaxy/mail/internal/logging"
|
||||
"galaxy/mail/internal/service/executeattempt"
|
||||
)
|
||||
|
||||
// Default scheduler timings, applied by NewScheduler whenever the
// corresponding SchedulerConfig override is zero or negative.
const (
	// defaultSchedulePollInterval is the default cadence for polling due
	// attempts.
	defaultSchedulePollInterval = 250 * time.Millisecond
	// defaultRecoveryInterval is the default cadence for scanning for stale
	// in-progress claims.
	defaultRecoveryInterval = 30 * time.Second
	// defaultRecoveryGrace is the default extra window added on top of the
	// attempt timeout before a claim is treated as expired.
	defaultRecoveryGrace = 30 * time.Second
)
|
||||
|
||||
// AttemptExecutionStore describes the durable state operations used by the
// attempt scheduler.
type AttemptExecutionStore interface {
	// NextDueDeliveryIDs returns up to limit due delivery identifiers.
	NextDueDeliveryIDs(context.Context, time.Time, int64) ([]common.DeliveryID, error)

	// SendingDeliveryIDs returns every delivery currently indexed as sending.
	SendingDeliveryIDs(context.Context) ([]common.DeliveryID, error)

	// LoadWorkItem loads the current delivery and active attempt for
	// deliveryID. The boolean result reports whether the item exists.
	LoadWorkItem(context.Context, common.DeliveryID) (executeattempt.WorkItem, bool, error)

	// ClaimDueAttempt atomically claims the due scheduled attempt for
	// deliveryID. The boolean result reports whether a claim was obtained;
	// a false result with a nil error means the attempt was not claimable.
	ClaimDueAttempt(context.Context, common.DeliveryID, time.Time) (executeattempt.WorkItem, bool, error)

	// RemoveScheduledDelivery removes deliveryID from the attempt schedule set.
	RemoveScheduledDelivery(context.Context, common.DeliveryID) error
}
|
||||
|
||||
// AttemptPreparationService prepares queued template deliveries and recovers
// stale claimed attempts.
type AttemptPreparationService interface {
	// Prepare renders one queued template delivery when needed and reports
	// whether the scheduler may continue to claim the attempt. A false
	// result with a nil error leaves the schedule entry in place for a
	// later poll.
	Prepare(context.Context, executeattempt.WorkItem) (bool, error)

	// RecoverExpired marks one stale in-progress attempt as timed out.
	RecoverExpired(context.Context, executeattempt.WorkItem) error
}
|
||||
|
||||
// SchedulerTelemetry records low-cardinality scheduler-side delivery
// transitions.
type SchedulerTelemetry interface {
	// RecordDeliveryStatusTransition records one durable delivery status
	// transition. The string arguments carry the delivery status and
	// source labels respectively.
	RecordDeliveryStatusTransition(context.Context, string, string)
}
|
||||
|
||||
// SchedulerConfig stores the dependencies used by Scheduler. Store, Service,
// WorkQueue, Clock, and a positive AttemptTimeout are required; everything
// else is optional.
type SchedulerConfig struct {
	// Store owns the durable scheduled and in-progress attempt state.
	// Required.
	Store AttemptExecutionStore

	// Service prepares queued template deliveries and recovers stale claims.
	// Required.
	Service AttemptPreparationService

	// WorkQueue stores the claimed attempt handoff channel consumed by the
	// attempt worker pool. Required.
	WorkQueue chan<- executeattempt.WorkItem

	// Clock provides the scheduler wall clock. Required.
	Clock Clock

	// AttemptTimeout stores the provider execution budget used to derive claim
	// recovery deadlines. Must be positive.
	AttemptTimeout time.Duration

	// Telemetry records scheduler-side delivery transitions. Optional; when
	// nil, transitions are not recorded.
	Telemetry SchedulerTelemetry

	// PollInterval overrides the default due-attempt polling interval when
	// positive.
	PollInterval time.Duration

	// RecoveryInterval overrides the default stale-claim recovery interval when
	// positive.
	RecoveryInterval time.Duration

	// RecoveryGrace overrides the default stale-claim grace window when
	// positive.
	RecoveryGrace time.Duration
}
|
||||
|
||||
// Scheduler polls due attempts, optionally renders queued template
// deliveries, atomically claims runnable work, and recovers stale in-progress
// ownership.
type Scheduler struct {
	// store owns the durable scheduled and in-progress attempt state.
	store AttemptExecutionStore
	// service prepares queued deliveries and recovers stale claims.
	service AttemptPreparationService
	// workQueue receives claimed attempts for the worker pool.
	workQueue chan<- executeattempt.WorkItem
	// clock supplies the scheduler wall clock.
	clock Clock
	// attemptTimeout is the provider execution budget folded into the
	// recovery deadline.
	attemptTimeout time.Duration
	// telemetry is optional; nil disables transition recording.
	telemetry SchedulerTelemetry
	// pollInterval is the due-attempt polling cadence.
	pollInterval time.Duration
	// recoveryInterval is the stale-claim recovery cadence.
	recoveryInterval time.Duration
	// recoveryGrace is the extra window beyond attemptTimeout before an
	// in-progress claim is treated as stale.
	recoveryGrace time.Duration
	// logger is the component-scoped structured logger.
	logger *slog.Logger
}
|
||||
|
||||
// NewScheduler constructs one attempt scheduler.
|
||||
func NewScheduler(cfg SchedulerConfig, logger *slog.Logger) (*Scheduler, error) {
|
||||
switch {
|
||||
case cfg.Store == nil:
|
||||
return nil, errors.New("new scheduler: nil attempt execution store")
|
||||
case cfg.Service == nil:
|
||||
return nil, errors.New("new scheduler: nil attempt preparation service")
|
||||
case cfg.WorkQueue == nil:
|
||||
return nil, errors.New("new scheduler: nil work queue")
|
||||
case cfg.Clock == nil:
|
||||
return nil, errors.New("new scheduler: nil clock")
|
||||
case cfg.AttemptTimeout <= 0:
|
||||
return nil, errors.New("new scheduler: non-positive attempt timeout")
|
||||
}
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
pollInterval := cfg.PollInterval
|
||||
if pollInterval <= 0 {
|
||||
pollInterval = defaultSchedulePollInterval
|
||||
}
|
||||
|
||||
recoveryInterval := cfg.RecoveryInterval
|
||||
if recoveryInterval <= 0 {
|
||||
recoveryInterval = defaultRecoveryInterval
|
||||
}
|
||||
|
||||
recoveryGrace := cfg.RecoveryGrace
|
||||
if recoveryGrace <= 0 {
|
||||
recoveryGrace = defaultRecoveryGrace
|
||||
}
|
||||
|
||||
return &Scheduler{
|
||||
store: cfg.Store,
|
||||
service: cfg.Service,
|
||||
workQueue: cfg.WorkQueue,
|
||||
clock: cfg.Clock,
|
||||
attemptTimeout: cfg.AttemptTimeout,
|
||||
telemetry: cfg.Telemetry,
|
||||
pollInterval: pollInterval,
|
||||
recoveryInterval: recoveryInterval,
|
||||
recoveryGrace: recoveryGrace,
|
||||
logger: logger.With(
|
||||
"component", "scheduler",
|
||||
"poll_interval", pollInterval.String(),
|
||||
"recovery_interval", recoveryInterval.String(),
|
||||
"recovery_grace", recoveryGrace.String(),
|
||||
),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Run starts the scheduler loop and blocks until ctx is canceled or one
|
||||
// durable state operation fails.
|
||||
func (scheduler *Scheduler) Run(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("run scheduler: nil context")
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if scheduler == nil {
|
||||
return errors.New("run scheduler: nil scheduler")
|
||||
}
|
||||
|
||||
scheduler.logger.Info("scheduler started")
|
||||
defer scheduler.logger.Info("scheduler stopped")
|
||||
|
||||
if err := scheduler.recoverExpired(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
pollTicker := time.NewTicker(scheduler.pollInterval)
|
||||
defer pollTicker.Stop()
|
||||
|
||||
recoveryTicker := time.NewTicker(scheduler.recoveryInterval)
|
||||
defer recoveryTicker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-pollTicker.C:
|
||||
if err := scheduler.dispatchDueAttempts(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
case <-recoveryTicker.C:
|
||||
if err := scheduler.recoverExpired(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown stops the scheduler within ctx. The scheduler does not own
|
||||
// additional resources beyond its run loop.
|
||||
func (scheduler *Scheduler) Shutdown(ctx context.Context) error {
|
||||
if ctx == nil {
|
||||
return errors.New("shutdown scheduler: nil context")
|
||||
}
|
||||
if scheduler == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (scheduler *Scheduler) dispatchDueAttempts(ctx context.Context) error {
|
||||
for {
|
||||
now := scheduler.clock.Now().UTC().Truncate(time.Millisecond)
|
||||
deliveryIDs, err := scheduler.store.NextDueDeliveryIDs(ctx, now, 1)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dispatch due attempts: %w", err)
|
||||
}
|
||||
if len(deliveryIDs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := scheduler.dispatchOne(ctx, deliveryIDs[0], now); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (scheduler *Scheduler) dispatchOne(ctx context.Context, deliveryID common.DeliveryID, now time.Time) error {
|
||||
workItem, found, err := scheduler.store.LoadWorkItem(ctx, deliveryID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dispatch due delivery %q: load work item: %w", deliveryID, err)
|
||||
}
|
||||
if !found {
|
||||
if err := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); err != nil {
|
||||
return fmt.Errorf("dispatch due delivery %q: remove stale schedule: %w", deliveryID, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if !isSchedulable(workItem) {
|
||||
if err := scheduler.store.RemoveScheduledDelivery(ctx, deliveryID); err != nil {
|
||||
return fmt.Errorf("dispatch due delivery %q: remove unschedulable entry: %w", deliveryID, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
ready, err := scheduler.service.Prepare(ctx, workItem)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dispatch due delivery %q: prepare attempt: %w", deliveryID, err)
|
||||
}
|
||||
if !ready {
|
||||
return nil
|
||||
}
|
||||
|
||||
claimed, found, err := scheduler.store.ClaimDueAttempt(ctx, deliveryID, now)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dispatch due delivery %q: claim attempt: %w", deliveryID, err)
|
||||
}
|
||||
if !found {
|
||||
return nil
|
||||
}
|
||||
scheduler.recordStatusTransition(ctx, claimed.Delivery)
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case scheduler.workQueue <- claimed:
|
||||
logArgs := logging.DeliveryAttemptAttrs(claimed.Delivery, claimed.Attempt)
|
||||
logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
|
||||
scheduler.logger.Debug("attempt claimed", logArgs...)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func (scheduler *Scheduler) recoverExpired(ctx context.Context) error {
|
||||
now := scheduler.clock.Now().UTC().Truncate(time.Millisecond)
|
||||
deadline := now.Add(-(scheduler.attemptTimeout + scheduler.recoveryGrace))
|
||||
|
||||
deliveryIDs, err := scheduler.store.SendingDeliveryIDs(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("recover expired attempts: %w", err)
|
||||
}
|
||||
|
||||
for _, deliveryID := range deliveryIDs {
|
||||
workItem, found, err := scheduler.store.LoadWorkItem(ctx, deliveryID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("recover expired delivery %q: load work item: %w", deliveryID, err)
|
||||
}
|
||||
if !found || !isRecoverable(workItem) || workItem.Attempt.StartedAt == nil {
|
||||
continue
|
||||
}
|
||||
if workItem.Attempt.StartedAt.After(deadline) {
|
||||
continue
|
||||
}
|
||||
|
||||
if err := scheduler.service.RecoverExpired(ctx, workItem); err != nil {
|
||||
return fmt.Errorf("recover expired delivery %q: %w", deliveryID, err)
|
||||
}
|
||||
|
||||
logArgs := logging.DeliveryAttemptAttrs(workItem.Delivery, workItem.Attempt)
|
||||
logArgs = append(logArgs, "started_at", workItem.Attempt.StartedAt)
|
||||
logArgs = append(logArgs, logging.TraceAttrsFromContext(ctx)...)
|
||||
scheduler.logger.Warn("attempt claim expired", logArgs...)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (scheduler *Scheduler) recordStatusTransition(ctx context.Context, record deliverydomain.Delivery) {
|
||||
if scheduler == nil || scheduler.telemetry == nil {
|
||||
return
|
||||
}
|
||||
|
||||
scheduler.telemetry.RecordDeliveryStatusTransition(ctx, string(record.Status), string(record.Source))
|
||||
}
|
||||
|
||||
func isSchedulable(item executeattempt.WorkItem) bool {
|
||||
if item.Delivery.AttemptCount != item.Attempt.AttemptNo {
|
||||
return false
|
||||
}
|
||||
switch item.Delivery.Status {
|
||||
case deliverydomain.StatusQueued, deliverydomain.StatusRendered:
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
return item.Attempt.Status == attempt.StatusScheduled
|
||||
}
|
||||
|
||||
func isRecoverable(item executeattempt.WorkItem) bool {
|
||||
if item.Delivery.AttemptCount != item.Attempt.AttemptNo {
|
||||
return false
|
||||
}
|
||||
if item.Delivery.Status != deliverydomain.StatusSending {
|
||||
return false
|
||||
}
|
||||
|
||||
return item.Attempt.Status == attempt.StatusInProgress
|
||||
}
|
||||
Reference in New Issue
Block a user