feat: edge gateway service

2026-04-02 19:18:42 +02:00
parent 8cde99936c
commit 436c97a38b
95 changed files with 20504 additions and 57 deletions
@@ -0,0 +1,178 @@
+// Package app wires the gateway process lifecycle and coordinates component
+// startup and graceful shutdown.
+package app
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+
+	"galaxy/gateway/internal/config"
+)
+
+// Component is a long-lived gateway subsystem that participates in coordinated
+// startup and graceful shutdown.
+//
+// App.Run invokes Run for every component in its own goroutine and later calls
+// Shutdown on every component, including ones whose Run has already returned.
+type Component interface {
+	// Run starts the component and blocks until it stops. The context it
+	// receives is canceled once the application begins shutting down.
+	Run(context.Context) error
+
+	// Shutdown stops the component within the provided timeout-bounded context.
+	// It may be called while Run is still blocked.
+	Shutdown(context.Context) error
+}
+
+// App owns the process-level lifecycle of the gateway and its registered
+// components.
+type App struct {
+	// cfg supplies ShutdownTimeout, which bounds each component Shutdown call
+	// and the wait for component Run calls to return.
+	cfg        config.Config
+	// components holds the subsystems started by Run, in registration order.
+	components []Component
+}
+
+// New constructs an App with a defensive copy of the supplied components.
+func New(cfg config.Config, components ...Component) *App {
+	clonedComponents := append([]Component(nil), components...)
+
+	return &App{
+		cfg:        cfg,
+		components: clonedComponents,
+	}
+}
+
+// Run launches every registered component in its own goroutine and then blocks
+// until either ctx is done or the first component returns. After that it
+// cancels the components' run context, calls Shutdown on every component, and
+// waits (within the shutdown budget) for the Run goroutines to finish.
+//
+// The returned error joins the classified first component exit, any Shutdown
+// failures, and a timeout error if components did not return in time. It is nil
+// when all three are nil. A nil ctx or an invalid App is rejected before any
+// goroutine is started.
+func (a *App) Run(ctx context.Context) error {
+	if ctx == nil {
+		return errors.New("run gateway app: nil context")
+	}
+
+	err := a.validate()
+	if err != nil {
+		return err
+	}
+
+	total := len(a.components)
+	if total == 0 {
+		// Nothing to supervise: park until the caller cancels.
+		<-ctx.Done()
+		return nil
+	}
+
+	runCtx, stop := context.WithCancel(ctx)
+	defer stop()
+
+	// The buffer holds one slot per component so no Run goroutine ever blocks
+	// on send, even though only the first result is consumed below.
+	exits := make(chan componentResult, total)
+
+	var running sync.WaitGroup
+	running.Add(total)
+
+	for position, target := range a.components {
+		go func(position int, target Component) {
+			defer running.Done()
+
+			exitErr := target.Run(runCtx)
+			exits <- componentResult{index: position, err: exitErr}
+		}(position, target)
+	}
+
+	var firstErr error
+
+	select {
+	case first := <-exits:
+		firstErr = classifyComponentResult(ctx, first)
+	case <-ctx.Done():
+	}
+
+	// Signal every still-running component before asking it to shut down.
+	stop()
+
+	stopErr := a.shutdownComponents()
+	lingerErr := a.waitForComponents(&running)
+
+	return errors.Join(firstErr, stopErr, lingerErr)
+}
+
+// componentResult records the outcome of a single component's Run call. Every
+// component goroutine sends one, but App.Run consumes only the first to arrive.
+type componentResult struct {
+	// index is the component's position in App.components.
+	index int
+	// err is the value returned by the component's Run method.
+	err   error
+}
+
+// validate rejects configurations that would make Run unsafe: a non-positive
+// shutdown timeout or a nil entry among the registered components. It is called
+// before any component goroutine is started.
+func (a *App) validate() error {
+	if timeout := a.cfg.ShutdownTimeout; timeout <= 0 {
+		return fmt.Errorf("run gateway app: shutdown timeout must be positive, got %s", timeout)
+	}
+
+	for position := range a.components {
+		if a.components[position] == nil {
+			return fmt.Errorf("run gateway app: component %d is nil", position)
+		}
+	}
+
+	return nil
+}
+
+// classifyComponentResult maps the first component exit into the error that
+// should control the application result.
+//
+// When parentCtx is already done the exit is attributed to the parent-driven
+// stop: a nil error, context.Canceled, and context.DeadlineExceeded all yield
+// nil. DeadlineExceeded is included because a parent context that ends by
+// deadline propagates that error to the derived run context, and App.Run
+// already treats any parent Done as a clean stop.
+//
+// When parentCtx is still live, a nil exit is reported as an unexpected early
+// exit. Any other error is wrapped with the component index.
+func classifyComponentResult(parentCtx context.Context, result componentResult) error {
+	parentDone := parentCtx.Err() != nil
+
+	switch {
+	case result.err == nil && parentDone:
+		return nil
+	case result.err == nil:
+		return fmt.Errorf("run gateway app: component %d exited without error before shutdown", result.index)
+	case parentDone && (errors.Is(result.err, context.Canceled) || errors.Is(result.err, context.DeadlineExceeded)):
+		return nil
+	default:
+		return fmt.Errorf("run gateway app: component %d: %w", result.index, result.err)
+	}
+}
+
+// shutdownComponents calls Shutdown on every registered component concurrently,
+// giving each call its own context bounded by cfg.ShutdownTimeout, and returns
+// the joined shutdown failures (nil when every component shut down cleanly).
+//
+// Failures are stored by component index, so the joined error lists them in
+// registration order regardless of which Shutdown call finished first.
+func (a *App) shutdownComponents() error {
+	errs := make([]error, len(a.components))
+
+	var shutdownWG sync.WaitGroup
+
+	for idx, component := range a.components {
+		shutdownWG.Add(1)
+
+		go func(index int, component Component) {
+			defer shutdownWG.Done()
+
+			// A fresh background context is used because the run context has
+			// already been canceled by the time shutdown starts.
+			shutdownCtx, cancel := context.WithTimeout(context.Background(), a.cfg.ShutdownTimeout)
+			defer cancel()
+
+			if err := component.Shutdown(shutdownCtx); err != nil {
+				// Each goroutine writes only its own slot, so no locking is needed.
+				errs[index] = fmt.Errorf("shutdown gateway component %d: %w", index, err)
+			}
+		}(idx, component)
+	}
+
+	shutdownWG.Wait()
+
+	// errors.Join discards nil entries and returns nil when none remain.
+	return errors.Join(errs...)
+}
+
+// waitForComponents blocks until every component Run goroutine tracked by runWG
+// has returned, or until cfg.ShutdownTimeout elapses, whichever comes first. It
+// returns nil in the former case and a wrapped deadline error in the latter.
+func (a *App) waitForComponents(runWG *sync.WaitGroup) error {
+	budget, release := context.WithTimeout(context.Background(), a.cfg.ShutdownTimeout)
+	defer release()
+
+	// Bridge the WaitGroup to a channel so it can take part in the select.
+	finished := make(chan struct{})
+	go func() {
+		defer close(finished)
+		runWG.Wait()
+	}()
+
+	select {
+	case <-budget.Done():
+		return fmt.Errorf("wait for gateway components: %w", budget.Err())
+	case <-finished:
+	}
+
+	return nil
+}
@@ -0,0 +1,268 @@
+package app
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"testing"
+	"time"
+
+	"galaxy/gateway/internal/config"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestAppRunWaitsForCancellationWithoutComponents checks that an App with no
+// components keeps Run blocked while the context is live and returns nil once
+// the context is canceled.
+func TestAppRunWaitsForCancellationWithoutComponents(t *testing.T) {
+	t.Parallel()
+
+	ctx, stop := context.WithCancel(context.Background())
+	defer stop()
+
+	gateway := New(config.Config{ShutdownTimeout: 50 * time.Millisecond})
+
+	outcome := make(chan error, 1)
+	go func() {
+		outcome <- gateway.Run(ctx)
+	}()
+
+	// Run must still be blocked while the context has not been canceled.
+	select {
+	case <-time.After(50 * time.Millisecond):
+	case err := <-outcome:
+		require.FailNowf(t, "Run() returned early", "error=%v", err)
+	}
+
+	stop()
+
+	select {
+	case <-time.After(time.Second):
+		require.FailNow(t, "Run() did not return after cancellation")
+	case err := <-outcome:
+		require.NoError(t, err)
+	}
+}
+
+// TestAppRunCancelsComponentsAndCallsShutdownOnce checks that canceling the
+// parent context makes Run return nil, that every component's Run exits, and
+// that each component receives exactly one Shutdown call.
+func TestAppRunCancelsComponentsAndCallsShutdownOnce(t *testing.T) {
+	t.Parallel()
+
+	doubles := []*lifecycleComponent{
+		newLifecycleComponent(),
+		newLifecycleComponent(),
+	}
+
+	gateway := New(
+		config.Config{ShutdownTimeout: time.Second},
+		doubles[0],
+		doubles[1],
+	)
+
+	ctx, stop := context.WithCancel(context.Background())
+	defer stop()
+
+	outcome := make(chan error, 1)
+	go func() {
+		outcome <- gateway.Run(ctx)
+	}()
+
+	for _, double := range doubles {
+		double.waitStarted(t)
+	}
+
+	stop()
+
+	select {
+	case <-time.After(time.Second):
+		require.FailNow(t, "Run() did not return after cancellation")
+	case err := <-outcome:
+		require.NoError(t, err)
+	}
+
+	for _, double := range doubles {
+		double.waitRunExited(t)
+	}
+
+	for _, double := range doubles {
+		assert.Equal(t, 1, double.shutdownCalls())
+	}
+}
+
+// TestAppRunReturnsComponentErrorAndStillShutsDown checks that when one
+// component's Run fails, App.Run returns an error wrapping that failure and
+// still shuts down every component, including the one that failed.
+func TestAppRunReturnsComponentErrorAndStillShutsDown(t *testing.T) {
+	t.Parallel()
+
+	wantErr := errors.New("boom")
+	broken := newFailingComponent(wantErr)
+	healthy := newLifecycleComponent()
+
+	gateway := New(config.Config{ShutdownTimeout: time.Second}, broken, healthy)
+
+	ctx, stop := context.WithCancel(context.Background())
+	defer stop()
+
+	outcome := make(chan error, 1)
+	go func() {
+		outcome <- gateway.Run(ctx)
+	}()
+
+	broken.waitStarted(t)
+	healthy.waitStarted(t)
+
+	// Both components are running; now let the failing one return its error.
+	broken.releaseRun()
+
+	select {
+	case <-time.After(time.Second):
+		require.FailNow(t, "Run() did not return after component failure")
+	case err := <-outcome:
+		require.Error(t, err)
+		assert.ErrorIs(t, err, wantErr)
+	}
+
+	broken.waitRunExited(t)
+	healthy.waitRunExited(t)
+
+	assert.Equal(t, 1, broken.shutdownCalls())
+	assert.Equal(t, 1, healthy.shutdownCalls())
+}
+
+// lifecycleComponent is a test double whose Run returns only after its context
+// is done and Shutdown has been called at least once.
+type lifecycleComponent struct {
+	startedCh   chan struct{}
+	runDoneCh   chan struct{}
+	stopCh      chan struct{}
+	shutdownMu  sync.Mutex
+	shutdownCnt int
+}
+
+// newLifecycleComponent returns a lifecycleComponent with all of its signalling
+// channels allocated and open.
+func newLifecycleComponent() *lifecycleComponent {
+	c := new(lifecycleComponent)
+	c.startedCh = make(chan struct{})
+	c.runDoneCh = make(chan struct{})
+	c.stopCh = make(chan struct{})
+
+	return c
+}
+
+// Run signals that it has started, then waits first for ctx to be done and
+// then for Shutdown to close the stop channel. It always returns nil and closes
+// runDoneCh on the way out.
+func (c *lifecycleComponent) Run(ctx context.Context) error {
+	close(c.startedCh)
+	defer close(c.runDoneCh)
+
+	<-ctx.Done()
+	<-c.stopCh
+
+	return nil
+}
+
+// Shutdown counts the invocation and, on the first call only, closes the stop
+// channel so Run can return.
+func (c *lifecycleComponent) Shutdown(context.Context) error {
+	c.shutdownMu.Lock()
+	c.shutdownCnt++
+	firstCall := c.shutdownCnt == 1
+	c.shutdownMu.Unlock()
+
+	// Only the first caller sees firstCall, so the channel is closed once.
+	if firstCall {
+		close(c.stopCh)
+	}
+
+	return nil
+}
+
+// waitStarted fails the test unless Run has started within one second.
+func (c *lifecycleComponent) waitStarted(t *testing.T) {
+	t.Helper()
+
+	deadline := time.After(time.Second)
+
+	select {
+	case <-deadline:
+		require.FailNow(t, "component did not start")
+	case <-c.startedCh:
+	}
+}
+
+// waitRunExited fails the test unless Run has returned within one second.
+func (c *lifecycleComponent) waitRunExited(t *testing.T) {
+	t.Helper()
+
+	deadline := time.After(time.Second)
+
+	select {
+	case <-deadline:
+		require.FailNow(t, "component run did not exit")
+	case <-c.runDoneCh:
+	}
+}
+
+// shutdownCalls reports how many times Shutdown has been invoked so far.
+func (c *lifecycleComponent) shutdownCalls() int {
+	c.shutdownMu.Lock()
+	calls := c.shutdownCnt
+	c.shutdownMu.Unlock()
+
+	return calls
+}
+
+// failingComponent is a test double whose Run returns a preset error once the
+// test releases it. It also counts Shutdown calls.
+type failingComponent struct {
+	startedCh   chan struct{}
+	releaseCh   chan struct{}
+	runDoneCh   chan struct{}
+	shutdownMu  sync.Mutex
+	shutdownCnt int
+	err         error
+}
+
+// newFailingComponent returns a failingComponent whose Run yields err after
+// releaseRun is called.
+func newFailingComponent(err error) *failingComponent {
+	c := &failingComponent{err: err}
+	c.startedCh = make(chan struct{})
+	c.releaseCh = make(chan struct{})
+	c.runDoneCh = make(chan struct{})
+
+	return c
+}
+
+// Run signals that it has started, blocks until the test calls releaseRun, and
+// then returns the configured error. It ignores its context and closes
+// runDoneCh on the way out.
+func (c *failingComponent) Run(context.Context) error {
+	close(c.startedCh)
+	defer close(c.runDoneCh)
+
+	<-c.releaseCh
+
+	return c.err
+}
+
+// Shutdown counts the invocation and always succeeds.
+func (c *failingComponent) Shutdown(context.Context) error {
+	c.shutdownMu.Lock()
+	c.shutdownCnt++
+	c.shutdownMu.Unlock()
+
+	return nil
+}
+
+// waitStarted fails the test unless Run has started within one second.
+func (c *failingComponent) waitStarted(t *testing.T) {
+	t.Helper()
+
+	deadline := time.After(time.Second)
+
+	select {
+	case <-deadline:
+		require.FailNow(t, "failing component did not start")
+	case <-c.startedCh:
+	}
+}
+
+// releaseRun unblocks Run so it returns the configured error.
+func (c *failingComponent) releaseRun() {
+	close(c.releaseCh)
+}
+
+// waitRunExited fails the test unless Run has returned within one second.
+func (c *failingComponent) waitRunExited(t *testing.T) {
+	t.Helper()
+
+	deadline := time.After(time.Second)
+
+	select {
+	case <-deadline:
+		require.FailNow(t, "failing component run did not exit")
+	case <-c.runDoneCh:
+	}
+}
+
+// shutdownCalls reports how many times Shutdown has been invoked so far.
+func (c *failingComponent) shutdownCalls() int {
+	c.shutdownMu.Lock()
+	calls := c.shutdownCnt
+	c.shutdownMu.Unlock()
+
+	return calls
+}