// Package stopruntime implements the `stop` lifecycle operation owned by
// Runtime Manager. The service is the single orchestrator behind both
// the asynchronous `runtime:stop_jobs` consumer and the synchronous
// `POST /api/v1/internal/runtimes/{game_id}/stop` REST handler. It is
// also the inner stop step of the restart and patch services, which
// call Run while holding the outer per-game lease.
//
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
// §Lifecycles → Stop`. Design rationale is captured in
// `rtmanager/docs/services.md`.
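//
// A minimal caller-side sketch of the Handle contract (the variable names and
// the StopReason / OpSource values below are illustrative assumptions, not
// values defined in this file):
//
//	result, err := svc.Handle(ctx, stopruntime.Input{
//		GameID:    gameID,
//		Reason:    stopruntime.ReasonAdminStop,    // hypothetical StopReason value
//		OpSource:  operation.OpSourceRESTInternal, // hypothetical OpSource value
//		SourceRef: requestID,
//	})
//	if err != nil {
//		return err // non-business failure only (nil context / nil receiver)
//	}
//	if result.Outcome == operation.OutcomeFailure {
//		// map result.ErrorCode / result.ErrorMessage to the transport response
//	}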
package stopruntime
import (
"context"
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"galaxy/rtmanager/internal/config"
"galaxy/rtmanager/internal/domain/health"
"galaxy/rtmanager/internal/domain/operation"
"galaxy/rtmanager/internal/domain/runtime"
"galaxy/rtmanager/internal/logging"
"galaxy/rtmanager/internal/ports"
"galaxy/rtmanager/internal/service/startruntime"
"galaxy/rtmanager/internal/telemetry"
)
// leaseReleaseTimeout bounds the deferred lease-release call. A fresh
// background context is used so the release runs even when the request
// context was already canceled.
const leaseReleaseTimeout = 5 * time.Second
// Input stores the per-call arguments for one stop operation.
type Input struct {
// GameID identifies the platform game to stop.
GameID string
// Reason classifies the trigger of the stop. Required.
Reason StopReason
// OpSource classifies how the request entered Runtime Manager.
// Required: every operation_log entry carries an op_source.
OpSource operation.OpSource
// SourceRef stores the optional opaque per-source reference (Redis
// Stream entry id, REST request id, admin user id). Empty when the
// caller does not provide one. For inner calls invoked by the
// restart and patch orchestrators it carries the outer correlation
// id so the three operation_log entries share it.
SourceRef string
}
// Validate checks that input carries the structural invariants the service
// requires and reports the first violation as an error.
func (input Input) Validate() error {
if strings.TrimSpace(input.GameID) == "" {
return fmt.Errorf("game id must not be empty")
}
if !input.OpSource.IsKnown() {
return fmt.Errorf("op source %q is unsupported", input.OpSource)
}
if err := input.Reason.Validate(); err != nil {
return err
}
return nil
}
// Result stores the deterministic outcome of one Handle / Run call.
type Result struct {
// Record carries the runtime record installed by the operation.
// Populated on success and on idempotent replay; zero on failure.
Record runtime.RuntimeRecord
// Outcome reports whether the operation completed (success) or
// produced a stable failure code.
Outcome operation.Outcome
// ErrorCode stores the stable error code on failure, or
// `replay_no_op` on idempotent replay. Empty for fresh successes.
ErrorCode string
// ErrorMessage stores the operator-readable detail on failure.
// Empty for successes.
ErrorMessage string
}
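
// Callers typically branch on Result as follows (a sketch; the mapping to a
// transport-level response is an assumption, not part of this package):
//
//	switch {
//	case result.Outcome == operation.OutcomeFailure:
//		// surface result.ErrorCode / result.ErrorMessage to the caller
//	case result.ErrorCode == startruntime.ErrorCodeReplayNoOp:
//		// idempotent replay: result.Record already reflects the terminal state
//	default:
//		// fresh success: result.Record carries the updated runtime record
//	}
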
// Dependencies groups the collaborators required by Service.
type Dependencies struct {
// RuntimeRecords reads and updates the durable runtime record.
RuntimeRecords ports.RuntimeRecordStore
// OperationLogs records the success / failure audit entry.
OperationLogs ports.OperationLogStore
// Docker drives the Docker daemon (container stop).
Docker ports.DockerClient
// Leases serialises operations against the same game id.
Leases ports.GameLeaseStore
// HealthEvents publishes `runtime:health_events` and upserts the
// matching `health_snapshots` row. Used on the vanished-container
// path to emit `container_disappeared`.
HealthEvents ports.HealthEventPublisher
// Container groups the per-container settings consumed at stop time
// (the graceful stop timeout).
Container config.ContainerConfig
// Coordination supplies the per-game lease TTL.
Coordination config.CoordinationConfig
// Telemetry records stop outcomes and lease latency. Required.
Telemetry *telemetry.Runtime
// Logger records structured service-level events. Defaults to
// `slog.Default()` when nil.
Logger *slog.Logger
// Clock supplies the wall-clock used for operation timestamps.
// Defaults to `time.Now` when nil.
Clock func() time.Time
// NewToken supplies a unique opaque lease token. Defaults to a
// 32-byte random base64url string when nil. Tests may override.
NewToken func() string
}
// Service executes the stop lifecycle operation.
type Service struct {
runtimeRecords ports.RuntimeRecordStore
operationLogs ports.OperationLogStore
docker ports.DockerClient
leases ports.GameLeaseStore
healthEvents ports.HealthEventPublisher
stopTimeout time.Duration
leaseTTL time.Duration
telemetry *telemetry.Runtime
logger *slog.Logger
clock func() time.Time
newToken func() string
}
// NewService constructs one Service from deps.
func NewService(deps Dependencies) (*Service, error) {
switch {
case deps.RuntimeRecords == nil:
return nil, errors.New("new stop runtime service: nil runtime records")
case deps.OperationLogs == nil:
return nil, errors.New("new stop runtime service: nil operation logs")
case deps.Docker == nil:
return nil, errors.New("new stop runtime service: nil docker client")
case deps.Leases == nil:
return nil, errors.New("new stop runtime service: nil lease store")
case deps.HealthEvents == nil:
return nil, errors.New("new stop runtime service: nil health events publisher")
case deps.Telemetry == nil:
return nil, errors.New("new stop runtime service: nil telemetry runtime")
}
if err := deps.Container.Validate(); err != nil {
return nil, fmt.Errorf("new stop runtime service: container config: %w", err)
}
if err := deps.Coordination.Validate(); err != nil {
return nil, fmt.Errorf("new stop runtime service: coordination config: %w", err)
}
clock := deps.Clock
if clock == nil {
clock = time.Now
}
logger := deps.Logger
if logger == nil {
logger = slog.Default()
}
logger = logger.With("service", "rtmanager.stopruntime")
newToken := deps.NewToken
if newToken == nil {
newToken = defaultTokenGenerator()
}
return &Service{
runtimeRecords: deps.RuntimeRecords,
operationLogs: deps.OperationLogs,
docker: deps.Docker,
leases: deps.Leases,
healthEvents: deps.HealthEvents,
stopTimeout: deps.Container.StopTimeout,
leaseTTL: deps.Coordination.GameLeaseTTL,
telemetry: deps.Telemetry,
logger: logger,
clock: clock,
newToken: newToken,
}, nil
}
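
// A wiring sketch for composition roots (the adapter variables below are
// assumptions; the concrete port implementations live elsewhere in rtmanager):
//
//	svc, err := stopruntime.NewService(stopruntime.Dependencies{
//		RuntimeRecords: recordStore,     // ports.RuntimeRecordStore
//		OperationLogs:  operationLogs,   // ports.OperationLogStore
//		Docker:         dockerClient,    // ports.DockerClient
//		Leases:         leaseStore,      // ports.GameLeaseStore
//		HealthEvents:   healthPublisher, // ports.HealthEventPublisher
//		Container:      cfg.Container,
//		Coordination:   cfg.Coordination,
//		Telemetry:      telemetryRuntime,
//		Logger:         logger,
//	})
//	if err != nil {
//		return nil, err
//	}
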
// Handle executes one stop operation end-to-end. The Go-level error
// return is reserved for non-business failures (nil context, nil
// receiver). Every business outcome — success, idempotent replay, or
// any of the stable failure modes — flows through Result.
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("stop runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("stop runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}
token := service.newToken()
leaseStart := service.clock()
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
}), nil
}
if !acquired {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeConflict,
errorMessage: "another lifecycle operation is in progress for this game",
}), nil
}
defer service.releaseLease(ctx, input.GameID, token)
return service.runUnderLease(ctx, input, opStartedAt)
}
// Run executes the stop lifecycle assuming the per-game lease is
// already held by the caller. The method is reserved for orchestrator
// services in `internal/service/` that compose stop with another
// operation under a single outer lease (restart and patch). External
// callers must use Handle.
func (service *Service) Run(ctx context.Context, input Input) (Result, error) {
if service == nil {
return Result{}, errors.New("stop runtime: nil service")
}
if ctx == nil {
return Result{}, errors.New("stop runtime: nil context")
}
opStartedAt := service.clock().UTC()
if err := input.Validate(); err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInvalidRequest,
errorMessage: err.Error(),
}), nil
}
return service.runUnderLease(ctx, input, opStartedAt)
}
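
// An orchestrator-side sketch of the Run contract (the surrounding restart
// flow is an illustrative assumption; the real orchestrators live in
// internal/service/):
//
//	// The outer service already holds the per-game lease for the whole
//	// restart and passes its correlation id as SourceRef so the inner stop
//	// shares it in operation_log.
//	stopResult, err := stopSvc.Run(ctx, stopruntime.Input{
//		GameID:    gameID,
//		Reason:    reason,
//		OpSource:  opSource,
//		SourceRef: correlationID,
//	})
//	if err != nil || stopResult.Outcome == operation.OutcomeFailure {
//		// abort the restart; the outer service releases the lease
//	}
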
// runUnderLease executes the post-validation, lease-protected stop
// steps shared by Handle and Run.
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
}), nil
}
switch existing.Status {
case runtime.StatusStopped, runtime.StatusRemoved:
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
case runtime.StatusRunning:
// proceed
default:
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("runtime record has unsupported status %q", existing.Status),
}), nil
}
if err := service.docker.Stop(ctx, existing.CurrentContainerID, service.stopTimeout); err != nil {
if errors.Is(err, ports.ErrContainerNotFound) {
return service.handleVanished(ctx, input, opStartedAt, existing), nil
}
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeServiceUnavailable,
errorMessage: fmt.Sprintf("docker stop: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
updateNow := service.clock().UTC()
err = service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: input.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: existing.CurrentContainerID,
To: runtime.StatusStopped,
Now: updateNow,
})
if errors.Is(err, runtime.ErrConflict) {
// CAS race: a concurrent reconciler / restart already moved the
// record. The desired terminal state was reached by another path.
return service.recordReplayNoOp(ctx, opStartedAt, input, existing), nil
}
if errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeNotFound,
errorMessage: fmt.Sprintf("runtime record for game %q vanished mid-stop", input.GameID),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
if err != nil {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("update runtime status: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
}), nil
}
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
record := existing
record.Status = runtime.StatusStopped
stoppedAt := updateNow
record.StoppedAt = &stoppedAt
record.LastOpAt = updateNow
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stopped", logArgs...)
return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}, nil
}
// handleVanished records the success outcome for the case where docker
// stop reports the container as already gone. It updates the record to
// removed, publishes container_disappeared, and returns success.
func (service *Service) handleVanished(ctx context.Context, input Input, opStartedAt time.Time, existing runtime.RuntimeRecord) Result {
updateNow := service.clock().UTC()
err := service.runtimeRecords.UpdateStatus(ctx, ports.UpdateStatusInput{
GameID: input.GameID,
ExpectedFrom: runtime.StatusRunning,
ExpectedContainerID: existing.CurrentContainerID,
To: runtime.StatusRemoved,
Now: updateNow,
})
if errors.Is(err, runtime.ErrConflict) {
return service.recordReplayNoOp(ctx, opStartedAt, input, existing)
}
if err != nil && !errors.Is(err, runtime.ErrNotFound) {
return service.recordFailure(ctx, failureCtx{
opStartedAt: opStartedAt,
input: input,
errorCode: startruntime.ErrorCodeInternal,
errorMessage: fmt.Sprintf("update runtime status to removed: %s", err.Error()),
containerID: existing.CurrentContainerID,
imageRef: existing.CurrentImageRef,
})
}
service.bestEffortPublishHealth(ctx, ports.HealthEventEnvelope{
GameID: input.GameID,
ContainerID: existing.CurrentContainerID,
EventType: health.EventTypeContainerDisappeared,
OccurredAt: updateNow,
Details: emptyHealthDetails(),
})
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
service.telemetry.RecordHealthEvent(ctx, string(health.EventTypeContainerDisappeared))
record := existing
record.Status = runtime.StatusRemoved
record.CurrentContainerID = ""
removedAt := updateNow
record.RemovedAt = &removedAt
record.LastOpAt = updateNow
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stop on vanished container", logArgs...)
return Result{
Record: record,
Outcome: operation.OutcomeSuccess,
}
}
// recordReplayNoOp records the idempotent replay outcome and returns the
// existing record unchanged.
func (service *Service) recordReplayNoOp(ctx context.Context, opStartedAt time.Time, input Input, existing runtime.RuntimeRecord) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: input.GameID,
OpKind: operation.OpKindStop,
OpSource: input.OpSource,
SourceRef: input.SourceRef,
ImageRef: existing.CurrentImageRef,
ContainerID: existing.CurrentContainerID,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
StartedAt: opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeSuccess), string(input.Reason), string(input.OpSource))
logArgs := []any{
"game_id", input.GameID,
"container_id", existing.CurrentContainerID,
"reason", string(input.Reason),
"op_source", string(input.OpSource),
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.InfoContext(ctx, "runtime stop replay no-op", logArgs...)
return Result{
Record: existing,
Outcome: operation.OutcomeSuccess,
ErrorCode: startruntime.ErrorCodeReplayNoOp,
}
}
// failureCtx groups the inputs to recordFailure so its call sites stay
// readable.
type failureCtx struct {
opStartedAt time.Time
input Input
errorCode string
errorMessage string
containerID string
imageRef string
}
// recordFailure records the failure operation_log entry and emits
// telemetry. The runtime record stays untouched.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
finishedAt := service.clock().UTC()
service.bestEffortAppend(ctx, operation.OperationEntry{
GameID: fc.input.GameID,
OpKind: operation.OpKindStop,
OpSource: fc.input.OpSource,
SourceRef: fc.input.SourceRef,
ImageRef: fc.imageRef,
ContainerID: fc.containerID,
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
StartedAt: fc.opStartedAt,
FinishedAt: &finishedAt,
})
service.telemetry.RecordStopOutcome(ctx, string(operation.OutcomeFailure), string(fc.input.Reason), string(fc.input.OpSource))
logArgs := []any{
"game_id", fc.input.GameID,
"reason", string(fc.input.Reason),
"op_source", string(fc.input.OpSource),
"error_code", fc.errorCode,
"error_message", fc.errorMessage,
}
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
service.logger.WarnContext(ctx, "runtime stop failed", logArgs...)
return Result{
Outcome: operation.OutcomeFailure,
ErrorCode: fc.errorCode,
ErrorMessage: fc.errorMessage,
}
}
// releaseLease releases the per-game lease in a fresh background context
// so a canceled request context does not leave the lease pinned for its
// TTL.
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
defer cancel()
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
service.logger.WarnContext(ctx, "release game lease",
"game_id", gameID,
"err", err.Error(),
)
}
}
// bestEffortAppend writes one operation_log entry. A failure is logged
// and discarded; the durable runtime record (or its absence) remains
// the source of truth.
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
service.logger.ErrorContext(ctx, "append operation log",
"game_id", entry.GameID,
"op_kind", string(entry.OpKind),
"outcome", string(entry.Outcome),
"error_code", entry.ErrorCode,
"err", err.Error(),
)
}
}
// bestEffortPublishHealth emits one health event + snapshot upsert.
// Failures degrade silently per `rtmanager/README.md §Notification
// Contracts`; the runtime record remains the source of truth.
func (service *Service) bestEffortPublishHealth(ctx context.Context, envelope ports.HealthEventEnvelope) {
if err := service.healthEvents.Publish(ctx, envelope); err != nil {
service.logger.ErrorContext(ctx, "publish health event",
"game_id", envelope.GameID,
"container_id", envelope.ContainerID,
"event_type", string(envelope.EventType),
"err", err.Error(),
)
}
}
// defaultTokenGenerator returns a function that produces 32-byte
// base64url-encoded tokens. Mirrors the start service: a degraded
// entropy source falls back to a sentinel token so the next TryAcquire
// observes a collision rather than a panic.
func defaultTokenGenerator() func() string {
return func() string {
var buf [32]byte
if _, err := rand.Read(buf[:]); err != nil {
return "rtmanager-fallback-token"
}
return base64.RawURLEncoding.EncodeToString(buf[:])
}
}
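
// Tests that need deterministic lease tokens can override the generator, as
// documented on Dependencies.NewToken (sketch):
//
//	deps.NewToken = func() string { return "test-lease-token" }
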
// emptyHealthDetails returns the canonical empty-object payload required
// by the `container_disappeared` AsyncAPI variant.
func emptyHealthDetails() json.RawMessage {
return json.RawMessage("{}")
}