feat: runtime manager
This commit is contained in:
@@ -0,0 +1,483 @@
|
||||
// Package patchruntime implements the `patch` lifecycle operation owned
|
||||
// by Runtime Manager. Patch is restart with a new `image_ref`: under
|
||||
// one outer per-game lease the service runs the stop service, removes
|
||||
// the container, and runs the start service with the new image. The
|
||||
// engine reads its state from the bind-mount on startup, so any data
|
||||
// written before the patch survives.
|
||||
//
|
||||
// The new and current image references must both parse as semver tags
|
||||
// and share their major and minor components. A new tag that bumps the
|
||||
// major or the minor surfaces as `semver_patch_only`; a tag that is
|
||||
// not parseable as semver surfaces as `image_ref_not_semver`. These
|
||||
// pre-checks run before any Docker work so a rejected patch never
|
||||
// disturbs the running runtime.
|
||||
//
|
||||
// Lifecycle and failure-mode semantics follow `rtmanager/README.md
|
||||
// §Lifecycles → Patch`. Design rationale is captured in
|
||||
// `rtmanager/docs/services.md`.
|
||||
package patchruntime
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"galaxy/rtmanager/internal/config"
|
||||
"galaxy/rtmanager/internal/domain/operation"
|
||||
"galaxy/rtmanager/internal/domain/runtime"
|
||||
"galaxy/rtmanager/internal/logging"
|
||||
"galaxy/rtmanager/internal/ports"
|
||||
"galaxy/rtmanager/internal/service/startruntime"
|
||||
"galaxy/rtmanager/internal/service/stopruntime"
|
||||
"galaxy/rtmanager/internal/telemetry"
|
||||
)
|
||||
|
||||
// leaseReleaseTimeout bounds the background context releaseLease uses when
// it hands the per-game lease back.
const leaseReleaseTimeout = 5 * time.Second
|
||||
|
||||
// Input stores the per-call arguments for one patch operation.
|
||||
type Input struct {
|
||||
// GameID identifies the platform game to patch.
|
||||
GameID string
|
||||
|
||||
// NewImageRef stores the new Docker reference the patch installs.
|
||||
// Must be a valid Docker reference whose tag parses as semver.
|
||||
NewImageRef string
|
||||
|
||||
// OpSource classifies how the request entered Runtime Manager.
|
||||
OpSource operation.OpSource
|
||||
|
||||
// SourceRef stores the optional opaque per-source reference. When
|
||||
// non-empty it is reused as the correlation id linking the outer
|
||||
// patch entry to the inner stop and start log entries.
|
||||
SourceRef string
|
||||
}
|
||||
|
||||
// Validate reports whether input carries the structural invariants the
|
||||
// service requires. Image-reference shape and semver checks happen
|
||||
// later inside Handle so that they run after the runtime record has
|
||||
// been loaded.
|
||||
func (input Input) Validate() error {
|
||||
if strings.TrimSpace(input.GameID) == "" {
|
||||
return fmt.Errorf("game id must not be empty")
|
||||
}
|
||||
if strings.TrimSpace(input.NewImageRef) == "" {
|
||||
return fmt.Errorf("new image ref must not be empty")
|
||||
}
|
||||
if !input.OpSource.IsKnown() {
|
||||
return fmt.Errorf("op source %q is unsupported", input.OpSource)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Result stores the deterministic outcome of one Handle call.
|
||||
type Result struct {
|
||||
// Record carries the runtime record installed by the inner start on
|
||||
// success; zero on failure.
|
||||
Record runtime.RuntimeRecord
|
||||
|
||||
// Outcome reports whether the operation completed (success) or
|
||||
// produced a stable failure code.
|
||||
Outcome operation.Outcome
|
||||
|
||||
// ErrorCode stores the stable error code on failure.
|
||||
ErrorCode string
|
||||
|
||||
// ErrorMessage stores the operator-readable detail on failure.
|
||||
ErrorMessage string
|
||||
}
|
||||
|
||||
// Dependencies groups the collaborators required by Service.
|
||||
type Dependencies struct {
|
||||
RuntimeRecords ports.RuntimeRecordStore
|
||||
OperationLogs ports.OperationLogStore
|
||||
Docker ports.DockerClient
|
||||
Leases ports.GameLeaseStore
|
||||
|
||||
// StopService runs the inner stop step.
|
||||
StopService *stopruntime.Service
|
||||
// StartService runs the inner start step with the new image_ref.
|
||||
StartService *startruntime.Service
|
||||
|
||||
Coordination config.CoordinationConfig
|
||||
|
||||
Telemetry *telemetry.Runtime
|
||||
Logger *slog.Logger
|
||||
Clock func() time.Time
|
||||
NewToken func() string
|
||||
}
|
||||
|
||||
// Service executes the patch lifecycle operation.
|
||||
type Service struct {
|
||||
runtimeRecords ports.RuntimeRecordStore
|
||||
operationLogs ports.OperationLogStore
|
||||
docker ports.DockerClient
|
||||
leases ports.GameLeaseStore
|
||||
stopService *stopruntime.Service
|
||||
startService *startruntime.Service
|
||||
|
||||
leaseTTL time.Duration
|
||||
|
||||
telemetry *telemetry.Runtime
|
||||
logger *slog.Logger
|
||||
|
||||
clock func() time.Time
|
||||
newToken func() string
|
||||
}
|
||||
|
||||
// NewService constructs one Service from deps.
|
||||
func NewService(deps Dependencies) (*Service, error) {
|
||||
switch {
|
||||
case deps.RuntimeRecords == nil:
|
||||
return nil, errors.New("new patch runtime service: nil runtime records")
|
||||
case deps.OperationLogs == nil:
|
||||
return nil, errors.New("new patch runtime service: nil operation logs")
|
||||
case deps.Docker == nil:
|
||||
return nil, errors.New("new patch runtime service: nil docker client")
|
||||
case deps.Leases == nil:
|
||||
return nil, errors.New("new patch runtime service: nil lease store")
|
||||
case deps.StopService == nil:
|
||||
return nil, errors.New("new patch runtime service: nil stop service")
|
||||
case deps.StartService == nil:
|
||||
return nil, errors.New("new patch runtime service: nil start service")
|
||||
case deps.Telemetry == nil:
|
||||
return nil, errors.New("new patch runtime service: nil telemetry runtime")
|
||||
}
|
||||
if err := deps.Coordination.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("new patch runtime service: coordination config: %w", err)
|
||||
}
|
||||
|
||||
clock := deps.Clock
|
||||
if clock == nil {
|
||||
clock = time.Now
|
||||
}
|
||||
logger := deps.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
logger = logger.With("service", "rtmanager.patchruntime")
|
||||
|
||||
newToken := deps.NewToken
|
||||
if newToken == nil {
|
||||
newToken = defaultTokenGenerator()
|
||||
}
|
||||
|
||||
return &Service{
|
||||
runtimeRecords: deps.RuntimeRecords,
|
||||
operationLogs: deps.OperationLogs,
|
||||
docker: deps.Docker,
|
||||
leases: deps.Leases,
|
||||
stopService: deps.StopService,
|
||||
startService: deps.StartService,
|
||||
leaseTTL: deps.Coordination.GameLeaseTTL,
|
||||
telemetry: deps.Telemetry,
|
||||
logger: logger,
|
||||
clock: clock,
|
||||
newToken: newToken,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Handle executes one patch operation end-to-end. The Go-level error
|
||||
// return is reserved for non-business failures (nil context, nil
|
||||
// receiver). Every business outcome — success or any of the stable
|
||||
// failure codes — flows through Result.
|
||||
func (service *Service) Handle(ctx context.Context, input Input) (Result, error) {
|
||||
if service == nil {
|
||||
return Result{}, errors.New("patch runtime: nil service")
|
||||
}
|
||||
if ctx == nil {
|
||||
return Result{}, errors.New("patch runtime: nil context")
|
||||
}
|
||||
|
||||
opStartedAt := service.clock().UTC()
|
||||
|
||||
if err := input.Validate(); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInvalidRequest,
|
||||
errorMessage: err.Error(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
token := service.newToken()
|
||||
leaseStart := service.clock()
|
||||
acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL)
|
||||
service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart))
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if !acquired {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: "another lifecycle operation is in progress for this game",
|
||||
}), nil
|
||||
}
|
||||
defer service.releaseLease(ctx, input.GameID, token)
|
||||
|
||||
return service.runUnderLease(ctx, input, opStartedAt)
|
||||
}
|
||||
|
||||
// runUnderLease executes the lease-protected patch sequence: load the
|
||||
// runtime record, validate semver compatibility, run inner stop,
|
||||
// remove the container, run inner start with the new image.
|
||||
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
|
||||
existing, err := service.runtimeRecords.Get(ctx, input.GameID)
|
||||
if errors.Is(err, runtime.ErrNotFound) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeNotFound,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
|
||||
}), nil
|
||||
}
|
||||
if existing.Status == runtime.StatusRemoved {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeConflict,
|
||||
errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot patch", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
if strings.TrimSpace(existing.CurrentImageRef) == "" {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("runtime record for game %q has no current image_ref", input.GameID),
|
||||
}), nil
|
||||
}
|
||||
|
||||
currentSemver, err := extractSemverTag(existing.CurrentImageRef)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeImageRefNotSemver,
|
||||
errorMessage: fmt.Sprintf("current image_ref: %s", err.Error()),
|
||||
imageRef: existing.CurrentImageRef,
|
||||
}), nil
|
||||
}
|
||||
newSemver, err := extractSemverTag(input.NewImageRef)
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeImageRefNotSemver,
|
||||
errorMessage: fmt.Sprintf("new image_ref: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
if !samePatchSeries(currentSemver, newSemver) {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeSemverPatchOnly,
|
||||
errorMessage: fmt.Sprintf(
|
||||
"patch must keep major.minor; current=%s new=%s",
|
||||
currentSemver, newSemver,
|
||||
),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
correlationRef := input.SourceRef
|
||||
if correlationRef == "" {
|
||||
correlationRef = service.newToken()
|
||||
}
|
||||
containerID := existing.CurrentContainerID
|
||||
|
||||
stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
|
||||
GameID: input.GameID,
|
||||
Reason: stopruntime.StopReasonAdminRequest,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
if stopResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: stopResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
|
||||
imageRef: input.NewImageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
|
||||
if containerID != "" {
|
||||
if err := service.docker.Remove(ctx, containerID); err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeServiceUnavailable,
|
||||
errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
containerID: containerID,
|
||||
}), nil
|
||||
}
|
||||
}
|
||||
|
||||
startResult, err := service.startService.Run(ctx, startruntime.Input{
|
||||
GameID: input.GameID,
|
||||
ImageRef: input.NewImageRef,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
})
|
||||
if err != nil {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startruntime.ErrorCodeInternal,
|
||||
errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
if startResult.Outcome == operation.OutcomeFailure {
|
||||
return service.recordFailure(ctx, failureCtx{
|
||||
opStartedAt: opStartedAt,
|
||||
input: input,
|
||||
errorCode: startResult.ErrorCode,
|
||||
errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
|
||||
imageRef: input.NewImageRef,
|
||||
}), nil
|
||||
}
|
||||
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: input.GameID,
|
||||
OpKind: operation.OpKindPatch,
|
||||
OpSource: input.OpSource,
|
||||
SourceRef: correlationRef,
|
||||
ImageRef: input.NewImageRef,
|
||||
ContainerID: startResult.Record.CurrentContainerID,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
StartedAt: opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeSuccess), "")
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", input.GameID,
|
||||
"prev_image_ref", existing.CurrentImageRef,
|
||||
"new_image_ref", input.NewImageRef,
|
||||
"prev_container_id", containerID,
|
||||
"new_container_id", startResult.Record.CurrentContainerID,
|
||||
"op_source", string(input.OpSource),
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.InfoContext(ctx, "runtime patched", logArgs...)
|
||||
|
||||
return Result{
|
||||
Record: startResult.Record,
|
||||
Outcome: operation.OutcomeSuccess,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// failureCtx groups the inputs to recordFailure.
|
||||
type failureCtx struct {
|
||||
opStartedAt time.Time
|
||||
input Input
|
||||
errorCode string
|
||||
errorMessage string
|
||||
imageRef string
|
||||
containerID string
|
||||
}
|
||||
|
||||
// recordFailure writes the outer failure operation_log entry and emits
|
||||
// telemetry. Inner stop / start services have already recorded their
|
||||
// own entries; this is the outer summary.
|
||||
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result {
|
||||
finishedAt := service.clock().UTC()
|
||||
service.bestEffortAppend(ctx, operation.OperationEntry{
|
||||
GameID: fc.input.GameID,
|
||||
OpKind: operation.OpKindPatch,
|
||||
OpSource: fc.input.OpSource,
|
||||
SourceRef: fc.input.SourceRef,
|
||||
ImageRef: fc.imageRef,
|
||||
ContainerID: fc.containerID,
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
StartedAt: fc.opStartedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode)
|
||||
|
||||
logArgs := []any{
|
||||
"game_id", fc.input.GameID,
|
||||
"image_ref", fc.imageRef,
|
||||
"op_source", string(fc.input.OpSource),
|
||||
"error_code", fc.errorCode,
|
||||
"error_message", fc.errorMessage,
|
||||
}
|
||||
logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
|
||||
service.logger.WarnContext(ctx, "runtime patch failed", logArgs...)
|
||||
|
||||
return Result{
|
||||
Outcome: operation.OutcomeFailure,
|
||||
ErrorCode: fc.errorCode,
|
||||
ErrorMessage: fc.errorMessage,
|
||||
}
|
||||
}
|
||||
|
||||
func (service *Service) releaseLease(ctx context.Context, gameID, token string) {
|
||||
cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout)
|
||||
defer cancel()
|
||||
if err := service.leases.Release(cleanupCtx, gameID, token); err != nil {
|
||||
service.logger.WarnContext(ctx, "release game lease",
|
||||
"game_id", gameID,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) {
|
||||
if _, err := service.operationLogs.Append(ctx, entry); err != nil {
|
||||
service.logger.ErrorContext(ctx, "append operation log",
|
||||
"game_id", entry.GameID,
|
||||
"op_kind", string(entry.OpKind),
|
||||
"outcome", string(entry.Outcome),
|
||||
"error_code", entry.ErrorCode,
|
||||
"err", err.Error(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// defaultTokenGenerator returns the token source used when Dependencies
// does not supply NewToken. Each call of the returned function reads 32
// bytes from crypto/rand and encodes them as unpadded URL-safe base64.
//
// If the random source fails, the fallback token is derived from the
// current time instead of being a fixed string. A constant fallback would
// hand the same token to every caller, so the token could no longer tell
// one lease holder from another.
func defaultTokenGenerator() func() string {
	return func() string {
		var buf [32]byte
		if _, err := rand.Read(buf[:]); err != nil {
			// Not cryptographically strong, but it keeps fallback tokens
			// distinct across calls and processes.
			return fmt.Sprintf("rtmanager-fallback-token-%d", time.Now().UnixNano())
		}
		return base64.RawURLEncoding.EncodeToString(buf[:])
	}
}
|
||||
Reference in New Issue
Block a user