// Package patchruntime implements the `patch` lifecycle operation owned // by Runtime Manager. Patch is restart with a new `image_ref`: under // one outer per-game lease the service runs the stop service, removes // the container, and runs the start service with the new image. The // engine reads its state from the bind-mount on startup, so any data // written before the patch survives. // // The new and current image references must both parse as semver tags // and share their major and minor components. A new tag that bumps the // major or the minor surfaces as `semver_patch_only`; a tag that is // not parseable as semver surfaces as `image_ref_not_semver`. These // pre-checks run before any Docker work so a rejected patch never // disturbs the running runtime. // // Lifecycle and failure-mode semantics follow `rtmanager/README.md // §Lifecycles → Patch`. Design rationale is captured in // `rtmanager/docs/services.md`. package patchruntime import ( "context" "crypto/rand" "encoding/base64" "errors" "fmt" "log/slog" "strings" "time" "galaxy/rtmanager/internal/config" "galaxy/rtmanager/internal/domain/operation" "galaxy/rtmanager/internal/domain/runtime" "galaxy/rtmanager/internal/logging" "galaxy/rtmanager/internal/ports" "galaxy/rtmanager/internal/service/startruntime" "galaxy/rtmanager/internal/service/stopruntime" "galaxy/rtmanager/internal/telemetry" ) const leaseReleaseTimeout = 5 * time.Second // Input stores the per-call arguments for one patch operation. type Input struct { // GameID identifies the platform game to patch. GameID string // NewImageRef stores the new Docker reference the patch installs. // Must be a valid Docker reference whose tag parses as semver. NewImageRef string // OpSource classifies how the request entered Runtime Manager. OpSource operation.OpSource // SourceRef stores the optional opaque per-source reference. 
When // non-empty it is reused as the correlation id linking the outer // patch entry to the inner stop and start log entries. SourceRef string } // Validate reports whether input carries the structural invariants the // service requires. Image-reference shape and semver checks happen // later inside Handle so that they run after the runtime record has // been loaded. func (input Input) Validate() error { if strings.TrimSpace(input.GameID) == "" { return fmt.Errorf("game id must not be empty") } if strings.TrimSpace(input.NewImageRef) == "" { return fmt.Errorf("new image ref must not be empty") } if !input.OpSource.IsKnown() { return fmt.Errorf("op source %q is unsupported", input.OpSource) } return nil } // Result stores the deterministic outcome of one Handle call. type Result struct { // Record carries the runtime record installed by the inner start on // success; zero on failure. Record runtime.RuntimeRecord // Outcome reports whether the operation completed (success) or // produced a stable failure code. Outcome operation.Outcome // ErrorCode stores the stable error code on failure. ErrorCode string // ErrorMessage stores the operator-readable detail on failure. ErrorMessage string } // Dependencies groups the collaborators required by Service. type Dependencies struct { RuntimeRecords ports.RuntimeRecordStore OperationLogs ports.OperationLogStore Docker ports.DockerClient Leases ports.GameLeaseStore // StopService runs the inner stop step. StopService *stopruntime.Service // StartService runs the inner start step with the new image_ref. StartService *startruntime.Service Coordination config.CoordinationConfig Telemetry *telemetry.Runtime Logger *slog.Logger Clock func() time.Time NewToken func() string } // Service executes the patch lifecycle operation. 
type Service struct { runtimeRecords ports.RuntimeRecordStore operationLogs ports.OperationLogStore docker ports.DockerClient leases ports.GameLeaseStore stopService *stopruntime.Service startService *startruntime.Service leaseTTL time.Duration telemetry *telemetry.Runtime logger *slog.Logger clock func() time.Time newToken func() string } // NewService constructs one Service from deps. func NewService(deps Dependencies) (*Service, error) { switch { case deps.RuntimeRecords == nil: return nil, errors.New("new patch runtime service: nil runtime records") case deps.OperationLogs == nil: return nil, errors.New("new patch runtime service: nil operation logs") case deps.Docker == nil: return nil, errors.New("new patch runtime service: nil docker client") case deps.Leases == nil: return nil, errors.New("new patch runtime service: nil lease store") case deps.StopService == nil: return nil, errors.New("new patch runtime service: nil stop service") case deps.StartService == nil: return nil, errors.New("new patch runtime service: nil start service") case deps.Telemetry == nil: return nil, errors.New("new patch runtime service: nil telemetry runtime") } if err := deps.Coordination.Validate(); err != nil { return nil, fmt.Errorf("new patch runtime service: coordination config: %w", err) } clock := deps.Clock if clock == nil { clock = time.Now } logger := deps.Logger if logger == nil { logger = slog.Default() } logger = logger.With("service", "rtmanager.patchruntime") newToken := deps.NewToken if newToken == nil { newToken = defaultTokenGenerator() } return &Service{ runtimeRecords: deps.RuntimeRecords, operationLogs: deps.OperationLogs, docker: deps.Docker, leases: deps.Leases, stopService: deps.StopService, startService: deps.StartService, leaseTTL: deps.Coordination.GameLeaseTTL, telemetry: deps.Telemetry, logger: logger, clock: clock, newToken: newToken, }, nil } // Handle executes one patch operation end-to-end. 
The Go-level error // return is reserved for non-business failures (nil context, nil // receiver). Every business outcome — success or any of the stable // failure codes — flows through Result. func (service *Service) Handle(ctx context.Context, input Input) (Result, error) { if service == nil { return Result{}, errors.New("patch runtime: nil service") } if ctx == nil { return Result{}, errors.New("patch runtime: nil context") } opStartedAt := service.clock().UTC() if err := input.Validate(); err != nil { return service.recordFailure(ctx, failureCtx{ opStartedAt: opStartedAt, input: input, errorCode: startruntime.ErrorCodeInvalidRequest, errorMessage: err.Error(), }), nil } token := service.newToken() leaseStart := service.clock() acquired, err := service.leases.TryAcquire(ctx, input.GameID, token, service.leaseTTL) service.telemetry.RecordLeaseAcquireLatency(ctx, service.clock().Sub(leaseStart)) if err != nil { return service.recordFailure(ctx, failureCtx{ opStartedAt: opStartedAt, input: input, errorCode: startruntime.ErrorCodeServiceUnavailable, errorMessage: fmt.Sprintf("acquire game lease: %s", err.Error()), }), nil } if !acquired { return service.recordFailure(ctx, failureCtx{ opStartedAt: opStartedAt, input: input, errorCode: startruntime.ErrorCodeConflict, errorMessage: "another lifecycle operation is in progress for this game", }), nil } defer service.releaseLease(ctx, input.GameID, token) return service.runUnderLease(ctx, input, opStartedAt) } // runUnderLease executes the lease-protected patch sequence: load the // runtime record, validate semver compatibility, run inner stop, // remove the container, run inner start with the new image. 
func (service *Service) runUnderLease(ctx context.Context, input Input, opStartedAt time.Time) (Result, error) {
	// Load the current record; a missing record is a stable not-found
	// failure, any other store error is internal.
	existing, err := service.runtimeRecords.Get(ctx, input.GameID)
	if errors.Is(err, runtime.ErrNotFound) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeNotFound,
			errorMessage: fmt.Sprintf("runtime record for game %q does not exist", input.GameID),
		}), nil
	}
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("load runtime record: %s", err.Error()),
		}), nil
	}
	// A removed runtime cannot be patched back to life.
	if existing.Status == runtime.StatusRemoved {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeConflict,
			errorMessage: fmt.Sprintf("runtime for game %q is removed; cannot patch", input.GameID),
		}), nil
	}
	// Without a current image_ref there is nothing to compare the new
	// tag against; the record is corrupt from this service's view.
	if strings.TrimSpace(existing.CurrentImageRef) == "" {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("runtime record for game %q has no current image_ref", input.GameID),
		}), nil
	}

	// Semver pre-checks: both refs must carry parseable semver tags and
	// agree on major.minor. These run before any Docker work so a
	// rejected patch leaves the running runtime untouched.
	currentSemver, err := extractSemverTag(existing.CurrentImageRef)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeImageRefNotSemver,
			errorMessage: fmt.Sprintf("current image_ref: %s", err.Error()),
			imageRef:     existing.CurrentImageRef,
		}), nil
	}
	newSemver, err := extractSemverTag(input.NewImageRef)
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeImageRefNotSemver,
			errorMessage: fmt.Sprintf("new image_ref: %s", err.Error()),
			imageRef:     input.NewImageRef,
		}), nil
	}
	if !samePatchSeries(currentSemver, newSemver) {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt: opStartedAt,
			input:       input,
			errorCode:   startruntime.ErrorCodeSemverPatchOnly,
			errorMessage: fmt.Sprintf(
				"patch must keep major.minor; current=%s new=%s",
				currentSemver, newSemver,
			),
			imageRef: input.NewImageRef,
		}), nil
	}

	// Correlation id linking the outer patch entry to the inner stop
	// and start entries: the caller's SourceRef when present, otherwise
	// a fresh token.
	// NOTE(review): failure entries below still go out with the
	// caller's original SourceRef (possibly empty) via recordFailure,
	// not this generated correlationRef, so an outer failure entry may
	// not link to the inner stop/start entries — confirm intended.
	correlationRef := input.SourceRef
	if correlationRef == "" {
		correlationRef = service.newToken()
	}
	containerID := existing.CurrentContainerID

	// Inner stop. A Go-level error is internal; a business failure from
	// the inner service is surfaced under its own error code.
	stopResult, err := service.stopService.Run(ctx, stopruntime.Input{
		GameID:    input.GameID,
		Reason:    stopruntime.StopReasonAdminRequest,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("inner stop: %s", err.Error()),
			imageRef:     input.NewImageRef,
			containerID:  containerID,
		}), nil
	}
	if stopResult.Outcome == operation.OutcomeFailure {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    stopResult.ErrorCode,
			errorMessage: fmt.Sprintf("inner stop failed: %s", stopResult.ErrorMessage),
			imageRef:     input.NewImageRef,
			containerID:  containerID,
		}), nil
	}

	// Remove the stopped container so the inner start can create a
	// fresh one from the new image. Skipped when the record carried no
	// container id.
	if containerID != "" {
		if err := service.docker.Remove(ctx, containerID); err != nil {
			return service.recordFailure(ctx, failureCtx{
				opStartedAt:  opStartedAt,
				input:        input,
				errorCode:    startruntime.ErrorCodeServiceUnavailable,
				errorMessage: fmt.Sprintf("docker remove: %s", err.Error()),
				imageRef:     input.NewImageRef,
				containerID:  containerID,
			}), nil
		}
	}

	// Inner start with the new image_ref; on success its Record is the
	// record this operation returns.
	startResult, err := service.startService.Run(ctx, startruntime.Input{
		GameID:    input.GameID,
		ImageRef:  input.NewImageRef,
		OpSource:  input.OpSource,
		SourceRef: correlationRef,
	})
	if err != nil {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startruntime.ErrorCodeInternal,
			errorMessage: fmt.Sprintf("inner start: %s", err.Error()),
			imageRef:     input.NewImageRef,
		}), nil
	}
	if startResult.Outcome == operation.OutcomeFailure {
		return service.recordFailure(ctx, failureCtx{
			opStartedAt:  opStartedAt,
			input:        input,
			errorCode:    startResult.ErrorCode,
			errorMessage: fmt.Sprintf("inner start failed: %s", startResult.ErrorMessage),
			imageRef:     input.NewImageRef,
		}), nil
	}

	// Success: write the outer summary entry (best effort), emit
	// telemetry, and log old/new image and container ids.
	finishedAt := service.clock().UTC()
	service.bestEffortAppend(ctx, operation.OperationEntry{
		GameID:      input.GameID,
		OpKind:      operation.OpKindPatch,
		OpSource:    input.OpSource,
		SourceRef:   correlationRef,
		ImageRef:    input.NewImageRef,
		ContainerID: startResult.Record.CurrentContainerID,
		Outcome:     operation.OutcomeSuccess,
		StartedAt:   opStartedAt,
		FinishedAt:  &finishedAt,
	})
	service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeSuccess), "")
	logArgs := []any{
		"game_id", input.GameID,
		"prev_image_ref", existing.CurrentImageRef,
		"new_image_ref", input.NewImageRef,
		"prev_container_id", containerID,
		"new_container_id", startResult.Record.CurrentContainerID,
		"op_source", string(input.OpSource),
	}
	logArgs = append(logArgs, logging.ContextAttrs(ctx)...)
	service.logger.InfoContext(ctx, "runtime patched", logArgs...)
	return Result{
		Record:  startResult.Record,
		Outcome: operation.OutcomeSuccess,
	}, nil
}

// failureCtx groups the inputs to recordFailure.
type failureCtx struct {
	// opStartedAt is the outer operation's UTC start time.
	opStartedAt time.Time
	// input is the original call input; GameID, OpSource, and SourceRef
	// are copied into the failure operation_log entry.
	input Input
	// errorCode is the stable failure code for the entry and Result.
	errorCode string
	// errorMessage is the operator-readable detail.
	errorMessage string
	// imageRef is the image reference involved in the failure, when one
	// was already determined; empty for pre-flight failures.
	imageRef string
	// containerID is the previous container id, when relevant.
	containerID string
}

// recordFailure writes the outer failure operation_log entry and emits
// telemetry. Inner stop / start services have already recorded their
// own entries; this is the outer summary.
func (service *Service) recordFailure(ctx context.Context, fc failureCtx) Result { finishedAt := service.clock().UTC() service.bestEffortAppend(ctx, operation.OperationEntry{ GameID: fc.input.GameID, OpKind: operation.OpKindPatch, OpSource: fc.input.OpSource, SourceRef: fc.input.SourceRef, ImageRef: fc.imageRef, ContainerID: fc.containerID, Outcome: operation.OutcomeFailure, ErrorCode: fc.errorCode, ErrorMessage: fc.errorMessage, StartedAt: fc.opStartedAt, FinishedAt: &finishedAt, }) service.telemetry.RecordPatchOutcome(ctx, string(operation.OutcomeFailure), fc.errorCode) logArgs := []any{ "game_id", fc.input.GameID, "image_ref", fc.imageRef, "op_source", string(fc.input.OpSource), "error_code", fc.errorCode, "error_message", fc.errorMessage, } logArgs = append(logArgs, logging.ContextAttrs(ctx)...) service.logger.WarnContext(ctx, "runtime patch failed", logArgs...) return Result{ Outcome: operation.OutcomeFailure, ErrorCode: fc.errorCode, ErrorMessage: fc.errorMessage, } } func (service *Service) releaseLease(ctx context.Context, gameID, token string) { cleanupCtx, cancel := context.WithTimeout(context.Background(), leaseReleaseTimeout) defer cancel() if err := service.leases.Release(cleanupCtx, gameID, token); err != nil { service.logger.WarnContext(ctx, "release game lease", "game_id", gameID, "err", err.Error(), ) } } func (service *Service) bestEffortAppend(ctx context.Context, entry operation.OperationEntry) { if _, err := service.operationLogs.Append(ctx, entry); err != nil { service.logger.ErrorContext(ctx, "append operation log", "game_id", entry.GameID, "op_kind", string(entry.OpKind), "outcome", string(entry.Outcome), "error_code", entry.ErrorCode, "err", err.Error(), ) } } func defaultTokenGenerator() func() string { return func() string { var buf [32]byte if _, err := rand.Read(buf[:]); err != nil { return "rtmanager-fallback-token" } return base64.RawURLEncoding.EncodeToString(buf[:]) } }