Path: blob/main/components/ws-manager-mk2/controllers/timeout_controller.go
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/retry"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"

	k8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/common-go/util"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/constants"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
	config "github.com/gitpod-io/gitpod/ws-manager/api/config"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
	"github.com/go-logr/logr"
)

func NewTimeoutReconciler(c client.Client, recorder record.EventRecorder, cfg config.Configuration, maintenance maintenance.Maintenance) (*TimeoutReconciler, error) {
	if cfg.HeartbeatInterval == 0 {
		return nil, fmt.Errorf("invalid heartbeat interval, must not be 0")
	}
	reconcileInterval := time.Duration(cfg.HeartbeatInterval)
	// The reconcile interval is half the heartbeat interval, to catch timed-out workspaces in time.
	// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem for why we need this.
	reconcileInterval /= 2

	return &TimeoutReconciler{
		Client:            c,
		Config:            cfg,
		reconcileInterval: reconcileInterval,
		recorder:          recorder,
		maintenance:       maintenance,
	}, nil
}
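
// Worked example (illustrative, not part of the controller): a heartbeat
// interval of 30s yields a reconcile interval of 15s. Sampling at twice the
// heartbeat frequency means a passed deadline is detected at most one
// reconcile interval (here 15s) after it occurs. The c, recorder, and mnt
// values below are assumed to come from the surrounding setup code.
//
//	cfg := config.Configuration{HeartbeatInterval: util.Duration(30 * time.Second)}
//	r, err := NewTimeoutReconciler(c, recorder, cfg, mnt)
//	if err != nil {
//		// only returned when cfg.HeartbeatInterval is 0
//	}
//	// r.reconcileInterval is now 15 * time.Second
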
// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler, as it
// always requeues events for existing workspaces such that timeouts are checked on (at least)
// a specified interval. The reconcile loop should therefore be light-weight as it's repeatedly
// reconciling all workspaces in the cluster.
type TimeoutReconciler struct {
	client.Client

	Config            config.Configuration
	reconcileInterval time.Duration
	recorder          record.EventRecorder
	maintenance       maintenance.Maintenance
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch

// Reconcile will check the given workspace for timing out. When done, a new event gets
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
	log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)

	var workspace workspacev1.Workspace
	if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
		if !apierrors.IsNotFound(err) {
			log.Error(err, "unable to fetch workspace")
		}
		// We'll ignore not-found errors, since they can't be fixed by an immediate
		// requeue (we'll need to wait for a new notification), and we can get them
		// on deleted requests.
		// On any other error, let the controller requeue an event with exponential
		// backoff.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	log = log.WithValues("owi", workspace.OWI())
	ctx = logr.NewContext(ctx, log)

	if workspace.IsConditionTrue(workspacev1.WorkspaceConditionTimeout) {
		// Workspace has already been marked as timed out.
		// Return and don't requeue another reconciliation.
		return ctrl.Result{}, nil
	}

	if r.maintenance.IsEnabled(ctx) {
		// Don't reconcile timeouts in maintenance mode, to prevent workspace deletion.
		// Requeue after some time to ensure we do still reconcile this workspace when
		// maintenance mode ends.
		return ctrl.Result{RequeueAfter: maintenanceRequeue}, nil
	}

	// The workspace hasn't timed out yet. After this point, we always
	// want to requeue a reconciliation after the configured interval.
	defer func() {
		result.RequeueAfter = r.reconcileInterval
	}()

	timedout := r.isWorkspaceTimedOut(&workspace)
	if timedout == "" {
		// Hasn't timed out.
		return ctrl.Result{}, nil
	}

	// Workspace timed out, set Timeout condition.
	log.V(2).Info("Workspace timed out", "reason", timedout)
	if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		err := r.Get(ctx, types.NamespacedName{Name: workspace.Name, Namespace: workspace.Namespace}, &workspace)
		if err != nil {
			return err
		}

		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionTimeout(timedout))
		return r.Status().Update(ctx, &workspace)
	}); err != nil {
		log.Error(err, "Failed to update workspace status with Timeout condition")
		return ctrl.Result{}, fmt.Errorf("failed to add timeout condition: %w", err)
	}

	r.recorder.Event(&workspace, corev1.EventTypeNormal, "TimedOut", timedout)
	return ctrl.Result{}, nil
}
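
// Requeue behavior at a glance (illustrative summary of the branches above):
//
//	fetch error / not found        -> ctrl.Result{}, possibly with error (backoff, or drop if not found)
//	Timeout condition already set  -> ctrl.Result{}                                  (stop requeuing)
//	maintenance mode enabled       -> ctrl.Result{RequeueAfter: maintenanceRequeue}
//	any other outcome              -> ctrl.Result{RequeueAfter: r.reconcileInterval} (via the defer)
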
type timeoutActivity string

const (
	activityInit               timeoutActivity = "initialization"
	activityStartup            timeoutActivity = "startup"
	activityCreatingContainers timeoutActivity = "creating containers"
	activityPullingImages      timeoutActivity = "pulling images"
	activityRunningHeadless    timeoutActivity = "running the headless workspace"
	activityNone               timeoutActivity = "period of inactivity"
	activityMaxLifetime        timeoutActivity = "maximum lifetime"
	activityClosed             timeoutActivity = "after being closed"
	activityInterrupted        timeoutActivity = "workspace interruption"
	activityStopping           timeoutActivity = "stopping"
	activityBackup             timeoutActivity = "backup"
)

// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and the state the pod is in.
// This function does NOT use the Timeout condition, but rather is used to set that condition in the first place.
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string) {
	timeouts := r.Config.Timeouts
	phase := ws.Status.Phase

	decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) string {
		td := time.Duration(timeout)
		inactivity := time.Since(start)
		if inactivity < td {
			return ""
		}

		return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td))
	}

	start := ws.ObjectMeta.CreationTimestamp.Time
	lastActivity := activity.Last(ws)
	isClosed := ws.IsConditionTrue(workspacev1.WorkspaceConditionClosed)

	switch phase {
	case workspacev1.WorkspacePhasePending:
		return decide(start, timeouts.Initialization, activityInit)

	case workspacev1.WorkspacePhaseInitializing:
		return decide(start, timeouts.TotalStartup, activityStartup)

	case workspacev1.WorkspacePhaseCreating:
		activity := activityCreatingContainers
		// TODO:
		// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
		// 	activity = activityPullingImages
		// }
		return decide(start, timeouts.TotalStartup, activity)

	case workspacev1.WorkspacePhaseRunning:
		// The first check is always for the max lifetime.
		maxLifetime := r.getMaxLifetime(ws)
		if msg := decide(start, maxLifetime, activityMaxLifetime); msg != "" {
			return msg
		}

		timeout := timeouts.RegularWorkspace
		if customTimeout := ws.Spec.Timeout.Time; customTimeout != nil {
			timeout = util.Duration(customTimeout.Duration)
		}
		activity := activityNone
		if ws.IsHeadless() {
			timeout = timeouts.HeadlessWorkspace
			lastActivity = &start
			activity = activityRunningHeadless
		} else if lastActivity == nil {
			// The workspace is up and running, but the user has never produced any activity.
			return decide(start, timeouts.TotalStartup, activityNone)
		} else if isClosed {
			reason := func() string {
				afterClosed := timeouts.AfterClose
				if customClosedTimeout := ws.Spec.Timeout.ClosedTimeout; customClosedTimeout != nil {
					afterClosed = util.Duration(customClosedTimeout.Duration)
					if afterClosed == 0 {
						return ""
					}
				}
				return decide(*lastActivity, afterClosed, activityClosed)
			}()
			if reason != "" {
				return reason
			}
		}
		return decide(*lastActivity, timeout, activity)

	case workspacev1.WorkspacePhaseStopping:
		if isWorkspaceBeingDeleted(ws) && !ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete) {
			// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
			// We basically don't expect a workspace to be in content finalization before it's been deleted.
			return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
		} else if !isWorkspaceBeingDeleted(ws) {
			// Workspaces that aren't being deleted never time out while stopping.
			return ""
		} else {
			return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
		}

	default:
		// The only other phase we can be in is Stopped, which is pointless to time out.
		return ""
	}
}

func (r *TimeoutReconciler) getMaxLifetime(ws *workspacev1.Workspace) util.Duration {
	if ws.Spec.Timeout.MaximumLifetime != nil {
		return util.Duration(ws.Spec.Timeout.MaximumLifetime.Duration)
	}

	return r.Config.Timeouts.MaxLifetime
}

func formatDuration(d time.Duration) string {
	d = d.Round(time.Minute)
	h := d / time.Hour
	d -= h * time.Hour
	m := d / time.Minute
	return fmt.Sprintf("%02dh%02dm", h, m)
}
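
// Illustrative examples (not part of the controller): outputs of decide and
// formatDuration above. Durations are rounded to the nearest minute.
//
//	formatDuration(90 * time.Minute) // "01h30m"
//	formatDuration(45 * time.Second) // "00h01m" (rounds up)
//	formatDuration(29 * time.Second) // "00h00m" (rounds down)
//
//	// A regular workspace idle for 45 minutes against a 30-minute timeout yields:
//	// "workspace timed out after period of inactivity (00h45m) took longer than 00h30m"
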
// SetupWithManager sets up the controller with the Manager.
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
	maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
	if maxConcurrentReconciles <= 0 {
		maxConcurrentReconciles = 1
	}

	return ctrl.NewControllerManagedBy(mgr).
		Named("timeout").
		WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
		For(&workspacev1.Workspace{}).
		WithEventFilter(predicate.NewPredicateFuncs(func(object client.Object) bool {
			// Only reconcile workspaces whose managed-by label matches; objects
			// without the label at all are let through as well.
			for k, v := range object.GetLabels() {
				if k == k8s.WorkspaceManagedByLabel {
					switch v {
					case constants.ManagedBy:
						return true
					default:
						return false
					}
				}
			}

			return true
		})).
		Complete(r)
}
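
// exampleSetup is a minimal wiring sketch, not used by ws-manager-mk2 itself:
// it shows how this reconciler could be constructed and registered with a
// controller-runtime manager. The cfg and mnt arguments, and the "workspace"
// recorder name, are illustrative assumptions rather than taken from this repo.
func exampleSetup(mgr ctrl.Manager, cfg config.Configuration, mnt maintenance.Maintenance) error {
	// The manager supplies both the client and a named event recorder.
	r, err := NewTimeoutReconciler(mgr.GetClient(), mgr.GetEventRecorderFor("workspace"), cfg, mnt)
	if err != nil {
		return fmt.Errorf("failed to create timeout reconciler: %w", err)
	}
	return r.SetupWithManager(mgr)
}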