GitHub Repository: gitpod-io/gitpod
Path: blob/main/components/ws-manager-mk2/controllers/status.go
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/common-go/tracing"
	config "github.com/gitpod-io/gitpod/ws-manager/api/config"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
	"github.com/go-logr/logr"
	"golang.org/x/xerrors"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/log"
)
28
const (
29
// containerKilledExitCode is the exit code Kubernetes uses for a container which was killed by the system.
30
// We expect such containers to be restarted by Kubernetes if they're supposed to be running.
31
// We never deliberately terminate a container like this.
32
containerKilledExitCode = 137
33
34
// containerUnknownExitCode is the exit code containerd uses if it cannot determine the cause/exit status of
35
// a stopped container.
36
containerUnknownExitCode = 255
37
38
// headlessTaskFailedPrefix is the prefix of the pod termination message if a headless task failed (e.g. user error
39
// or aborted prebuild).
40
headlessTaskFailedPrefix = "headless task failed: "
41
42
// podRejectedReasonNodeAffinity is the value of pod.status.Reason in case the pod got rejected by kubelet because of a NodeAffinity mismatch
43
podRejectedReasonNodeAffinity = "NodeAffinity"
44
45
// podRejectedReasonOutOfCPU is the value of pod.status.Reason in case the pod got rejected by kubelet because of insufficient CPU available
46
podRejectedReasonOutOfCPU = "OutOfcpu"
47
48
// podRejectedReasonOutOfMemory is the value of pod.status.Reason in case the pod got rejected by kubelet because of insufficient memory available
49
podRejectedReasonOutOfMemory = "OutOfmemory"
50
)
51
52
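// updateWorkspaceStatus derives the workspace's status (phase, conditions, and runtime
// information) from the workspace pod(s) and the manager configuration.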
func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspace *workspacev1.Workspace, pods *corev1.PodList, cfg *config.Configuration) (err error) {
	span, ctx := tracing.FromContext(ctx, "updateWorkspaceStatus")
	defer tracing.FinishSpan(span, &err)
	log := log.FromContext(ctx).WithValues("owi", workspace.OWI())
	ctx = logr.NewContext(ctx, log)

	oldPhase := workspace.Status.Phase
	defer func() {
		if oldPhase != workspace.Status.Phase {
			log.Info("workspace phase updated", "oldPhase", oldPhase, "phase", workspace.Status.Phase)
			if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping {
				t := metav1.Now()
				workspace.Status.PodStoppingTime = &t
			}
		}
	}()

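	// A workspace is expected to be backed by at most one pod; branch on how many pods we found.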
	switch len(pods.Items) {
	case 0:
		if workspace.Status.Phase == "" {
			workspace.Status.Phase = workspacev1.WorkspacePhasePending
		}

		if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping && isDisposalFinished(workspace) {
			workspace.Status.Phase = workspacev1.WorkspacePhaseStopped
		}

		if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped && workspace.Status.PodDeletionTime == nil {
			// Set the timestamp when we first saw the pod as deleted.
			// This is used for delaying eventual pod restarts.
			podDeletionTime := metav1.NewTime(time.Now())
			workspace.Status.PodDeletionTime = &podDeletionTime
		}

		workspace.UpsertConditionOnStatusChange(workspacev1.NewWorkspaceConditionContainerRunning(metav1.ConditionFalse))
		return nil
	case 1:
		// continue below
	default:
		// This is exceptional - not sure what to do here. Probably fail the pod.
		workspace.Status.SetCondition(
			workspacev1.NewWorkspaceConditionFailed("multiple pods exist - this should never happen"))
		return nil
	}

	if c := wsk8s.GetCondition(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionDeployed)); c == nil {
		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionDeployed())
	}

	pod := &pods.Items[0]

	if workspace.Status.Runtime == nil {
		workspace.Status.Runtime = &workspacev1.WorkspaceRuntimeStatus{}
	}
	if workspace.Status.Runtime.NodeName == "" && pod.Spec.NodeName != "" {
		workspace.Status.Runtime.NodeName = pod.Spec.NodeName
	}
	if workspace.Status.Runtime.HostIP == "" && pod.Status.HostIP != "" {
		workspace.Status.Runtime.HostIP = pod.Status.HostIP
	}
	if workspace.Status.Runtime.PodName == "" && pod.Name != "" {
		workspace.Status.Runtime.PodName = pod.Name
	}

	workspace.Status.Runtime.PodIP = pod.Status.PodIP

	// Check if the node has disappeared. If so, ws-daemon has also disappeared and we need to
	// mark the workspace backup as failed if it didn't complete disposal yet.
	// Otherwise, the workspace will be stuck in the Stopping phase forever.
	if err := r.checkNodeDisappeared(ctx, workspace, pod); err != nil {
		return err
	}

	if workspace.Status.URL == "" {
		url, err := config.RenderWorkspaceURL(cfg.WorkspaceURLTemplate, workspace.Name, workspace.Spec.Ownership.WorkspaceID, cfg.GitpodHostURL)
		if err != nil {
			return xerrors.Errorf("cannot get workspace URL: %w", err)
		}
		workspace.Status.URL = url
	}

	if workspace.Status.OwnerToken == "" {
		ownerToken, err := getRandomString(32)
		if err != nil {
			return xerrors.Errorf("cannot create owner token: %w", err)
		}
		workspace.Status.OwnerToken = ownerToken
	}

	failure, phase := r.extractFailure(ctx, workspace, pod)
	if phase != nil {
		workspace.Status.Phase = *phase
	}

	if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
		// Check: is this a situation where we want to retry?
		if isPodRejected(pod) {
			if !workspace.IsConditionTrue(workspacev1.WorkspaceConditionEverReady) {
				// This is a situation where we want to re-create the pod!
				log.Info("workspace got rejected", "workspace", workspace.Name, "reason", failure)
				workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(failure, metav1.ConditionTrue))
				r.Recorder.Event(workspace, corev1.EventTypeWarning, "PodRejected", failure)
			} else {
				log.Info("workspace got rejected, but we don't handle it, because EverReady=true", "workspace", workspace.Name, "reason", failure)
			}
		}
	}

	if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
		var nodeName string
		if workspace.Status.Runtime != nil {
			nodeName = workspace.Status.Runtime.NodeName
		}
		// workspaces can fail only once - once there is a failed condition set, stick with it
		log.Info("workspace failed", "workspace", workspace.Name, "node", nodeName, "reason", failure)
		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionFailed(failure))
		r.Recorder.Event(workspace, corev1.EventTypeWarning, "Failed", failure)
	}

	if workspace.IsHeadless() && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionsHeadlessTaskFailed) {
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.State.Terminated != nil && cs.State.Terminated.Message != "" {
				workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionHeadlessTaskFailed(cs.State.Terminated.Message))
				break
			}
		}
	}

	if isWorkspaceContainerRunning(pod.Status.ContainerStatuses) {
		workspace.UpsertConditionOnStatusChange(workspacev1.NewWorkspaceConditionContainerRunning(metav1.ConditionTrue))
	} else {
		workspace.UpsertConditionOnStatusChange(workspacev1.NewWorkspaceConditionContainerRunning(metav1.ConditionFalse))
	}

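	// Derive the workspace phase from the pod's lifecycle phase and container statuses.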
	switch {
	case isPodBeingDeleted(pod):
		if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping && isDisposalFinished(workspace) {
			workspace.Status.Phase = workspacev1.WorkspacePhaseStopped
		} else if workspace.Status.Phase != workspacev1.WorkspacePhaseStopped {
			// Move to (or stay in) Stopping if not yet Stopped.
			workspace.Status.Phase = workspacev1.WorkspacePhaseStopping
		}

	case pod.Status.Phase == corev1.PodPending:
		var creating bool
		// check if any container is still pulling images
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.State.Waiting != nil {
				switch cs.State.Waiting.Reason {
				case "ContainerCreating", "ImagePullBackOff", "ErrImagePull":
					creating = true
				}

				if creating {
					break
				}
			}
		}
		if creating {
			workspace.Status.Phase = workspacev1.WorkspacePhaseCreating
		} else {
			workspace.Status.Phase = workspacev1.WorkspacePhasePending
		}

	case pod.Status.Phase == corev1.PodRunning:
		everReady := workspace.IsConditionTrue(workspacev1.WorkspaceConditionEverReady)
		if everReady {
			// If the workspace has been ready before, stay in the Running state, even
			// if the workspace container is not ready anymore. This is to avoid the workspace
			// moving back to Initializing and becoming unusable.
			workspace.Status.Phase = workspacev1.WorkspacePhaseRunning
		} else {
			contentReady := workspace.IsConditionTrue(workspacev1.WorkspaceConditionContentReady)
			var ideReady bool
			for _, cs := range pod.Status.ContainerStatuses {
				if cs.Ready {
					ideReady = true
					break
				}
			}
			ready := contentReady && ideReady

			if ready {
				// workspace is ready - hence content init is done
				workspace.Status.Phase = workspacev1.WorkspacePhaseRunning
				if !workspace.IsConditionTrue(workspacev1.WorkspaceConditionEverReady) {
					workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionEverReady())
				}
			} else {
				// workspace has not become ready yet - it must still be initializing.
				workspace.Status.Phase = workspacev1.WorkspacePhaseInitializing
			}
		}

	case workspace.IsHeadless() && (pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed):
		if pod.Status.Phase == corev1.PodSucceeded && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionEverReady) {
			// Fix for prebuilds that succeed instantly (e.g. an empty task): sometimes we don't observe the
			// workspace `Running` phase for these and never had the opportunity to add the EverReady condition.
			// This would then cause a "start failure" in the metrics, so we retroactively add the EverReady
			// condition here if the pod succeeded.
			workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionEverReady())
		}

		if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping && isDisposalFinished(workspace) {
			workspace.Status.Phase = workspacev1.WorkspacePhaseStopped
		} else if workspace.Status.Phase != workspacev1.WorkspacePhaseStopped {
			// Should be in the Stopping phase, but isn't yet.
			// Move to Stopping to start disposal, but only if maintenance mode is disabled.
			if !r.maintenance.IsEnabled(ctx) {
				workspace.Status.Phase = workspacev1.WorkspacePhaseStopping
			}
		}

	case pod.Status.Phase == corev1.PodUnknown:
		workspace.Status.Phase = workspacev1.WorkspacePhaseUnknown

	default:
		log.Info("cannot determine workspace phase", "podStatus", pod.Status)
		workspace.Status.Phase = workspacev1.WorkspacePhaseUnknown
	}

	return nil
}

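// checkNodeDisappeared checks whether the node the workspace pod was scheduled on still exists.
// If the node is gone and disposal has not finished, it marks the backup as failed and sets the
// NodeDisappeared condition so the workspace can still reach the Stopped phase.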
func (r *WorkspaceReconciler) checkNodeDisappeared(ctx context.Context, workspace *workspacev1.Workspace, pod *corev1.Pod) (err error) {
	span, ctx := tracing.FromContext(ctx, "checkNodeDisappeared")
	defer tracing.FinishSpan(span, &err)

	if pod.Spec.NodeName == "" {
		// Not yet scheduled.
		return nil
	}

	var node corev1.Node
	err = r.Get(ctx, types.NamespacedName{Namespace: "", Name: pod.Spec.NodeName}, &node)
	if err == nil || !errors.IsNotFound(err) {
		return err
	}

	// If NodeDisappeared is already set, return early; we've already made the checks below.
	if workspace.IsConditionTrue(workspacev1.WorkspaceConditionNodeDisappeared) {
		return nil
	}

	if !isDisposalFinished(workspace) {
		// Node disappeared before a backup could be taken, mark it with a backup failure.
		log.FromContext(ctx).Error(nil, "workspace node disappeared while disposal has not finished yet", "node", pod.Spec.NodeName)
		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionBackupFailure("workspace node disappeared before backup was taken"))
	}

	// Must set this after checking isDisposalFinished, as that method also checks for the NodeDisappeared condition.
	workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionNodeDisappeared())
	return nil
}

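// isDisposalFinished reports whether content disposal for the workspace has completed,
// either because a backup (or, for rejected pods, a state wipe) finished, or because
// there is nothing to dispose.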
func isDisposalFinished(ws *workspacev1.Workspace) bool {
	if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
		if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionStateWiped)); c != nil {
			// If the condition is set, we are done with the disposal.
			return true
		}
		// If the condition has not yet been set, we are not done yet.
		return false
	}

	return ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete) ||
		ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure) ||
		ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted) ||
		// Nothing to dispose if content wasn't ready.
		!ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady) ||
		// Can't dispose if node disappeared.
		ws.IsConditionTrue(workspacev1.WorkspaceConditionNodeDisappeared) ||
		// Image builds have nothing to dispose.
		ws.Spec.Type == workspacev1.WorkspaceTypeImageBuild
}

// extractFailure returns a pod failure reason and possibly a phase. If phase is nil then
// one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
// This failure is then stored in the Failed condition on the workspace.
func (r *WorkspaceReconciler) extractFailure(ctx context.Context, ws *workspacev1.Workspace, pod *corev1.Pod) (string, *workspacev1.WorkspacePhase) {
	// Check for content init failure.
	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c != nil {
		if c.Status == metav1.ConditionFalse && c.Reason == workspacev1.ReasonInitializationFailure {
			msg := c.Message
			if msg == "" {
				msg = "Content initialization failed for an unknown reason"
			} else {
				msg = fmt.Sprintf("Content initialization failed: %s", msg)
			}
			return msg, nil
		}
	}

	// Check for backup failure.
	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)); c != nil {
		msg := c.Message
		if msg == "" {
			msg = "Backup failed for an unknown reason"
		} else {
			msg = fmt.Sprintf("Backup failed: %s", msg)
		}
		return msg, nil
	}

	// Check for state wiping failure.
	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionStateWiped)); c != nil && c.Status == metav1.ConditionFalse {
		msg := c.Message
		if msg == "" {
			msg = "Wiping workspace state failed for an unknown reason"
		} else {
			msg = fmt.Sprintf("Wiping workspace state failed: %s", msg)
		}
		return msg, nil
	}

	status := pod.Status
	if status.Phase == corev1.PodFailed && (status.Reason != "" || status.Message != "") {
		// Don't force the phase to UNKNOWN here to leave a chance that we may detect the actual phase of
		// the workspace, e.g. stopping.
		return fmt.Sprintf("%s: %s", status.Reason, status.Message), nil
	}

	for _, cs := range status.ContainerStatuses {
		if cs.State.Waiting != nil {
			if cs.State.Waiting.Reason == "ImagePullBackOff" || cs.State.Waiting.Reason == "ErrImagePull" {
				// If the image pull failed we were definitely in the api.WorkspacePhase_CREATING phase,
				// unless of course this pod has been deleted already.
				var res *workspacev1.WorkspacePhase
				if isPodBeingDeleted(pod) {
					// The pod is being deleted already and we have to decide the phase based on the presence of the
					// finalizer and disposal status annotation. That code already exists in the remainder of getStatus,
					// hence we defer the decision.
					res = nil
				} else {
					c := workspacev1.WorkspacePhaseCreating
					res = &c
				}
				return fmt.Sprintf("cannot pull image: %s", cs.State.Waiting.Message), res
			}
		}

		terminationState := cs.State.Terminated
		if terminationState == nil {
			terminationState = cs.LastTerminationState.Terminated
		}
		if terminationState != nil {
			// A terminated workspace container is not necessarily bad. During shutdown, workspace containers
			// can go into this state and that's ok. However, if the workspace was shutting down due to deletion,
			// we would not be here as we've checked for a DeletionTimestamp prior. So let's find out why the
			// container is terminating.
			if terminationState.ExitCode != 0 && terminationState.Message != "" {
				var phase *workspacev1.WorkspacePhase
				if !isPodBeingDeleted(pod) {
					// If the container wrote a termination message and is not currently being deleted,
					// then it must have been/be running. If we did not force the phase here,
					// we'd be in unknown.
					running := workspacev1.WorkspacePhaseRunning
					phase = &running
				}

				if terminationState.ExitCode == containerKilledExitCode && terminationState.Reason == "ContainerStatusUnknown" {
					// For some reason, the pod is killed with unknown container status and no taints on the underlying node.
					// Therefore, we skip extracting the failure from the terminated message.
					// ref: https://github.com/gitpod-io/gitpod/issues/12021
					var node corev1.Node
					if ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "" {
						if err := r.Get(ctx, types.NamespacedName{Namespace: "", Name: ws.Status.Runtime.NodeName}, &node); err == nil && len(node.Spec.Taints) == 0 {
							return "", nil
						}
					}
				}

				if ws.IsHeadless() && strings.HasPrefix(terminationState.Message, headlessTaskFailedPrefix) {
					// Headless task failed, not a workspace failure.
					return "", nil
				}

				// the container itself told us why it was terminated - use that as failure reason
				return extractFailureFromLogs([]byte(terminationState.Message)), phase
			} else if terminationState.Reason == "Error" {
				if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerKilledExitCode {
					phase := workspacev1.WorkspacePhaseRunning
					return fmt.Sprintf("container %s ran with an error: exit code %d", cs.Name, terminationState.ExitCode), &phase
				}
			} else if terminationState.Reason == "Completed" && !isPodBeingDeleted(pod) {
				// Headless workspaces are expected to finish.
				if !ws.IsHeadless() {
					return fmt.Sprintf("container %s completed; containers of a workspace pod are not supposed to do that", cs.Name), nil
				}
			} else if !isPodBeingDeleted(pod) && terminationState.ExitCode == containerUnknownExitCode {
				return fmt.Sprintf("workspace container %s terminated for an unknown reason: (%s) %s", cs.Name, terminationState.Reason, terminationState.Message), nil
			} else if !isPodBeingDeleted(pod) {
				// if a container is terminated and it wasn't because of either:
				//   - regular shutdown
				//   - the exit code "UNKNOWN" (which might be caused by an intermittent issue)
				//   - another known error
				// then we report it as UNKNOWN
				phase := workspacev1.WorkspacePhaseUnknown
				return fmt.Sprintf("workspace container %s terminated for an unknown reason: (%s) %s", cs.Name, terminationState.Reason, terminationState.Message), &phase
			}
		}
	}

	return "", nil
}

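// isWorkspaceContainerRunning reports whether the container named "workspace" in the given
// statuses is currently running.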
func isWorkspaceContainerRunning(statuses []corev1.ContainerStatus) bool {
	for _, cs := range statuses {
		if cs.Name == "workspace" {
			if cs.State.Running != nil {
				return true
			}
			break
		}
	}
	return false
}

// extractFailureFromLogs attempts to extract the last error message from a workspace
// container's log output.
func extractFailureFromLogs(logs []byte) string {
	var sep = []byte("\n")
	var msg struct {
		Error   string `json:"error"`
		Message string `json:"message"`
	}

	var nidx int
	for idx := bytes.LastIndex(logs, sep); idx > 0; idx = nidx {
		nidx = bytes.LastIndex(logs[:idx], sep)
		if nidx < 0 {
			nidx = 0
		}

		line := logs[nidx:idx]
		err := json.Unmarshal(line, &msg)
		if err != nil {
			continue
		}

		if msg.Message == "" {
			continue
		}

		if msg.Error == "" {
			return msg.Message
		}

		return msg.Message + ": " + msg.Error
	}

	return string(logs)
}

// isPodBeingDeleted returns true if the pod is currently being deleted
func isPodBeingDeleted(pod *corev1.Pod) bool {
	// if the pod is being deleted the only marker we have is that the deletionTimestamp is set
	return pod.ObjectMeta.DeletionTimestamp != nil
}

// isWorkspaceBeingDeleted returns true if the workspace resource is currently being deleted.
func isWorkspaceBeingDeleted(ws *workspacev1.Workspace) bool {
	return ws.ObjectMeta.DeletionTimestamp != nil
}

// isPodRejected returns true if the pod has been rejected by the kubelet
func isPodRejected(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == podRejectedReasonNodeAffinity || pod.Status.Reason == podRejectedReasonOutOfCPU || pod.Status.Reason == podRejectedReasonOutOfMemory) && strings.HasPrefix(pod.Status.Message, "Pod was rejected")
}