GitHub Repository: gitpod-io/gitpod
Path: blob/main/components/ws-daemon/pkg/daemon/markunmount.go
// Copyright (c) 2021 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package daemon

import (
	"bufio"
	"bytes"
	"context"
	"errors"
	"io/ioutil"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"golang.org/x/sync/errgroup"
	"golang.org/x/sys/unix"
	"golang.org/x/xerrors"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/retry"

	"github.com/gitpod-io/gitpod/common-go/log"
	"github.com/gitpod-io/gitpod/ws-daemon/pkg/dispatch"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	// propagationGracePeriod is the time we allow on top of a container's deletionGracePeriod
	// to make sure the changes propagate on the data plane.
	propagationGracePeriod = 10 * time.Second
)

// NewMarkUnmountFallback produces a new MarkUnmountFallback. reg can be nil.
func NewMarkUnmountFallback(reg prometheus.Registerer) (*MarkUnmountFallback, error) {
	counter := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "markunmountfallback_active_total",
		Help: "counts how often the mark unmount fallback was active",
	}, []string{"successful"})
	if reg != nil {
		err := reg.Register(counter)
		if err != nil {
			return nil, err
		}
	}

	return &MarkUnmountFallback{
		activityCounter: counter,
	}, nil
}
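
// Illustrative sketch (hypothetical helper, for demonstration only): constructing the
// fallback with a Prometheus registry. Passing nil instead of a registry simply skips
// metric registration.
func exampleNewMarkUnmountFallback() (*MarkUnmountFallback, error) {
	reg := prometheus.NewRegistry()
	fallback, err := NewMarkUnmountFallback(reg)
	if err != nil {
		return nil, err
	}
	// The fallback acts as a dispatch listener (see WorkspaceAdded/WorkspaceUpdated below),
	// so the ws-daemon dispatch can notify it about workspace pod lifecycle events.
	return fallback, nil
}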

// MarkUnmountFallback works around the mount propagation of the ring1 FS mark mount.
// When ws-daemon restarts, runc propagates all rootfs mounts to ws-daemon's mount namespace.
// This prevents proper unmounting of the mark mount, and hence of the workspace container's rootfs.
//
// To work around this issue we wait pod.terminationGracePeriod + propagationGracePeriod,
// after which we attempt to unmount the mark mount.
//
// Some clusters might run an older version of containerd, for which we build this workaround.
type MarkUnmountFallback struct {
	mu      sync.Mutex
	handled map[string]struct{}

	activityCounter *prometheus.CounterVec
}

// WorkspaceAdded does nothing, but implements the dispatch.Listener interface.
func (c *MarkUnmountFallback) WorkspaceAdded(ctx context.Context, ws *dispatch.Workspace) error {
	return nil
}

// WorkspaceUpdated gets called when a workspace pod is updated. For containers being deleted, we'll check
// if they're still running after their terminationGracePeriod and whether Kubernetes still knows about them.
func (c *MarkUnmountFallback) WorkspaceUpdated(ctx context.Context, ws *dispatch.Workspace) error {
	if ws.Pod.DeletionTimestamp == nil {
		return nil
	}

	// ensure we handle each workspace instance at most once
	err := func() error {
		c.mu.Lock()
		defer c.mu.Unlock()

		if c.handled == nil {
			c.handled = make(map[string]struct{})
		}
		if _, exists := c.handled[ws.InstanceID]; exists {
			return nil
		}
		c.handled[ws.InstanceID] = struct{}{}
		return nil
	}()
	if err != nil {
		return err
	}

	var gracePeriod int64
	if ws.Pod.DeletionGracePeriodSeconds != nil {
		gracePeriod = *ws.Pod.DeletionGracePeriodSeconds
	} else {
		gracePeriod = 30
	}
	ttl := time.Duration(gracePeriod)*time.Second + propagationGracePeriod

	dispatch.GetDispatchWaitGroup(ctx).Add(1)
	go func() {
		defer dispatch.GetDispatchWaitGroup(ctx).Done()

		defer func() {
			// We expect the container to be gone now. Don't keep its reference in memory.
			c.mu.Lock()
			delete(c.handled, ws.InstanceID)
			c.mu.Unlock()
		}()

		wait := time.NewTicker(ttl)
		defer wait.Stop()
		select {
		case <-ctx.Done():
			return
		case <-wait.C:
		}

		dsp := dispatch.GetFromContext(ctx)
		if !dsp.WorkspaceExistsOnNode(ws.InstanceID) {
			// container is already gone - all is well
			return
		}

		err := unmountMark(ws.InstanceID)
		if err != nil && !errors.Is(err, context.Canceled) {
			log.WithFields(ws.OWI()).WithError(err).Error("cannot unmount mark mount from within ws-daemon")
			c.activityCounter.WithLabelValues("false").Inc()
		} else {
			c.activityCounter.WithLabelValues("true").Inc()
		}
	}()

	return nil
}

// unmountMark unmounts the mark mount of the given workspace instance.
// If the mark mount still exists in /proc/mounts it means we failed to unmount it and
// we cannot remove the content. As a side effect the pod will stay in Terminating state.
func unmountMark(instanceID string) error {
	mounts, err := ioutil.ReadFile("/proc/mounts")
	if err != nil {
		return xerrors.Errorf("cannot read /proc/mounts: %w", err)
	}

	dir := instanceID + "-daemon"
	path := fromPartialMount(filepath.Join(dir, "mark"), mounts)
	// an empty result means no mark mount was found
	if len(path) == 0 {
		return nil
	}

	// in some scenarios we need to wait for the unmount
	var canRetryFn = func(err error) bool {
		if !strings.Contains(err.Error(), "device or resource busy") {
			log.WithError(err).WithFields(log.OWI("", "", instanceID)).Info("Will not retry unmount mark")
		}
		return strings.Contains(err.Error(), "device or resource busy")
	}

	var eg errgroup.Group
	for _, p := range path {
		// capture p in a per-iteration variable so the goroutine closure does not share the loop variable
		p := p
		eg.Go(func() error {
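			// With Steps=5, Duration=1s, Factor=5.0 and Jitter=0.1, retry.OnError makes up
			// to five unmount attempts, sleeping roughly 1s, 5s, 25s and 125s (plus up to
			// 10% jitter) between attempts before giving up and returning the last error.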
			return retry.OnError(wait.Backoff{
				Steps:    5,
				Duration: 1 * time.Second,
				Factor:   5.0,
				Jitter:   0.1,
			}, canRetryFn, func() error {
				return unix.Unmount(p, 0)
			})
		})
	}
	return eg.Wait()
}

// fromPartialMount returns the mount points in the given /proc/mounts content whose
// path contains the given partial path.
func fromPartialMount(path string, info []byte) (res []string) {
	scanner := bufio.NewScanner(bytes.NewReader(info))
	for scanner.Scan() {
		// /proc/mounts fields are space separated; the second field is the mount point
		mount := strings.Split(scanner.Text(), " ")
		if len(mount) < 2 {
			continue
		}

		if strings.Contains(mount[1], path) {
			res = append(res, mount[1])
		}
	}

	return res
}
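
// Illustrative example (hypothetical input, for demonstration only): the mount paths
// below are made up; real workspace mount points depend on the node's layout.
// Only the first line matches, because only its mount point (the second field)
// contains "abc123-daemon/mark", so the result is
// []string{"/workspaces/abc123-daemon/mark"}.
func exampleFromPartialMount() []string {
	mounts := []byte(
		"shiftfs /workspaces/abc123-daemon/mark shiftfs rw,relatime 0 0\n" +
			"proc /proc proc rw,nosuid,nodev,noexec 0 0\n")
	return fromPartialMount(filepath.Join("abc123-daemon", "mark"), mounts)
}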