Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/components/workspacekit/pkg/seccomp/notify.go
2500 views
1
// Copyright (c) 2021 Gitpod GmbH. All rights reserved.
2
// Licensed under the GNU Affero General Public License (AGPL).
3
// See License.AGPL.txt in the project root for license information.
4
5
package seccomp
6
7
import (
8
"context"
9
"errors"
10
"fmt"
11
"io"
12
"io/fs"
13
"os"
14
"path/filepath"
15
"strconv"
16
"strings"
17
"syscall"
18
"time"
19
20
"github.com/moby/sys/mountinfo"
21
"golang.org/x/sys/unix"
22
"golang.org/x/xerrors"
23
24
"github.com/gitpod-io/gitpod/common-go/log"
25
"github.com/gitpod-io/gitpod/workspacekit/pkg/readarg"
26
daemonapi "github.com/gitpod-io/gitpod/ws-daemon/api"
27
libseccomp "github.com/seccomp/libseccomp-golang"
28
)
29
30
// syscallHandler is the function signature shared by all notification handlers.
// It receives the notification request and returns the val, errno and flags
// that Handle copies into the notification response.
type syscallHandler func(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)
31
32
// SyscallHandler handles seccomp syscall notifications
33
type SyscallHandler interface {
34
Mount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)
35
Umount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)
36
Bind(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)
37
Chown(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)
38
}
39
40
func mapHandler(h SyscallHandler) map[string]syscallHandler {
41
return map[string]syscallHandler{
42
"mount": h.Mount,
43
"umount": h.Umount,
44
"umount2": h.Umount,
45
"bind": h.Bind,
46
"chown": h.Chown,
47
}
48
}
49
50
// LoadFilter loads the syscall filter required to make the handler work.
51
// Calling this function has a range of side-effects:
52
// - we'll lock the caller using `runtime.LockOSThread()`
53
// - we'll set no_new_privs on the process
54
func LoadFilter() (libseccomp.ScmpFd, error) {
55
filter, err := libseccomp.NewFilter(libseccomp.ActAllow)
56
if err != nil {
57
return 0, xerrors.Errorf("cannot create filter: %w", err)
58
}
59
err = filter.SetTsync(false)
60
if err != nil {
61
return 0, xerrors.Errorf("cannot set tsync: %w", err)
62
}
63
err = filter.SetNoNewPrivsBit(false)
64
if err != nil {
65
return 0, xerrors.Errorf("cannot set no_new_privs: %w", err)
66
}
67
68
// we explicitly prohibit open_tree/move_mount to prevent container workloads
69
// from moving a proc mask using open_tree(..., CLONE|RECURSIVE).
70
deniedSyscalls := []string{
71
"open_tree",
72
"move_mount",
73
}
74
for _, sc := range deniedSyscalls {
75
syscallID, err := libseccomp.GetSyscallFromName(sc)
76
if err != nil {
77
return 0, xerrors.Errorf("unknown syscall %s: %w", sc, err)
78
}
79
err = filter.AddRule(syscallID, libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)))
80
if err != nil {
81
return 0, xerrors.Errorf("cannot add rule for %s: %w", sc, err)
82
}
83
}
84
85
handledSyscalls := mapHandler(&InWorkspaceHandler{})
86
for sc := range handledSyscalls {
87
syscallID, err := libseccomp.GetSyscallFromName(sc)
88
if err != nil {
89
return 0, xerrors.Errorf("unknown syscall %s: %w", sc, err)
90
}
91
err = filter.AddRule(syscallID, libseccomp.ActNotify)
92
if err != nil {
93
return 0, xerrors.Errorf("cannot add rule for %s: %w", sc, err)
94
}
95
}
96
97
err = filter.Load()
98
if err != nil {
99
return 0, xerrors.Errorf("cannot load filter: %w", err)
100
}
101
102
fd, err := filter.GetNotifFd()
103
if err != nil {
104
return 0, xerrors.Errorf("cannot get inotif fd: %w", err)
105
}
106
107
return fd, nil
108
}
109
110
// Backoff configuration used when interacting with the in-workspace service (IWS).
const (
	// iwsBackoffInitialWait is the pause after the first failed attempt.
	iwsBackoffInitialWait = 10 * time.Millisecond
	// iwsBackoffSteps is the maximum number of attempts.
	iwsBackoffSteps = 6
	// iwsBackoffFactor is the factor the pause grows by after each failed attempt.
	iwsBackoffFactor = 5
	// iwsBackoffMaxWait is the upper bound for the pause between two attempts.
	iwsBackoffMaxWait = 2500 * time.Millisecond
)
117
118
// Handle actually listens on the seccomp notif FD and handles incoming requests.
119
// This function returns when the notif FD is closed.
120
func Handle(fd libseccomp.ScmpFd, handler SyscallHandler, wsid string) (stop chan<- struct{}, errchan <-chan error) {
121
log := log.WithField("workspaceId", wsid)
122
123
ec := make(chan error)
124
stp := make(chan struct{})
125
126
handledSyscalls := mapHandler(handler)
127
go func() {
128
for {
129
req, err := libseccomp.NotifReceive(fd)
130
if err != nil {
131
if err == syscall.ENOENT {
132
log.WithError(err).Warn("failed to get notification beucase it has already been not valid anymore(the kernel sets that)")
133
continue
134
}
135
136
log.WithError(err).Error("failed to get notification")
137
ec <- err
138
if err == unix.ECANCELED {
139
return
140
}
141
142
continue
143
}
144
select {
145
case <-stp:
146
// if we're asked stop we might still have to answer a syscall.
147
// We do this on a best effort basis answering with EPERM.
148
if err != nil {
149
_ = libseccomp.NotifRespond(fd, &libseccomp.ScmpNotifResp{
150
ID: req.ID,
151
Error: 1,
152
Val: 0,
153
Flags: 0,
154
})
155
}
156
default:
157
}
158
159
go func() {
160
syscallName, _ := req.Data.Syscall.GetName()
161
162
handler, ok := handledSyscalls[syscallName]
163
if !ok {
164
handler = handleUnknownSyscall
165
}
166
val, errno, flags := handler(req)
167
168
ierr := libseccomp.NotifRespond(fd, &libseccomp.ScmpNotifResp{
169
ID: req.ID,
170
Error: errno,
171
Val: val,
172
Flags: flags,
173
})
174
if ierr != nil {
175
log.WithError(ierr).Error("failed to return notification response")
176
ec <- ierr
177
}
178
}()
179
}
180
}()
181
182
return stp, ec
183
}
184
185
// handleUnknownSyscall is the fallback for notifications whose syscall has no
// entry in the handler table. It logs a warning and answers with error code 1
// (EPERM), a zero value and no flags.
func handleUnknownSyscall(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {
	// The lookup error is ignored: the name is only used for the log line.
	name, _ := req.Data.Syscall.GetName()
	log.WithField("syscall", name).Warn("don't know how to handle this syscall")

	errno = int32(unix.EPERM)
	return val, errno, flags
}
190
191
// Errno builds the handler result that makes the intercepted syscall fail with err.
//
// The returned val has all bits set (-1 when read as a signed value), errno
// carries err and flags is empty.
//
// The previous implementation converted the (always zero) named result `errno`
// instead of the `err` parameter, so the requested error code was dropped.
func Errno(err unix.Errno) (val uint64, errno int32, flags uint32) {
	return ^uint64(0), int32(err), 0
}
194
195
// IWSClientProvider provides a client to the in-workspace-service.
196
// Consumers of this provider will close the client after use.
197
type IWSClientProvider func(ctx context.Context) (InWorkspaceServiceClient, error)
198
199
type InWorkspaceServiceClient interface {
200
daemonapi.InWorkspaceServiceClient
201
io.Closer
202
}
203
204
// InWorkspaceHandler is the seccomp notification handler that serves a Gitpod workspace
205
type InWorkspaceHandler struct {
206
FD libseccomp.ScmpFd
207
Daemon IWSClientProvider
208
Ring2PID int
209
Ring2Rootfs string
210
BindEvents chan<- BindEvent
211
WorkspaceId string
212
}
213
214
// BindEvent describes a process binding to a socket.
type BindEvent struct {
	// PID is the process ID taken from the bind notification request.
	PID uint32
}
218
219
// Mount handles mount syscalls
220
func (h *InWorkspaceHandler) Mount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {
221
log := log.WithFields(map[string]interface{}{
222
"syscall": "mount",
223
log.WorkspaceIDField: h.WorkspaceId,
224
"pid": req.Pid,
225
"id": req.ID,
226
})
227
228
memFile, err := readarg.OpenMem(req.Pid)
229
if err != nil {
230
log.WithError(err).Error("cannot open mem")
231
return Errno(unix.EPERM)
232
}
233
defer memFile.Close()
234
235
err = libseccomp.NotifIDValid(h.FD, req.ID)
236
if err != nil {
237
log.WithError(err).Error("invalid notify ID", req.ID)
238
return Errno(unix.EPERM)
239
}
240
241
source, err := readarg.ReadString(memFile, int64(req.Data.Args[0]))
242
if err != nil {
243
log.WithField("arg", 0).WithError(err).Error("cannot read argument")
244
return Errno(unix.EFAULT)
245
}
246
dest, err := readarg.ReadString(memFile, int64(req.Data.Args[1]))
247
if err != nil {
248
log.WithField("arg", 1).WithError(err).Error("cannot read argument")
249
return Errno(unix.EFAULT)
250
}
251
filesystem, err := readarg.ReadString(memFile, int64(req.Data.Args[2]))
252
if err != nil {
253
log.WithField("arg", 2).WithError(err).Error("cannot read argument")
254
return Errno(unix.EFAULT)
255
}
256
257
var args string
258
if len(req.Data.Args) >= 5 && filesystem == "nfs4" {
259
args, err = readarg.ReadString(memFile, int64(req.Data.Args[4]))
260
log.WithField("arg", 4).WithError(err).Error("cannot read argument")
261
}
262
263
log.WithFields(map[string]interface{}{
264
"source": source,
265
"dest": dest,
266
"fstype": filesystem,
267
"args": args,
268
}).Info("handling mount syscall")
269
270
if filesystem == "proc" || filesystem == "sysfs" || filesystem == "nfs4" {
271
// When a process wants to mount proc relative to `/proc/self` that path has no meaning outside of the processes' context.
272
// runc started doing this in https://github.com/opencontainers/runc/commit/0ca91f44f1664da834bc61115a849b56d22f595f
273
// TODO(cw): there must be a better way to handle this. Find one.
274
target := filepath.Join(h.Ring2Rootfs, dest)
275
if strings.HasPrefix(dest, "/proc/self/") {
276
target = filepath.Join("/proc", strconv.Itoa(int(req.Pid)), strings.TrimPrefix(dest, "/proc/self/"))
277
}
278
if strings.HasPrefix(dest, "/proc/thread-self/") {
279
target = filepath.Join("/proc", strconv.Itoa(int(req.Pid)), strings.TrimPrefix(dest, "/proc/thread-self/"))
280
}
281
stat, err := os.Lstat(target)
282
if errors.Is(err, fs.ErrNotExist) {
283
err = os.MkdirAll(target, 0755)
284
}
285
if err != nil {
286
log.WithField("target", target).WithField("dest", dest).WithError(err).Error("cannot stat mountpoint")
287
return Errno(unix.EFAULT)
288
}
289
if stat != nil {
290
if stat.Mode()&os.ModeSymlink != 0 {
291
// The symlink is already expressed relative to the ring2 mount namespace, no need to faff with the rootfs paths.
292
// In case this was a /proc relative symlink, we'll have that symlink resolved here, hence make it work in the mount namespace of ring2.
293
dest, err = os.Readlink(target)
294
if err != nil {
295
log.WithField("target", target).WithField("dest", dest).WithError(err).Errorf("cannot resolve %s mount target symlink", filesystem)
296
return Errno(unix.EFAULT)
297
}
298
} else if stat.Mode()&os.ModeDir == 0 {
299
log.WithField("target", target).WithField("dest", dest).WithField("mode", stat.Mode()).WithError(err).Errorf("%s must be mounted on an ordinary directory", filesystem)
300
return Errno(unix.EPERM)
301
}
302
}
303
304
wait := iwsBackoffInitialWait
305
for i := 0; i < iwsBackoffSteps; i++ {
306
err = func() error {
307
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
308
defer cancel()
309
iws, err := h.Daemon(ctx)
310
if err != nil {
311
log.WithField("target", target).WithField("dest", dest).WithField("mode", stat.Mode()).WithError(err).Errorf("cannot get IWS client to mount %s", filesystem)
312
return err
313
}
314
defer iws.Close()
315
316
call := iws.MountProc
317
if filesystem == "sysfs" {
318
call = iws.MountSysfs
319
}
320
321
if filesystem == "sysfs" || filesystem == "proc" {
322
_, err = call(ctx, &daemonapi.MountProcRequest{
323
Target: dest,
324
Pid: int64(req.Pid),
325
})
326
} else if filesystem == "nfs4" {
327
_, err = iws.MountNfs(ctx, &daemonapi.MountNfsRequest{
328
Source: source,
329
Target: dest,
330
Args: args,
331
Pid: int64(req.Pid),
332
})
333
}
334
335
if err != nil {
336
log.WithField("target", dest).WithError(err).Errorf("cannot mount %s", filesystem)
337
return err
338
}
339
return nil
340
}()
341
if err != nil {
342
time.Sleep(wait)
343
wait = wait * iwsBackoffFactor
344
if wait > iwsBackoffMaxWait {
345
wait = iwsBackoffMaxWait
346
}
347
} else {
348
break
349
}
350
351
}
352
if err != nil {
353
// We've already logged the reason above
354
return Errno(unix.EFAULT)
355
}
356
357
return 0, 0, 0
358
}
359
360
// let the kernel do the work
361
return 0, 0, libseccomp.NotifRespFlagContinue
362
}
363
364
// Umount handles umount and umount2 syscalls
365
func (h *InWorkspaceHandler) Umount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {
366
nme, _ := req.Data.Syscall.GetName()
367
log := log.WithFields(map[string]interface{}{
368
"syscall": nme,
369
log.WorkspaceIDField: h.WorkspaceId,
370
"pid": req.Pid,
371
"id": req.ID,
372
})
373
374
memFile, err := readarg.OpenMem(req.Pid)
375
if err != nil {
376
log.WithError(err).Error("cannot open mem")
377
return Errno(unix.EPERM)
378
}
379
defer memFile.Close()
380
381
target, err := readarg.ReadString(memFile, int64(req.Data.Args[0]))
382
if err != nil {
383
log.WithField("arg", 0).WithError(err).Error("cannot read argument")
384
return Errno(unix.EFAULT)
385
}
386
target = strings.TrimSuffix(target, "/")
387
388
fd, err := os.Open(fmt.Sprintf("/proc/%d/mountinfo", req.Pid))
389
if err != nil {
390
log.WithError(err).Error("cannot read mountinfo")
391
return Errno(unix.EFAULT)
392
}
393
defer fd.Close()
394
mnts, err := mountinfo.GetMountsFromReader(fd, func(i *mountinfo.Info) (skip bool, stop bool) { return false, false })
395
if err != nil {
396
log.WithError(err).Error("cannot parse mountinfo")
397
return Errno(unix.EFAULT)
398
}
399
400
procMounts := make(map[string]struct{})
401
for _, mnt := range mnts {
402
if mnt.FSType == "proc" {
403
procMounts[mnt.Mountpoint] = struct{}{}
404
}
405
}
406
407
if _, ok := procMounts[target]; ok {
408
// ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
409
// defer cancel()
410
// _, err = h.Daemon.UmountProc(ctx, &daemonapi.UmountProcRequest{
411
// Target: target,
412
// Pid: int64(req.Pid),
413
// })
414
// if err != nil {
415
// log.WithError(err).Error("cannot umount proc mount")
416
// return Errno(unix.EFAULT)
417
// }
418
419
// log.WithField("target", target).Info("umounted proc mount")
420
// return 0, 0, 0
421
422
// proc umounting doesn't work yet from ws-daemon. Instead EPERM here.
423
// In most cases that's not a problem because in-workspace proc mounts
424
// usually happen within a mount namespace anyways, for which the kernel
425
// lazy umounts everything that's just attached within that namespace.
426
// TODO(cw): make proc umounting work in ws-dameon.
427
return Errno(unix.EPERM)
428
}
429
430
var isProcMountChild bool
431
for procMount := range procMounts {
432
if strings.HasPrefix(target, procMount) {
433
isProcMountChild = true
434
break
435
}
436
}
437
if isProcMountChild {
438
log.WithField("target", target).Warn("user attempted to umount proc mask")
439
return Errno(unix.EPERM)
440
}
441
442
// let the kernel do the work
443
return 0, 0, libseccomp.NotifRespFlagContinue
444
}
445
446
func (h *InWorkspaceHandler) Bind(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {
447
log := log.WithFields(map[string]interface{}{
448
"syscall": "bind",
449
log.WorkspaceIDField: h.WorkspaceId,
450
"pid": req.Pid,
451
"id": req.ID,
452
})
453
// We want the syscall to succeed, no matter what we do in this handler.
454
// The Kernel will execute the syscall for us.
455
defer func() {
456
val = 0
457
errno = 0
458
flags = libseccomp.NotifRespFlagContinue
459
}()
460
461
memFile, err := readarg.OpenMem(req.Pid)
462
if err != nil {
463
log.WithError(err).Error("cannot open mem")
464
return
465
}
466
defer memFile.Close()
467
468
// TODO(cw): find why this breaks
469
// err = libseccomp.NotifIDValid(fd, req.ID)
470
// if err != nil {
471
// log.WithError(err).Error("invalid notif ID")
472
// return returnErrno(unix.EPERM)
473
// }
474
475
evt := BindEvent{PID: req.Pid}
476
select {
477
case h.BindEvents <- evt:
478
default:
479
}
480
481
// socketFdB, err := readarg.ReadBytes(memFile, int64(req.Data.Args[0]), int(req.Data.Args[1]-req.Data.Args[0]))
482
// if err != nil {
483
// log.WithError(err).Error("cannot read socketfd arg")
484
// }
485
486
// socketfd := nativeEndian.Uint64(socketFdB)
487
// unix.Getsockname()
488
489
return
490
}
491
492
func (h *InWorkspaceHandler) Chown(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {
493
log := log.WithFields(map[string]interface{}{
494
"syscall": "chown",
495
log.WorkspaceIDField: h.WorkspaceId,
496
"pid": req.Pid,
497
"id": req.ID,
498
})
499
500
memFile, err := readarg.OpenMem(req.Pid)
501
if err != nil {
502
log.WithError(err).Error("cannot open mem")
503
return
504
}
505
defer memFile.Close()
506
507
pth, err := readarg.ReadString(memFile, int64(req.Data.Args[0]))
508
if err != nil {
509
log.WithError(err).Error("cannot open mem")
510
return
511
}
512
513
if strings.HasPrefix(pth, "/dev/pts") {
514
return 0, 0, 0
515
}
516
517
return 0, 0, libseccomp.NotifRespFlagContinue
518
}
519
520
/*
521
var nativeEndian binary.ByteOrder
522
523
func init() {
524
buf := [2]byte{}
525
*(*uint16)(unsafe.Pointer(&buf[0])) = uint16(0xABCD)
526
527
switch buf {
528
case [2]byte{0xCD, 0xAB}:
529
nativeEndian = binary.LittleEndian
530
case [2]byte{0xAB, 0xCD}:
531
nativeEndian = binary.BigEndian
532
default:
533
panic("Could not determine native endianness.")
534
}
535
}
536
*/
537
538