Path: blob/main/components/workspacekit/pkg/seccomp/notify.go
2500 views
// Copyright (c) 2021 Gitpod GmbH. All rights reserved.1// Licensed under the GNU Affero General Public License (AGPL).2// See License.AGPL.txt in the project root for license information.34package seccomp56import (7"context"8"errors"9"fmt"10"io"11"io/fs"12"os"13"path/filepath"14"strconv"15"strings"16"syscall"17"time"1819"github.com/moby/sys/mountinfo"20"golang.org/x/sys/unix"21"golang.org/x/xerrors"2223"github.com/gitpod-io/gitpod/common-go/log"24"github.com/gitpod-io/gitpod/workspacekit/pkg/readarg"25daemonapi "github.com/gitpod-io/gitpod/ws-daemon/api"26libseccomp "github.com/seccomp/libseccomp-golang"27)2829type syscallHandler func(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)3031// SyscallHandler handles seccomp syscall notifications32type SyscallHandler interface {33Mount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)34Umount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)35Bind(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)36Chown(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32)37}3839func mapHandler(h SyscallHandler) map[string]syscallHandler {40return map[string]syscallHandler{41"mount": h.Mount,42"umount": h.Umount,43"umount2": h.Umount,44"bind": h.Bind,45"chown": h.Chown,46}47}4849// LoadFilter loads the syscall filter required to make the handler work.50// Calling this function has a range of side-effects:51// - we'll lock the caller using `runtime.LockOSThread()`52// - we'll set no_new_privs on the process53func LoadFilter() (libseccomp.ScmpFd, error) {54filter, err := libseccomp.NewFilter(libseccomp.ActAllow)55if err != nil {56return 0, xerrors.Errorf("cannot create filter: %w", err)57}58err = filter.SetTsync(false)59if err != nil {60return 0, xerrors.Errorf("cannot set tsync: %w", err)61}62err = filter.SetNoNewPrivsBit(false)63if err != nil {64return 0, xerrors.Errorf("cannot set no_new_privs: %w", err)65}6667// we explicitly prohibit open_tree/move_mount to prevent container workloads68// from moving a proc mask using open_tree(..., CLONE|RECURSIVE).69deniedSyscalls := []string{70"open_tree",71"move_mount",72}73for _, sc := range deniedSyscalls {74syscallID, err := libseccomp.GetSyscallFromName(sc)75if err != nil {76return 0, xerrors.Errorf("unknown syscall %s: %w", sc, err)77}78err = filter.AddRule(syscallID, libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)))79if err != nil {80return 0, xerrors.Errorf("cannot add rule for %s: %w", sc, err)81}82}8384handledSyscalls := mapHandler(&InWorkspaceHandler{})85for sc := range handledSyscalls {86syscallID, err := libseccomp.GetSyscallFromName(sc)87if err != nil {88return 0, xerrors.Errorf("unknown syscall %s: %w", sc, err)89}90err = filter.AddRule(syscallID, libseccomp.ActNotify)91if err != nil {92return 0, xerrors.Errorf("cannot add rule for %s: %w", sc, err)93}94}9596err = filter.Load()97if err != nil {98return 0, xerrors.Errorf("cannot load filter: %w", err)99}100101fd, err := filter.GetNotifFd()102if err != nil {103return 0, xerrors.Errorf("cannot get inotif fd: %w", err)104}105106return fd, nil107}108109const (110// IWS backoff is the backoff configuration we use for interacting with the in-workspace service111iwsBackoffInitialWait = 10 * time.Millisecond112iwsBackoffSteps = 6113iwsBackoffFactor = 5114iwsBackoffMaxWait = 2500 * time.Millisecond115)116117// Handle actually listens on the seccomp notif FD and handles incoming requests.118// This function returns when the notif FD is closed.119func Handle(fd libseccomp.ScmpFd, handler SyscallHandler, wsid string) (stop chan<- struct{}, errchan <-chan error) {120log := log.WithField("workspaceId", wsid)121122ec := make(chan error)123stp := make(chan struct{})124125handledSyscalls := mapHandler(handler)126go func() {127for {128req, err := libseccomp.NotifReceive(fd)129if err != nil {130if err == syscall.ENOENT {131log.WithError(err).Warn("failed to get notification beucase it has already been not valid anymore(the kernel sets that)")132continue133}134135log.WithError(err).Error("failed to get notification")136ec <- err137if err == unix.ECANCELED {138return139}140141continue142}143select {144case <-stp:145// if we're asked stop we might still have to answer a syscall.146// We do this on a best effort basis answering with EPERM.147if err != nil {148_ = libseccomp.NotifRespond(fd, &libseccomp.ScmpNotifResp{149ID: req.ID,150Error: 1,151Val: 0,152Flags: 0,153})154}155default:156}157158go func() {159syscallName, _ := req.Data.Syscall.GetName()160161handler, ok := handledSyscalls[syscallName]162if !ok {163handler = handleUnknownSyscall164}165val, errno, flags := handler(req)166167ierr := libseccomp.NotifRespond(fd, &libseccomp.ScmpNotifResp{168ID: req.ID,169Error: errno,170Val: val,171Flags: flags,172})173if ierr != nil {174log.WithError(ierr).Error("failed to return notification response")175ec <- ierr176}177}()178}179}()180181return stp, ec182}183184func handleUnknownSyscall(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {185nme, _ := req.Data.Syscall.GetName()186log.WithField("syscall", nme).Warn("don't know how to handle this syscall")187return 0, 1, 0188}189190func Errno(err unix.Errno) (val uint64, errno int32, flags uint32) {191return ^uint64(0), int32(errno), 0192}193194// IWSClientProvider provides a client to the in-workspace-service.195// Consumers of this provider will close the client after use.196type IWSClientProvider func(ctx context.Context) (InWorkspaceServiceClient, error)197198type InWorkspaceServiceClient interface {199daemonapi.InWorkspaceServiceClient200io.Closer201}202203// InWorkspaceHandler is the seccomp notification handler that serves a Gitpod workspace204type InWorkspaceHandler struct {205FD libseccomp.ScmpFd206Daemon IWSClientProvider207Ring2PID int208Ring2Rootfs string209BindEvents chan<- BindEvent210WorkspaceId string211}212213// BindEvent describes a process binding to a socket214type BindEvent struct {215PID uint32216}217218// Mount handles mount syscalls219func (h *InWorkspaceHandler) Mount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {220log := log.WithFields(map[string]interface{}{221"syscall": "mount",222log.WorkspaceIDField: h.WorkspaceId,223"pid": req.Pid,224"id": req.ID,225})226227memFile, err := readarg.OpenMem(req.Pid)228if err != nil {229log.WithError(err).Error("cannot open mem")230return Errno(unix.EPERM)231}232defer memFile.Close()233234err = libseccomp.NotifIDValid(h.FD, req.ID)235if err != nil {236log.WithError(err).Error("invalid notify ID", req.ID)237return Errno(unix.EPERM)238}239240source, err := readarg.ReadString(memFile, int64(req.Data.Args[0]))241if err != nil {242log.WithField("arg", 0).WithError(err).Error("cannot read argument")243return Errno(unix.EFAULT)244}245dest, err := readarg.ReadString(memFile, int64(req.Data.Args[1]))246if err != nil {247log.WithField("arg", 1).WithError(err).Error("cannot read argument")248return Errno(unix.EFAULT)249}250filesystem, err := readarg.ReadString(memFile, int64(req.Data.Args[2]))251if err != nil {252log.WithField("arg", 2).WithError(err).Error("cannot read argument")253return Errno(unix.EFAULT)254}255256var args string257if len(req.Data.Args) >= 5 && filesystem == "nfs4" {258args, err = readarg.ReadString(memFile, int64(req.Data.Args[4]))259log.WithField("arg", 4).WithError(err).Error("cannot read argument")260}261262log.WithFields(map[string]interface{}{263"source": source,264"dest": dest,265"fstype": filesystem,266"args": args,267}).Info("handling mount syscall")268269if filesystem == "proc" || filesystem == "sysfs" || filesystem == "nfs4" {270// When a process wants to mount proc relative to `/proc/self` that path has no meaning outside of the processes' context.271// runc started doing this in https://github.com/opencontainers/runc/commit/0ca91f44f1664da834bc61115a849b56d22f595f272// TODO(cw): there must be a better way to handle this. Find one.273target := filepath.Join(h.Ring2Rootfs, dest)274if strings.HasPrefix(dest, "/proc/self/") {275target = filepath.Join("/proc", strconv.Itoa(int(req.Pid)), strings.TrimPrefix(dest, "/proc/self/"))276}277if strings.HasPrefix(dest, "/proc/thread-self/") {278target = filepath.Join("/proc", strconv.Itoa(int(req.Pid)), strings.TrimPrefix(dest, "/proc/thread-self/"))279}280stat, err := os.Lstat(target)281if errors.Is(err, fs.ErrNotExist) {282err = os.MkdirAll(target, 0755)283}284if err != nil {285log.WithField("target", target).WithField("dest", dest).WithError(err).Error("cannot stat mountpoint")286return Errno(unix.EFAULT)287}288if stat != nil {289if stat.Mode()&os.ModeSymlink != 0 {290// The symlink is already expressed relative to the ring2 mount namespace, no need to faff with the rootfs paths.291// In case this was a /proc relative symlink, we'll have that symlink resolved here, hence make it work in the mount namespace of ring2.292dest, err = os.Readlink(target)293if err != nil {294log.WithField("target", target).WithField("dest", dest).WithError(err).Errorf("cannot resolve %s mount target symlink", filesystem)295return Errno(unix.EFAULT)296}297} else if stat.Mode()&os.ModeDir == 0 {298log.WithField("target", target).WithField("dest", dest).WithField("mode", stat.Mode()).WithError(err).Errorf("%s must be mounted on an ordinary directory", filesystem)299return Errno(unix.EPERM)300}301}302303wait := iwsBackoffInitialWait304for i := 0; i < iwsBackoffSteps; i++ {305err = func() error {306ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)307defer cancel()308iws, err := h.Daemon(ctx)309if err != nil {310log.WithField("target", target).WithField("dest", dest).WithField("mode", stat.Mode()).WithError(err).Errorf("cannot get IWS client to mount %s", filesystem)311return err312}313defer iws.Close()314315call := iws.MountProc316if filesystem == "sysfs" {317call = iws.MountSysfs318}319320if filesystem == "sysfs" || filesystem == "proc" {321_, err = call(ctx, &daemonapi.MountProcRequest{322Target: dest,323Pid: int64(req.Pid),324})325} else if filesystem == "nfs4" {326_, err = iws.MountNfs(ctx, &daemonapi.MountNfsRequest{327Source: source,328Target: dest,329Args: args,330Pid: int64(req.Pid),331})332}333334if err != nil {335log.WithField("target", dest).WithError(err).Errorf("cannot mount %s", filesystem)336return err337}338return nil339}()340if err != nil {341time.Sleep(wait)342wait = wait * iwsBackoffFactor343if wait > iwsBackoffMaxWait {344wait = iwsBackoffMaxWait345}346} else {347break348}349350}351if err != nil {352// We've already logged the reason above353return Errno(unix.EFAULT)354}355356return 0, 0, 0357}358359// let the kernel do the work360return 0, 0, libseccomp.NotifRespFlagContinue361}362363// Umount handles umount and umount2 syscalls364func (h *InWorkspaceHandler) Umount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {365nme, _ := req.Data.Syscall.GetName()366log := log.WithFields(map[string]interface{}{367"syscall": nme,368log.WorkspaceIDField: h.WorkspaceId,369"pid": req.Pid,370"id": req.ID,371})372373memFile, err := readarg.OpenMem(req.Pid)374if err != nil {375log.WithError(err).Error("cannot open mem")376return Errno(unix.EPERM)377}378defer memFile.Close()379380target, err := readarg.ReadString(memFile, int64(req.Data.Args[0]))381if err != nil {382log.WithField("arg", 0).WithError(err).Error("cannot read argument")383return Errno(unix.EFAULT)384}385target = strings.TrimSuffix(target, "/")386387fd, err := os.Open(fmt.Sprintf("/proc/%d/mountinfo", req.Pid))388if err != nil {389log.WithError(err).Error("cannot read mountinfo")390return Errno(unix.EFAULT)391}392defer fd.Close()393mnts, err := mountinfo.GetMountsFromReader(fd, func(i *mountinfo.Info) (skip bool, stop bool) { return false, false })394if err != nil {395log.WithError(err).Error("cannot parse mountinfo")396return Errno(unix.EFAULT)397}398399procMounts := make(map[string]struct{})400for _, mnt := range mnts {401if mnt.FSType == "proc" {402procMounts[mnt.Mountpoint] = struct{}{}403}404}405406if _, ok := procMounts[target]; ok {407// ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)408// defer cancel()409// _, err = h.Daemon.UmountProc(ctx, &daemonapi.UmountProcRequest{410// Target: target,411// Pid: int64(req.Pid),412// })413// if err != nil {414// log.WithError(err).Error("cannot umount proc mount")415// return Errno(unix.EFAULT)416// }417418// log.WithField("target", target).Info("umounted proc mount")419// return 0, 0, 0420421// proc umounting doesn't work yet from ws-daemon. Instead EPERM here.422// In most cases that's not a problem because in-workspace proc mounts423// usually happen within a mount namespace anyways, for which the kernel424// lazy umounts everything that's just attached within that namespace.425// TODO(cw): make proc umounting work in ws-dameon.426return Errno(unix.EPERM)427}428429var isProcMountChild bool430for procMount := range procMounts {431if strings.HasPrefix(target, procMount) {432isProcMountChild = true433break434}435}436if isProcMountChild {437log.WithField("target", target).Warn("user attempted to umount proc mask")438return Errno(unix.EPERM)439}440441// let the kernel do the work442return 0, 0, libseccomp.NotifRespFlagContinue443}444445func (h *InWorkspaceHandler) Bind(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {446log := log.WithFields(map[string]interface{}{447"syscall": "bind",448log.WorkspaceIDField: h.WorkspaceId,449"pid": req.Pid,450"id": req.ID,451})452// We want the syscall to succeed, no matter what we do in this handler.453// The Kernel will execute the syscall for us.454defer func() {455val = 0456errno = 0457flags = libseccomp.NotifRespFlagContinue458}()459460memFile, err := readarg.OpenMem(req.Pid)461if err != nil {462log.WithError(err).Error("cannot open mem")463return464}465defer memFile.Close()466467// TODO(cw): find why this breaks468// err = libseccomp.NotifIDValid(fd, req.ID)469// if err != nil {470// log.WithError(err).Error("invalid notif ID")471// return returnErrno(unix.EPERM)472// }473474evt := BindEvent{PID: req.Pid}475select {476case h.BindEvents <- evt:477default:478}479480// socketFdB, err := readarg.ReadBytes(memFile, int64(req.Data.Args[0]), int(req.Data.Args[1]-req.Data.Args[0]))481// if err != nil {482// log.WithError(err).Error("cannot read socketfd arg")483// }484485// socketfd := nativeEndian.Uint64(socketFdB)486// unix.Getsockname()487488return489}490491func (h *InWorkspaceHandler) Chown(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) {492log := log.WithFields(map[string]interface{}{493"syscall": "chown",494log.WorkspaceIDField: h.WorkspaceId,495"pid": req.Pid,496"id": req.ID,497})498499memFile, err := readarg.OpenMem(req.Pid)500if err != nil {501log.WithError(err).Error("cannot open mem")502return503}504defer memFile.Close()505506pth, err := readarg.ReadString(memFile, int64(req.Data.Args[0]))507if err != nil {508log.WithError(err).Error("cannot open mem")509return510}511512if strings.HasPrefix(pth, "/dev/pts") {513return 0, 0, 0514}515516return 0, 0, libseccomp.NotifRespFlagContinue517}518519/*520var nativeEndian binary.ByteOrder521522func init() {523buf := [2]byte{}524*(*uint16)(unsafe.Pointer(&buf[0])) = uint16(0xABCD)525526switch buf {527case [2]byte{0xCD, 0xAB}:528nativeEndian = binary.LittleEndian529case [2]byte{0xAB, 0xCD}:530nativeEndian = binary.BigEndian531default:532panic("Could not determine native endianness.")533}534}535*/536537538