Path: blob/main/components/ws-daemon/pkg/controller/workspace_operations.go
2500 views
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.1// Licensed under the GNU Affero General Public License (AGPL).2// See License.AGPL.txt in the project root for license information.34package controller56import (7"context"8"encoding/json"9"errors"10"fmt"11"io/fs"12"os"13"path/filepath"14"time"1516"github.com/gitpod-io/gitpod/common-go/log"17glog "github.com/gitpod-io/gitpod/common-go/log"18"github.com/gitpod-io/gitpod/common-go/tracing"19csapi "github.com/gitpod-io/gitpod/content-service/api"20"github.com/gitpod-io/gitpod/content-service/pkg/archive"21wsinit "github.com/gitpod-io/gitpod/content-service/pkg/initializer"22"github.com/gitpod-io/gitpod/content-service/pkg/logs"23"github.com/gitpod-io/gitpod/content-service/pkg/storage"24"github.com/gitpod-io/gitpod/ws-daemon/pkg/content"25"github.com/gitpod-io/gitpod/ws-daemon/pkg/dispatch"26"github.com/gitpod-io/gitpod/ws-daemon/pkg/internal/session"27workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"28"github.com/opentracing/opentracing-go"29"github.com/prometheus/client_golang/prometheus"30"github.com/sirupsen/logrus"31"golang.org/x/xerrors"32)3334type Metrics struct {35BackupWaitingTimeHist prometheus.Histogram36BackupWaitingTimeoutCounter prometheus.Counter37InitializerHistogram *prometheus.HistogramVec38}3940func registerConcurrentBackupMetrics(reg prometheus.Registerer, suffix string) (prometheus.Histogram, prometheus.Counter, error) {41backupWaitingTime := prometheus.NewHistogram(prometheus.HistogramOpts{42Name: "concurrent_backup_waiting_seconds" + suffix,43Help: "waiting time for concurrent backups to finish",44Buckets: []float64{5, 10, 30, 60, 120, 180, 300, 600, 1800},45})4647err := reg.Register(backupWaitingTime)48if err != nil {49return nil, nil, xerrors.Errorf("cannot register Prometheus histogram for backup waiting time: %w", err)50}5152waitingTimeoutCounter := prometheus.NewCounter(prometheus.CounterOpts{53Name: "concurrent_backup_waiting_timeout_total" + suffix,54Help: "total count of backup rate limiting timeouts",55})56err = reg.Register(waitingTimeoutCounter)57if err != nil {58return nil, nil, xerrors.Errorf("cannot register Prometheus counter for backup waiting timeouts: %w", err)59}6061return backupWaitingTime, waitingTimeoutCounter, nil62}6364//go:generate sh -c "go install github.com/golang/mock/[email protected] && mockgen -destination=mock.go -package=controller . WorkspaceOperations"65type WorkspaceOperations interface {66// InitWorkspace initializes the workspace content67InitWorkspace(ctx context.Context, options InitOptions) (*csapi.InitializerMetrics, string, error)68// BackupWorkspace backups the content of the workspace69BackupWorkspace(ctx context.Context, opts BackupOptions) (*csapi.GitStatus, error)70// DeleteWorkspace deletes the content of the workspace from disk71DeleteWorkspace(ctx context.Context, instanceID string) error72// WipeWorkspace deletes all references to the workspace. Does not fail if parts are already gone, or state is incosistent.73WipeWorkspace(ctx context.Context, instanceID string) error74// SnapshotIDs generates the name and url for a snapshot75SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error)76// Snapshot takes a snapshot of the workspace77Snapshot(ctx context.Context, instanceID, snapshotName string) (err error)78// Setup ensures that the workspace has been setup79SetupWorkspace(ctx context.Context, instanceID string, imageInfo *workspacev1.WorkspaceImageInfo) error80}8182type DefaultWorkspaceOperations struct {83config content.Config84provider *WorkspaceProvider85backupWorkspaceLimiter chan struct{}86metrics *Metrics87dispatch *dispatch.Dispatch88}8990var _ WorkspaceOperations = (*DefaultWorkspaceOperations)(nil)9192type WorkspaceMeta struct {93Owner string94WorkspaceID string95InstanceID string96}9798type InitOptions struct {99Meta WorkspaceMeta100Initializer *csapi.WorkspaceInitializer101Headless bool102StorageQuota int103}104105type BackupOptions struct {106Meta WorkspaceMeta107BackupLogs bool108UpdateGitStatus bool109SnapshotName string110SkipBackupContent bool111}112113func NewWorkspaceOperations(config content.Config, provider *WorkspaceProvider, reg prometheus.Registerer, dispatch *dispatch.Dispatch) (WorkspaceOperations, error) {114waitingTimeHist, waitingTimeoutCounter, err := registerConcurrentBackupMetrics(reg, "_mk2")115if err != nil {116return nil, err117}118119return &DefaultWorkspaceOperations{120config: config,121provider: provider,122metrics: &Metrics{123BackupWaitingTimeHist: waitingTimeHist,124BackupWaitingTimeoutCounter: waitingTimeoutCounter,125},126// we permit five concurrent backups at any given time, hence the five in the channel127backupWorkspaceLimiter: make(chan struct{}, 5),128dispatch: dispatch,129}, nil130}131132func (wso *DefaultWorkspaceOperations) InitWorkspace(ctx context.Context, options InitOptions) (*csapi.InitializerMetrics, string, error) {133ws, err := wso.provider.NewWorkspace(ctx, options.Meta.InstanceID, filepath.Join(wso.provider.Location, options.Meta.InstanceID),134wso.creator(options.Meta.Owner, options.Meta.WorkspaceID, options.Meta.InstanceID, options.Initializer, false, options.StorageQuota))135136if err != nil {137return nil, "bug: cannot add workspace to store", xerrors.Errorf("cannot add workspace to store: %w", err)138}139140rs, ok := ws.NonPersistentAttrs[session.AttrRemoteStorage].(storage.DirectAccess)141if rs == nil || !ok {142return nil, "bug: workspace has no remote storage", xerrors.Errorf("workspace has no remote storage")143}144ps, err := storage.NewPresignedAccess(&wso.config.Storage)145if err != nil {146return nil, "bug: no presigned storage available", xerrors.Errorf("no presigned storage available: %w", err)147}148149remoteContent, err := content.CollectRemoteContent(ctx, rs, ps, options.Meta.Owner, options.Initializer)150if err != nil {151return nil, "remote content error", xerrors.Errorf("remote content error: %w", err)152}153154// Initialize workspace.155// FWB workspaces initialize without the help of ws-daemon, but using their supervisor or the registry-facade.156opts := content.RunInitializerOpts{157Command: wso.config.Initializer.Command,158Args: wso.config.Initializer.Args,159// This is a bit of a hack as it makes hard assumptions about the nature of the UID mapping.160// Also, we cannot do this in wsinit because we're dropping all the privileges that would be161// required for this operation.162//163// With FWB this bit becomes unneccesary.164UID: (wsinit.GitpodUID + 100000 - 1),165GID: (wsinit.GitpodGID + 100000 - 1),166IdMappings: []archive.IDMapping{167{ContainerID: 0, HostID: wsinit.GitpodUID, Size: 1},168{ContainerID: 1, HostID: 100000, Size: 65534},169},170OWI: content.OWI{171Owner: options.Meta.Owner,172WorkspaceID: options.Meta.WorkspaceID,173InstanceID: options.Meta.InstanceID,174},175}176177err = ensureCleanSlate(ws.Location)178if err != nil {179glog.WithFields(ws.OWI()).Warnf("cannot ensure clean slate for workspace %s (this might break content init): %v", ws.InstanceID, err)180}181182stats, err := content.RunInitializer(ctx, ws.Location, options.Initializer, remoteContent, opts)183if err != nil {184glog.WithFields(ws.OWI()).Infof("error running initializer %v", err)185return nil, err.Error(), err186}187188err = ws.Persist()189if err != nil {190return nil, "cannot persist workspace", err191}192193glog.WithFields(ws.OWI()).Debug("content init done")194195return stats, "", nil196}197198func (wso *DefaultWorkspaceOperations) creator(owner, workspaceID, instanceID string, init *csapi.WorkspaceInitializer, storageDisabled bool, storageQuota int) WorkspaceFactory {199var checkoutLocation string200allLocations := csapi.GetCheckoutLocationsFromInitializer(init)201if len(allLocations) > 0 {202checkoutLocation = allLocations[0]203}204205serviceDirName := instanceID + "-daemon"206return func(ctx context.Context, location string) (res *session.Workspace, err error) {207return &session.Workspace{208Location: location,209CheckoutLocation: checkoutLocation,210CreatedAt: time.Now(),211Owner: owner,212WorkspaceID: workspaceID,213InstanceID: instanceID,214RemoteStorageDisabled: storageDisabled,215StorageQuota: storageQuota,216217ServiceLocDaemon: filepath.Join(wso.config.WorkingArea, serviceDirName),218ServiceLocNode: filepath.Join(wso.config.WorkingAreaNode, serviceDirName),219}, nil220}221}222223func (wso *DefaultWorkspaceOperations) SetupWorkspace(ctx context.Context, instanceID string, imageInfo *workspacev1.WorkspaceImageInfo) error {224ws, err := wso.provider.GetAndConnect(ctx, instanceID)225if err != nil {226return fmt.Errorf("cannot setup workspace %s: %w", instanceID, err)227}228err = wso.writeImageInfo(ctx, ws, imageInfo)229if err != nil {230glog.WithError(err).WithFields(ws.OWI()).Error("cannot write image info")231}232return nil233}234235func (wso *DefaultWorkspaceOperations) BackupWorkspace(ctx context.Context, opts BackupOptions) (*csapi.GitStatus, error) {236ws, err := wso.provider.GetAndConnect(ctx, opts.Meta.InstanceID)237if err != nil {238return nil, fmt.Errorf("cannot find workspace %s during DisposeWorkspace: %w", opts.Meta.InstanceID, err)239}240241if ws.RemoteStorageDisabled {242return nil, fmt.Errorf("workspace has no remote storage")243}244245if opts.BackupLogs {246err := wso.uploadWorkspaceLogs(ctx, opts, ws.Location)247if err != nil {248// we do not fail the workspace yet because we still might succeed with its content!249glog.WithError(err).WithFields(ws.OWI()).Error("log backup failed")250}251}252253if opts.SkipBackupContent {254return nil, nil255}256257err = wso.uploadWorkspaceContent(ctx, ws, opts.SnapshotName)258if err != nil {259glog.WithError(err).WithFields(ws.OWI()).Error("final backup failed for workspace")260return nil, fmt.Errorf("final backup failed for workspace %s", opts.Meta.InstanceID)261}262263var repo *csapi.GitStatus264if opts.UpdateGitStatus {265// Update the git status prior to deleting the workspace266repo, err = ws.UpdateGitStatus(ctx)267if err != nil {268// do not fail workspace because we were unable to get git status269// which can happen for various reasons, including user corrupting his .git folder somehow270// instead we log the error and continue cleaning up workspace271// todo(pavel): it would be great if we can somehow bubble this up to user without failing workspace272glog.WithError(err).WithFields(ws.OWI()).Warn("cannot get git status")273}274}275276return repo, nil277}278279func (wso *DefaultWorkspaceOperations) DeleteWorkspace(ctx context.Context, instanceID string) error {280ws, err := wso.provider.GetAndConnect(ctx, instanceID)281if err != nil {282return fmt.Errorf("cannot find workspace %s during DisposeWorkspace: %w", instanceID, err)283}284285if err = ws.Dispose(ctx, wso.provider.hooks[session.WorkspaceDisposed]); err != nil {286glog.WithError(err).WithFields(ws.OWI()).Error("cannot dispose session")287return err288}289290// remove workspace daemon directory in the node291if err := os.RemoveAll(ws.ServiceLocDaemon); err != nil {292glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon directory")293return err294}295wso.provider.Remove(ctx, instanceID)296297return nil298}299300func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instanceID string) error {301log := log.New().WithContext(ctx)302303ws, err := wso.provider.GetAndConnect(ctx, instanceID)304if err != nil {305// we have to assume everything is fine, and this workspace has already been completely wiped306return nil307}308log = log.WithFields(ws.OWI())309310// mark this session as being wiped311ws.DoWipe = true312313if err = ws.Dispose(ctx, wso.provider.hooks[session.WorkspaceDisposed]); err != nil {314log.WithError(err).Error("cannot dispose session")315return err316}317318// dispose all running "dispatch handlers", e.g. all code running on the "pod informer"-triggered part of ws-daemon319wso.dispatch.DisposeWorkspace(ctx, instanceID)320321// remove workspace daemon directory in the node322removedChan := make(chan struct{}, 1)323go func() {324defer close(removedChan)325326if err := os.RemoveAll(ws.ServiceLocDaemon); err != nil {327log.WithError(err).Warn("cannot delete workspace daemon directory, leaving it dangling...")328}329}()330331// We never want the "RemoveAll" to block the workspace from being delete, so we'll resort to make this a best-effort approach, and time out after 10s.332timeout := time.NewTicker(10 * time.Second)333defer timeout.Stop()334select {335case <-timeout.C:336case <-removedChan:337log.Debug("successfully removed workspace daemon directory")338}339340// remove the reference from the WorkspaceProvider, e.g. the "workspace controller" part of ws-daemon341wso.provider.Remove(ctx, instanceID)342343return nil344}345346func (wso *DefaultWorkspaceOperations) SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error) {347sess, err := wso.provider.GetAndConnect(ctx, instanceID)348if err != nil {349return "", "", fmt.Errorf("cannot find workspace %s during SnapshotName: %w", instanceID, err)350}351352baseName := fmt.Sprintf("snapshot-%d", time.Now().UnixNano())353snapshotName = baseName + ".tar"354355rs, ok := sess.NonPersistentAttrs[session.AttrRemoteStorage].(storage.DirectAccess)356if rs == nil || !ok {357return "", "", fmt.Errorf("no remote storage configured")358}359360return rs.Qualify(snapshotName), snapshotName, nil361}362363func (wso *DefaultWorkspaceOperations) Snapshot(ctx context.Context, workspaceID, snapshotName string) (err error) {364//nolint:ineffassign365span, ctx := opentracing.StartSpanFromContext(ctx, "TakeSnapshot")366span.SetTag("workspace", workspaceID)367defer tracing.FinishSpan(span, &err)368369if workspaceID == "" {370return fmt.Errorf("workspaceID is required")371}372373ws, err := wso.provider.GetAndConnect(ctx, workspaceID)374if err != nil {375return fmt.Errorf("cannot find workspace %s during DisposeWorkspace", workspaceID)376}377378if ws.RemoteStorageDisabled {379return fmt.Errorf("workspace has no remote storage")380}381382err = wso.uploadWorkspaceContent(ctx, ws, snapshotName)383if err != nil {384glog.WithError(err).WithFields(ws.OWI()).Error("snapshot failed for workspace")385return fmt.Errorf("snapshot failed for workspace %s", workspaceID)386}387388return nil389}390391func ensureCleanSlate(location string) error {392// do not remove the location itself but only393// the children394files, err := os.ReadDir(location)395if err != nil {396return err397}398399for _, f := range files {400path := filepath.Join(location, f.Name())401err = os.RemoveAll(path)402if err != nil {403return err404}405}406407return nil408}409410func (wso *DefaultWorkspaceOperations) uploadWorkspaceLogs(ctx context.Context, opts BackupOptions, location string) (err error) {411// currently we're only uploading prebuild log files412logFiles, err := logs.ListPrebuildLogFiles(ctx, location)413if err != nil {414return err415}416417rs, err := storage.NewDirectAccess(&wso.config.Storage)418if err != nil {419return xerrors.Errorf("cannot use configured storage: %w", err)420}421422err = rs.Init(ctx, opts.Meta.Owner, opts.Meta.WorkspaceID, opts.Meta.InstanceID)423if err != nil {424return xerrors.Errorf("cannot use configured storage: %w", err)425}426427err = rs.EnsureExists(ctx)428if err != nil {429return err430}431432for _, absLogPath := range logFiles {433taskID, parseErr := logs.ParseTaskIDFromPrebuildLogFilePath(absLogPath)434owi := glog.OWI(opts.Meta.Owner, opts.Meta.WorkspaceID, opts.Meta.InstanceID)435if parseErr != nil {436glog.WithError(parseErr).WithFields(owi).Warn("cannot parse headless workspace log file name")437continue438}439440err = retryIfErr(ctx, 5, glog.WithField("op", "upload log").WithFields(owi), func(ctx context.Context) (err error) {441_, _, err = rs.UploadInstance(ctx, absLogPath, logs.UploadedHeadlessLogPath(taskID))442if err != nil {443return444}445446return447})448if err != nil {449return xerrors.Errorf("cannot upload workspace logs: %w", err)450}451}452return err453}454455func (wso *DefaultWorkspaceOperations) uploadWorkspaceContent(ctx context.Context, sess *session.Workspace, backupName string) error {456// Avoid too many simultaneous backups in order to avoid excessive memory utilization.457var timedOut bool458waitStart := time.Now()459select {460case wso.backupWorkspaceLimiter <- struct{}{}:461case <-time.After(15 * time.Minute):462// we timed out on the rate limit - let's upload anyways, because we don't want to actually block463// an upload. If we reach this point, chances are other things are broken. No upload should ever464// take this long.465timedOut = true466wso.metrics.BackupWaitingTimeoutCounter.Inc()467}468469waitTime := time.Since(waitStart)470wso.metrics.BackupWaitingTimeHist.Observe(waitTime.Seconds())471472defer func() {473// timeout -> we did not add to the limiter474if timedOut {475return476}477478<-wso.backupWorkspaceLimiter479}()480481var (482loc = sess.Location483opts []storage.UploadOption484)485486err := os.Remove(filepath.Join(sess.Location, wsinit.WorkspaceReadyFile))487if err != nil && !errors.Is(err, fs.ErrNotExist) {488// We'll still upload the backup, well aware that the UX during restart will be broken.489// But it's better to have a backup with all files (albeit one too many), than having no backup at all.490glog.WithError(err).WithFields(sess.OWI()).Warn("cannot remove workspace ready file")491}492493rs, ok := sess.NonPersistentAttrs[session.AttrRemoteStorage].(storage.DirectAccess)494if rs == nil || !ok {495return xerrors.Errorf("no remote storage configured")496}497498var (499tmpf *os.File500tmpfSize int64501)502503defer func() {504if tmpf != nil {505os.Remove(tmpf.Name())506}507}()508509err = retryIfErr(ctx, wso.config.Backup.Attempts, glog.WithFields(sess.OWI()).WithField("op", "create archive"), func(ctx context.Context) (err error) {510tmpf, err = os.CreateTemp(wso.config.TmpDir, fmt.Sprintf("wsbkp-%s-*.tar", sess.InstanceID))511if err != nil {512return513}514515defer func() {516tmpf.Close()517if err != nil {518os.Remove(tmpf.Name())519}520}()521522var opts []archive.TarOption523opts = append(opts)524mappings := []archive.IDMapping{525{ContainerID: 0, HostID: wsinit.GitpodUID, Size: 1},526{ContainerID: 1, HostID: 100000, Size: 65534},527}528opts = append(opts,529archive.WithUIDMapping(mappings),530archive.WithGIDMapping(mappings),531)532533err = content.BuildTarbal(ctx, loc, tmpf.Name(), opts...)534if err != nil {535return536}537err = tmpf.Sync()538if err != nil {539return540}541_, err = tmpf.Seek(0, 0)542if err != nil {543return544}545546stat, err := tmpf.Stat()547if err != nil {548return549}550tmpfSize = stat.Size()551glog.WithField("size", tmpfSize).WithField("location", tmpf.Name()).WithFields(sess.OWI()).Debug("created temp file for workspace backup upload")552553return554})555if err != nil {556return xerrors.Errorf("cannot create archive: %w", err)557}558559err = retryIfErr(ctx, wso.config.Backup.Attempts, glog.WithFields(sess.OWI()).WithField("op", "upload layer"), func(ctx context.Context) (err error) {560_, _, err = rs.Upload(ctx, tmpf.Name(), backupName, opts...)561if err != nil {562return563}564565return566})567if err != nil {568return xerrors.Errorf("cannot upload workspace content: %w", err)569}570571return nil572}573574func (wso *DefaultWorkspaceOperations) writeImageInfo(_ context.Context, ws *session.Workspace, imageInfo *workspacev1.WorkspaceImageInfo) error {575if imageInfo == nil {576return nil577}578579b, err := json.Marshal(imageInfo)580if err != nil {581return fmt.Errorf("cannot marshal image info: %w", err)582}583uid := (wsinit.GitpodUID + 100000 - 1)584gid := (wsinit.GitpodGID + 100000 - 1)585fp := filepath.Join(ws.Location, ".gitpod/image")586err = os.WriteFile(fp, b, 0644)587if err != nil {588return fmt.Errorf("cannot write image info: %w", err)589}590os.Chown(fp, uid, gid)591return nil592}593594func retryIfErr(ctx context.Context, attempts int, log *logrus.Entry, op func(ctx context.Context) error) (err error) {595//nolint:ineffassign596span, ctx := opentracing.StartSpanFromContext(ctx, "retryIfErr")597defer tracing.FinishSpan(span, &err)598for k, v := range log.Data {599span.LogKV(k, v)600}601602if attempts == 0 {603attempts = 1604}605606backoff := 1 * time.Second607for i := 0; i < attempts; i++ {608span.LogKV("attempt", i)609610if cerr := ctx.Err(); cerr != nil {611return cerr612}613614bctx, cancel := context.WithCancel(ctx)615err = op(bctx)616cancel()617if err == nil {618break619}620621log.WithError(err).Error("op failed")622span.LogKV("error", err.Error())623if i < attempts-1 {624log.WithField("backoff", backoff.String()).Debug("retrying op after backoff")625if cerr := ctx.Err(); cerr != nil {626return cerr627}628time.Sleep(backoff)629backoff = 2 * backoff630}631}632return633}634635636