Path: blob/main/components/ws-manager-bridge/src/workspace-instance-controller.ts
2498 views
/**
 * Copyright (c) 2022 Gitpod GmbH. All rights reserved.
 * Licensed under the GNU Affero General Public License (AGPL).
 * See License.AGPL.txt in the project root for license information.
 */

import { TraceContext } from "@gitpod/gitpod-protocol/lib/util/tracing";
import { GetWorkspacesRequest } from "@gitpod/ws-manager/lib";
import { Disposable, DisposableCollection, RunningWorkspaceInfo, WorkspaceInstance } from "@gitpod/gitpod-protocol";
import { inject, injectable } from "inversify";
import { Configuration } from "./config";
import { log, LogContext } from "@gitpod/gitpod-protocol/lib/util/logging";
import { Metrics } from "./metrics";
import { WorkspaceDB } from "@gitpod/gitpod-db/lib/workspace-db";
import { DBWithTracing, TracedUserDB, TracedWorkspaceDB } from "@gitpod/gitpod-db/lib/traced-db";
import { UserDB } from "@gitpod/gitpod-db/lib/user-db";
import { IAnalyticsWriter } from "@gitpod/gitpod-protocol/lib/analytics";
import { ClientProvider } from "./wsman-subscriber";
import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat";
import { PrebuildUpdater } from "./prebuild-updater";
import { RedisPublisher } from "@gitpod/gitpod-db/lib";
import { durationLongerThanSeconds } from "@gitpod/gitpod-protocol/lib/util/timeutil";
import { scrubber } from "@gitpod/gitpod-protocol/lib/util/scrubbing";

export const WorkspaceInstanceController = Symbol("WorkspaceInstanceController");

export interface WorkspaceInstanceController extends Disposable {
    /**
     * Starts the periodic control loop for the given workspace cluster.
     *
     * @param workspaceClusterName name of the cluster whose non-stopped instances are controlled
     * @param clientProvider factory for the ws-manager client used to list the actually running workspaces
     * @param controllerIntervalSeconds interval between two control runs, in seconds
     * @param controllerMaxDisconnectSeconds how long errors while talking to ws-manager are tolerated
     *        before they are logged as warnings
     */
    start(
        workspaceClusterName: string,
        clientProvider: ClientProvider,
        controllerIntervalSeconds: number,
        controllerMaxDisconnectSeconds: number,
    ): void;

    /**
     * Marks instances as stopped that exceeded the configured timeout of their
     * "preparing", "building" or "unknown" phase.
     */
    controlNotStoppedAppClusterManagedInstanceTimeouts(
        parentCtx: TraceContext,
        runningInstances: RunningWorkspaceInfo[],
        workspaceClusterName: string,
    ): Promise<void>;

    /**
     * Cleanup hook to be called after an instance has been persisted as stopped.
     */
    onStopped(ctx: TraceContext, ownerUserID: string, instance: WorkspaceInstance): Promise<void>;
}

/**
 * This class is responsible for controlling the WorkspaceInstances that are not stopped and to ensure that their
 * actual state is properly reflected in the database, eventually.
 *
 * !!! It's stateful, so make sure it's bound in transient mode !!!
 */
@injectable()
export class WorkspaceInstanceControllerImpl implements WorkspaceInstanceController, Disposable {
    constructor(
        @inject(Configuration) private readonly config: Configuration,
        @inject(Metrics) private readonly prometheusExporter: Metrics,
        @inject(TracedWorkspaceDB) private readonly workspaceDB: DBWithTracing<WorkspaceDB>,
        @inject(TracedUserDB) private readonly userDB: DBWithTracing<UserDB>,
        @inject(PrebuildUpdater) private readonly prebuildUpdater: PrebuildUpdater,
        @inject(IAnalyticsWriter) private readonly analytics: IAnalyticsWriter,
        @inject(RedisPublisher) private readonly publisher: RedisPublisher,
    ) {}

    /** Holds the handle of the periodic control loop so that `dispose()` can release it. */
    protected readonly disposables = new DisposableCollection();

    /**
     * Registers a repeating task (every `controllerIntervalSeconds`) that:
     *  1. loads the non-stopped instances of `workspaceClusterName` from the DB,
     *  2. reconciles them against what ws-manager reports, and
     *  3. applies the app-cluster-side phase timeouts.
     *
     * Errors of step 2 do not abort the run; step 3 is executed regardless.
     */
    start(
        workspaceClusterName: string,
        clientProvider: ClientProvider,
        controllerIntervalSeconds: number,
        controllerMaxDisconnectSeconds: number,
    ) {
        // Timestamp (ms) of the first failed ws-manager control run of the current failure streak.
        // MAX_SAFE_INTEGER acts as the "no failure streak in progress" sentinel.
        let disconnectStarted = Number.MAX_SAFE_INTEGER;
        this.disposables.push(
            repeat(async () => {
                const span = TraceContext.startSpan("controlInstances");
                const ctx = { span };
                try {
                    log.debug("Controlling instances...", { workspaceClusterName });

                    const nonStoppedInstances = await this.workspaceDB
                        .trace(ctx)
                        .findRunningInstancesWithWorkspaces(workspaceClusterName, undefined, true);

                    // Control running workspace instances against ws-manager
                    try {
                        await this.controlNonStoppedWSManagerManagedInstances(
                            ctx,
                            workspaceClusterName,
                            nonStoppedInstances,
                            clientProvider,
                            this.config.timeouts.pendingPhaseSeconds,
                            this.config.timeouts.stoppingPhaseSeconds,
                        );

                        disconnectStarted = Number.MAX_SAFE_INTEGER; // Reset disconnect period
                    } catch (err) {
                        // Only warn once the failure streak is reported as longer than the allowed
                        // disconnect period; before that, just remember when the streak began.
                        if (durationLongerThanSeconds(disconnectStarted, controllerMaxDisconnectSeconds)) {
                            log.warn("Error while controlling workspace cluster's workspaces", err, {
                                workspaceClusterName,
                            });
                        } else if (disconnectStarted > Date.now()) {
                            disconnectStarted = Date.now();
                        }
                    }

                    // Control workspace instances against timeouts
                    await this.controlNotStoppedAppClusterManagedInstanceTimeouts(
                        ctx,
                        nonStoppedInstances,
                        workspaceClusterName,
                    );

                    log.debug("Done controlling instances.", { workspaceClusterName });
                } catch (err) {
                    TraceContext.setError(ctx, err);
                    log.error("Error while controlling workspace cluster's workspaces", err, {
                        workspaceClusterName,
                    });
                } finally {
                    span.finish();
                }
            }, controllerIntervalSeconds * 1000),
        );
    }

    /**
     * This method controls all instances that we have currently marked as "running" in the DB.
     * It checks whether they are still running with their respective ws-manager, and if not, marks them as stopped in the DB.
     *
     * @param parentCtx parent trace context
     * @param workspaceClusterName cluster name, used for logging only
     * @param runningInstances non-stopped instances as currently stored in the DB
     * @param clientProvider factory for the ws-manager client
     * @param pendingPhaseSeconds timeout applied to "pending" instances, measured from `creationTime`
     * @param stoppingPhaseSeconds timeout applied to "stopping" instances, measured from `stoppingTime`
     * @throws re-throws any error (the caller uses it to track the disconnect period)
     */
    protected async controlNonStoppedWSManagerManagedInstances(
        parentCtx: TraceContext,
        workspaceClusterName: string,
        runningInstances: RunningWorkspaceInfo[],
        clientProvider: ClientProvider,
        pendingPhaseSeconds: number,
        stoppingPhaseSeconds: number,
    ) {
        const span = TraceContext.startSpan("controlNonStoppedWSManagerManagedInstances", parentCtx);
        const ctx = { span };
        try {
            log.debug("Controlling ws-manager managed instances...", { workspaceClusterName });

            const runningInstancesIdx = new Map<string, RunningWorkspaceInfo>();
            runningInstances.forEach((i) => runningInstancesIdx.set(i.latestInstance.id, i));

            const client = await clientProvider();
            const actuallyRunningInstances = await client.getWorkspaces(ctx, new GetWorkspacesRequest());
            actuallyRunningInstances.getStatusList().forEach((s) => runningInstancesIdx.delete(s.getId()));

            // runningInstancesIdx only contains instances that ws-manager is not aware of
            for (const [instanceId, ri] of runningInstancesIdx.entries()) {
                const instance = ri.latestInstance;
                const phase = instance.status.phase;

                // When ws-manager is not aware of the following instances outside of the timeout duration,
                // they should be marked as stopped.
                // pending states time out `pendingPhaseSeconds` after creationTime.
                // stopping states time out `stoppingPhaseSeconds` after stoppingTime.
                if (
                    phase === "running" ||
                    (phase === "pending" &&
                        durationLongerThanSeconds(Date.parse(instance.creationTime), pendingPhaseSeconds)) ||
                    (phase === "stopping" &&
                        instance.stoppingTime &&
                        durationLongerThanSeconds(Date.parse(instance.stoppingTime), stoppingPhaseSeconds))
                ) {
                    log.info(
                        { instanceId, workspaceId: instance.workspaceId },
                        "Database says the instance is present, but ws-man does not know about it. Marking as stopped in database.",
                        { workspaceClusterName, phase },
                    );
                    // NOTE: an error thrown here aborts the loop and is re-thrown to the caller.
                    await this.markWorkspaceInstanceAsStopped(ctx, ri, new Date());
                    continue;
                }

                log.debug({ instanceId }, "Skipping instance", {
                    phase: phase,
                    creationTime: instance.creationTime,
                    region: instance.region,
                });
            }

            log.debug("Done controlling ws-manager managed instances.", { workspaceClusterName });
        } catch (err) {
            TraceContext.setError(ctx, err);
            throw err; // required by caller
        } finally {
            // Always close the span started above, on success as well as on error.
            span.finish();
        }
    }

    /**
     * This method controls all instances of this installation during periods where ws-manager does not control them, but we have them in our DB.
     * These currently are:
     *  - preparing
     *  - building
     * It also covers these phases, as fallback, when - for whatever reason - we no longer receive updates from ws-manager.
     *  - unknown (fallback)
     *
     * Errors are logged and recorded on the span, but not re-thrown.
     */
    async controlNotStoppedAppClusterManagedInstanceTimeouts(
        parentCtx: TraceContext,
        runningInstances: RunningWorkspaceInfo[],
        applicationClusterName: string,
    ) {
        const span = TraceContext.startSpan("controlNotStoppedAppClusterManagedInstanceTimeouts", parentCtx);
        const ctx = { span };
        try {
            log.debug("Controlling app cluster managed instances...", { applicationClusterName });

            await Promise.all(
                runningInstances.map((info) => this.controlNotStoppedAppClusterManagedInstance(ctx, info)),
            );

            log.debug("Done controlling app cluster managed instances.", { applicationClusterName });
        } catch (err) {
            log.error("Error while controlling app cluster managed instances:", err, {
                applicationClusterName,
            });
            TraceContext.setError(ctx, err);
        } finally {
            span.finish();
        }
    }

    /**
     * Checks a single instance against the configured "preparing", "building" and "unknown" phase timeouts
     * (all measured from the instance's `creationTime`) and marks it as stopped if the timeout of its
     * current phase has elapsed. Errors are logged as warnings and not re-thrown.
     */
    protected async controlNotStoppedAppClusterManagedInstance(parentCtx: TraceContext, info: RunningWorkspaceInfo) {
        const logContext: LogContext = {
            userId: info.workspace.ownerId,
            workspaceId: info.workspace.id,
            instanceId: info.latestInstance.id,
        };
        const ctx = TraceContext.childContext("controlNotStoppedAppClusterManagedInstance", parentCtx);
        try {
            const now = Date.now();
            const creationTime = new Date(info.latestInstance.creationTime).getTime();
            const timedOutInPreparing = now >= creationTime + this.config.timeouts.preparingPhaseSeconds * 1000;
            const timedOutInBuilding = now >= creationTime + this.config.timeouts.buildingPhaseSeconds * 1000;
            const timedOutInUnknown = now >= creationTime + this.config.timeouts.unknownPhaseSeconds * 1000;
            const currentPhase = info.latestInstance.status.phase;

            log.debug(logContext, "Controller: Checking for instances in the DB to mark as stopped", {
                creationTime,
                timedOutInPreparing,
                currentPhase,
            });

            if (
                (currentPhase === "preparing" && timedOutInPreparing) ||
                (currentPhase === "building" && timedOutInBuilding) ||
                (currentPhase === "unknown" && timedOutInUnknown)
            ) {
                log.info(logContext, "Controller: Marking workspace instance as stopped", {
                    creationTime,
                    currentPhase,
                });
                await this.markWorkspaceInstanceAsStopped(ctx, info, new Date(now));
            }
        } catch (err) {
            log.warn(logContext, "Controller: Error while marking workspace instance as stopped", err);
            TraceContext.setError(ctx, err);
        } finally {
            ctx.span.finish();
        }
    }

    /**
     * Marks `info.latestInstance` as stopped (mutating it in place), persists it, and then runs the
     * follow-up steps: `onStopped` cleanup, publishing the instance update and stopping the prebuild instance.
     *
     * @param ctx trace context
     * @param info the instance (plus its workspace) to mark as stopped
     * @param now timestamp used as `stoppedTime` (and as `stoppingTime` if none is set yet)
     */
    async markWorkspaceInstanceAsStopped(ctx: TraceContext, info: RunningWorkspaceInfo, now: Date) {
        const nowISO = now.toISOString();
        if (!info.latestInstance.stoppingTime) {
            info.latestInstance.stoppingTime = nowISO;
        }
        info.latestInstance.stoppedTime = nowISO;
        // The message and the metric must be recorded before the phase is overwritten below.
        info.latestInstance.status.message = `Stopped by ws-manager-bridge. Previously in phase ${info.latestInstance.status.phase}`;
        this.prometheusExporter.increaseInstanceMarkedStoppedCounter(info.latestInstance.status.phase);
        info.latestInstance.status.phase = "stopped";
        await this.workspaceDB.trace(ctx).storeInstance(info.latestInstance);

        // cleanup
        // important: call this after the DB update
        await this.onStopped(ctx, info.workspace.ownerId, info.latestInstance);

        await this.publisher.publishInstanceUpdate({
            ownerID: info.workspace.ownerId,
            instanceID: info.latestInstance.id,
            workspaceID: info.workspace.id,
        });

        await this.prebuildUpdater.stopPrebuildInstance(ctx, info.latestInstance);
    }

    /**
     * Cleanup after an instance has stopped: deletes the owner's Gitpod tokens whose name matches
     * `<instance.id>-%` and tracks a "workspace_stopped" analytics event.
     *
     * Expects `instance.stoppingTime` and `instance.stoppedTime` to be set.
     *
     * @throws re-throws any error after recording it on the span
     */
    async onStopped(ctx: TraceContext, ownerUserID: string, instance: WorkspaceInstance): Promise<void> {
        const span = TraceContext.startSpan("onInstanceStopped", ctx);

        try {
            await this.userDB.trace({ span }).deleteGitpodTokensNamedLike(ownerUserID, `${instance.id}-%`);
            // Scrub properties that might contain sensitive data like URLs
            const scrubbedProperties = scrubber.scrub({
                instanceId: instance.id,
                workspaceId: instance.workspaceId,
                stoppingTime: new Date(instance.stoppingTime!),
                conditions: instance.status.conditions,
                timeout: instance.status.timeout,
            });

            this.analytics.track({
                userId: ownerUserID,
                event: "workspace_stopped",
                messageId: `bridge-wsstopped-${instance.id}`,
                properties: scrubbedProperties,
                timestamp: new Date(instance.stoppedTime!),
            });
        } catch (err) {
            TraceContext.setError({ span }, err);
            throw err;
        } finally {
            span.finish();
        }
    }

    /** Disposes the registered control loop(s). */
    public dispose() {
        this.disposables.dispose();
    }
}