Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/components/ws-manager-bridge/src/workspace-instance-controller.ts
2498 views
1
/**
2
* Copyright (c) 2022 Gitpod GmbH. All rights reserved.
3
* Licensed under the GNU Affero General Public License (AGPL).
4
* See License.AGPL.txt in the project root for license information.
5
*/
6
7
import { TraceContext } from "@gitpod/gitpod-protocol/lib/util/tracing";
8
import { GetWorkspacesRequest } from "@gitpod/ws-manager/lib";
9
import { Disposable, DisposableCollection, RunningWorkspaceInfo, WorkspaceInstance } from "@gitpod/gitpod-protocol";
10
import { inject, injectable } from "inversify";
11
import { Configuration } from "./config";
12
import { log, LogContext } from "@gitpod/gitpod-protocol/lib/util/logging";
13
import { Metrics } from "./metrics";
14
import { WorkspaceDB } from "@gitpod/gitpod-db/lib/workspace-db";
15
import { DBWithTracing, TracedUserDB, TracedWorkspaceDB } from "@gitpod/gitpod-db/lib/traced-db";
16
import { UserDB } from "@gitpod/gitpod-db/lib/user-db";
17
import { IAnalyticsWriter } from "@gitpod/gitpod-protocol/lib/analytics";
18
import { ClientProvider } from "./wsman-subscriber";
19
import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat";
20
import { PrebuildUpdater } from "./prebuild-updater";
21
import { RedisPublisher } from "@gitpod/gitpod-db/lib";
22
import { durationLongerThanSeconds } from "@gitpod/gitpod-protocol/lib/util/timeutil";
23
import { scrubber } from "@gitpod/gitpod-protocol/lib/util/scrubbing";
24
25
/**
 * Injection identifier for {@link WorkspaceInstanceController}.
 *
 * The symbol deliberately shares its name with the interface below so the same identifier can be used
 * both as a value (e.g. in `@inject(...)`) and as a type.
 */
export const WorkspaceInstanceController = Symbol("WorkspaceInstanceController");

/**
 * Contract of the component that reconciles not-yet-stopped workspace instances stored in the database.
 *
 * Implementations are {@link Disposable}.
 */
export interface WorkspaceInstanceController extends Disposable {
    /**
     * Starts controlling the instances of the given workspace cluster.
     *
     * @param workspaceClusterName name of the workspace cluster whose instances are controlled
     * @param clientProvider factory yielding the ws-manager client to query
     * @param controllerIntervalSeconds interval, in seconds, between two control runs
     * @param controllerMaxDisconnectSeconds maximum tolerated ws-manager disconnect period, in seconds
     */
    start(
        workspaceClusterName: string,
        clientProvider: ClientProvider,
        controllerIntervalSeconds: number,
        controllerMaxDisconnectSeconds: number,
    ): void;

    /**
     * Applies timeout handling to the given not-stopped instances.
     *
     * @param parentCtx trace context to attach tracing to
     * @param runningInstances the not-stopped instances (with their workspaces) to check
     * @param workspaceClusterName name of the cluster the instances belong to
     */
    controlNotStoppedAppClusterManagedInstanceTimeouts(
        parentCtx: TraceContext,
        runningInstances: RunningWorkspaceInfo[],
        workspaceClusterName: string,
    ): Promise<void>;

    /**
     * Hook invoked for an instance that has been stopped.
     *
     * @param ctx trace context to attach tracing to
     * @param ownerUserID id of the user owning the workspace
     * @param instance the stopped workspace instance
     */
    onStopped(ctx: TraceContext, ownerUserID: string, instance: WorkspaceInstance): Promise<void>;
}
43
44
/**
 * This class is responsible for controlling the WorkspaceInstances that are not stopped and for ensuring that
 * their actual state is properly reflected in the database, eventually.
 *
 * !!! It's stateful, so make sure it's bound in transient mode !!!
 */
@injectable()
export class WorkspaceInstanceControllerImpl implements WorkspaceInstanceController, Disposable {
    constructor(
        @inject(Configuration) private readonly config: Configuration,
        @inject(Metrics) private readonly prometheusExporter: Metrics,
        @inject(TracedWorkspaceDB) private readonly workspaceDB: DBWithTracing<WorkspaceDB>,
        @inject(TracedUserDB) private readonly userDB: DBWithTracing<UserDB>,
        @inject(PrebuildUpdater) private readonly prebuildUpdater: PrebuildUpdater,
        @inject(IAnalyticsWriter) private readonly analytics: IAnalyticsWriter,
        @inject(RedisPublisher) private readonly publisher: RedisPublisher,
    ) {}

    /** Collects everything registered by `start`; released again in `dispose`. */
    protected readonly disposables = new DisposableCollection();

    /**
     * Registers a repeating control run for the given workspace cluster.
     *
     * Each run loads the not-stopped instances of the cluster from the DB, reconciles them against ws-manager
     * and then applies the app-cluster-side phase timeouts. Errors never escape a run: they are logged and
     * recorded on the run's tracing span.
     *
     * @param workspaceClusterName name of the workspace cluster to control
     * @param clientProvider factory yielding the ws-manager client
     * @param controllerIntervalSeconds interval between two runs, in seconds
     * @param controllerMaxDisconnectSeconds how long ws-manager control errors are tolerated silently, in seconds
     */
    start(
        workspaceClusterName: string,
        clientProvider: ClientProvider,
        controllerIntervalSeconds: number,
        controllerMaxDisconnectSeconds: number,
    ): void {
        // Timestamp (ms) of the first failed ws-manager control run of the current failure streak.
        // Number.MAX_SAFE_INTEGER is the sentinel for "no failure streak in progress".
        let disconnectStarted = Number.MAX_SAFE_INTEGER;
        this.disposables.push(
            repeat(async () => {
                const span = TraceContext.startSpan("controlInstances");
                const ctx = { span };
                try {
                    log.debug("Controlling instances...", { workspaceClusterName });

                    const nonStoppedInstances = await this.workspaceDB
                        .trace(ctx)
                        .findRunningInstancesWithWorkspaces(workspaceClusterName, undefined, true);

                    // Control running workspace instances against ws-manager
                    try {
                        await this.controlNonStoppedWSManagerManagedInstances(
                            ctx,
                            workspaceClusterName,
                            nonStoppedInstances,
                            clientProvider,
                            this.config.timeouts.pendingPhaseSeconds,
                            this.config.timeouts.stoppingPhaseSeconds,
                        );

                        disconnectStarted = Number.MAX_SAFE_INTEGER; // Reset disconnect period
                    } catch (err) {
                        // NOTE(review): presumably durationLongerThanSeconds(t, s) is true once more than `s`
                        // seconds have passed since `t` - confirm against timeutil. Failures are only logged
                        // once that check passes; before that, the start of the streak is merely recorded.
                        if (durationLongerThanSeconds(disconnectStarted, controllerMaxDisconnectSeconds)) {
                            log.warn("Error while controlling workspace cluster's workspaces", err, {
                                workspaceClusterName,
                            });
                        } else if (disconnectStarted > Date.now()) {
                            // Still holding the sentinel: this is the first failure of a new streak.
                            disconnectStarted = Date.now();
                        }
                    }

                    // Control workspace instances against timeouts
                    await this.controlNotStoppedAppClusterManagedInstanceTimeouts(
                        ctx,
                        nonStoppedInstances,
                        workspaceClusterName,
                    );

                    log.debug("Done controlling instances.", { workspaceClusterName });
                } catch (err) {
                    TraceContext.setError(ctx, err);
                    log.error("Error while controlling workspace cluster's workspaces", err, {
                        workspaceClusterName,
                    });
                } finally {
                    span.finish();
                }
            }, controllerIntervalSeconds * 1000),
        );
    }

    /**
     * This method controls all instances that we have currently marked as not stopped in the DB.
     * It checks whether ws-manager still knows about them, and if not, marks them as stopped in the DB
     * (immediately for phase "running", after the given timeouts for "pending" and "stopping").
     *
     * @param parentCtx trace context to create the method's span under
     * @param workspaceClusterName cluster name, used for logging only
     * @param runningInstances the not-stopped instances as found in the DB
     * @param clientProvider factory yielding the ws-manager client
     * @param pendingPhaseSeconds timeout for phase "pending", checked against the instance's creationTime
     * @param stoppingPhaseSeconds timeout for phase "stopping", checked against the instance's stoppingTime
     * @throws rethrows any error (after recording it on the span); the caller relies on this to detect
     *         ws-manager disconnects
     */
    protected async controlNonStoppedWSManagerManagedInstances(
        parentCtx: TraceContext,
        workspaceClusterName: string,
        runningInstances: RunningWorkspaceInfo[],
        clientProvider: ClientProvider,
        pendingPhaseSeconds: number,
        stoppingPhaseSeconds: number,
    ): Promise<void> {
        const span = TraceContext.startSpan("controlNonStoppedWSManagerManagedInstances", parentCtx);
        const ctx = { span };
        try {
            log.debug("Controlling ws-manager managed instances...", { workspaceClusterName });

            // Index the DB view by instance id, then drop every instance ws-manager reports.
            const runningInstancesIdx = new Map<string, RunningWorkspaceInfo>();
            for (const info of runningInstances) {
                runningInstancesIdx.set(info.latestInstance.id, info);
            }

            const client = await clientProvider();
            const actuallyRunningInstances = await client.getWorkspaces(ctx, new GetWorkspacesRequest());
            actuallyRunningInstances.getStatusList().forEach((s) => runningInstancesIdx.delete(s.getId()));

            // runningInstancesIdx only contains instances that ws-manager is not aware of
            for (const [instanceId, ri] of runningInstancesIdx.entries()) {
                const instance = ri.latestInstance;
                const phase = instance.status.phase;

                // When ws-manager is not aware of the following instances outside of the timeout duration,
                // they should be marked as stopped.
                // "pending" times out pendingPhaseSeconds after creationTime.
                // "stopping" times out stoppingPhaseSeconds after stoppingTime.
                if (
                    phase === "running" ||
                    (phase === "pending" &&
                        durationLongerThanSeconds(Date.parse(instance.creationTime), pendingPhaseSeconds)) ||
                    (phase === "stopping" &&
                        instance.stoppingTime &&
                        durationLongerThanSeconds(Date.parse(instance.stoppingTime), stoppingPhaseSeconds))
                ) {
                    log.info(
                        { instanceId, workspaceId: instance.workspaceId },
                        "Database says the instance is present, but ws-man does not know about it. Marking as stopped in database.",
                        { workspaceClusterName, phase },
                    );
                    await this.markWorkspaceInstanceAsStopped(ctx, ri, new Date());
                    continue;
                }

                log.debug({ instanceId }, "Skipping instance", {
                    phase: phase,
                    creationTime: instance.creationTime,
                    region: instance.region,
                });
            }

            log.debug("Done controlling ws-manager managed instances.", { workspaceClusterName });
        } catch (err) {
            TraceContext.setError(ctx, err);
            throw err; // required by caller
        } finally {
            // The span is owned by this method, so it must be finished on success and on error alike.
            span.finish();
        }
    }

    /**
     * This method controls all instances of this installation during periods where ws-manager does not control them, but we have them in our DB.
     * These currently are:
     *  - preparing
     *  - building
     * It also covers these phases, as fallback, when - for whatever reason - we no longer receive updates from ws-manager.
     *  - unknown (fallback)
     *
     * All instances are checked concurrently. Errors are logged and recorded on the span, never rethrown.
     *
     * @param parentCtx trace context to create the method's span under
     * @param runningInstances the not-stopped instances to check
     * @param applicationClusterName cluster name, used for logging only
     */
    async controlNotStoppedAppClusterManagedInstanceTimeouts(
        parentCtx: TraceContext,
        runningInstances: RunningWorkspaceInfo[],
        applicationClusterName: string,
    ): Promise<void> {
        const span = TraceContext.startSpan("controlNotStoppedAppClusterManagedInstanceTimeouts", parentCtx);
        const ctx = { span };
        try {
            log.debug("Controlling app cluster managed instances...", { applicationClusterName });

            await Promise.all(
                runningInstances.map((info) => this.controlNotStoppedAppClusterManagedInstance(ctx, info)),
            );

            log.debug("Done controlling app cluster managed instances.", { applicationClusterName });
        } catch (err) {
            log.error("Error while controlling app cluster managed instances:", err, {
                applicationClusterName,
            });
            TraceContext.setError(ctx, err);
        } finally {
            span.finish();
        }
    }

    /**
     * Marks a single instance as stopped if it is in phase "preparing", "building" or "unknown" and the
     * configured timeout for that phase has elapsed since the instance's creationTime.
     *
     * Errors are logged as warnings and recorded on the span; they are not rethrown, so one failing
     * instance does not affect the handling of the others.
     *
     * @param parentCtx trace context to create the child context under
     * @param info the instance (and its workspace) to check
     */
    protected async controlNotStoppedAppClusterManagedInstance(
        parentCtx: TraceContext,
        info: RunningWorkspaceInfo,
    ): Promise<void> {
        const logContext: LogContext = {
            userId: info.workspace.ownerId,
            workspaceId: info.workspace.id,
            instanceId: info.latestInstance.id,
        };
        const ctx = TraceContext.childContext("controlNotStoppedAppClusterManagedInstance", parentCtx);
        try {
            const now = Date.now();
            const creationTime = new Date(info.latestInstance.creationTime).getTime();
            // All three timeouts are measured from the instance's creation time.
            const timedOutInPreparing = now >= creationTime + this.config.timeouts.preparingPhaseSeconds * 1000;
            const timedOutInBuilding = now >= creationTime + this.config.timeouts.buildingPhaseSeconds * 1000;
            const timedOutInUnknown = now >= creationTime + this.config.timeouts.unknownPhaseSeconds * 1000;
            const currentPhase = info.latestInstance.status.phase;

            log.debug(logContext, "Controller: Checking for instances in the DB to mark as stopped", {
                creationTime,
                timedOutInPreparing,
                currentPhase,
            });

            if (
                (currentPhase === "preparing" && timedOutInPreparing) ||
                (currentPhase === "building" && timedOutInBuilding) ||
                (currentPhase === "unknown" && timedOutInUnknown)
            ) {
                log.info(logContext, "Controller: Marking workspace instance as stopped", {
                    creationTime,
                    currentPhase,
                });
                await this.markWorkspaceInstanceAsStopped(ctx, info, new Date(now));
            }
        } catch (err) {
            log.warn(logContext, "Controller: Error while marking workspace instance as stopped", err);
            TraceContext.setError(ctx, err);
        } finally {
            ctx.span.finish();
        }
    }

    /**
     * Transitions the given instance to phase "stopped" and runs the follow-up actions.
     *
     * Mutates `info.latestInstance` in place (stoppingTime if unset, stoppedTime, status message and phase),
     * stores it, then calls `onStopped`, publishes an instance update and stops the prebuild instance.
     *
     * @param ctx trace context used for DB access and follow-up calls
     * @param info the instance (and its workspace) to mark as stopped
     * @param now timestamp to record as stoppedTime (and as stoppingTime, if none is set yet)
     */
    async markWorkspaceInstanceAsStopped(ctx: TraceContext, info: RunningWorkspaceInfo, now: Date): Promise<void> {
        const nowISO = now.toISOString();
        if (!info.latestInstance.stoppingTime) {
            info.latestInstance.stoppingTime = nowISO;
        }
        info.latestInstance.stoppedTime = nowISO;
        // Message and metric must be written before the phase is overwritten: both report the previous phase.
        info.latestInstance.status.message = `Stopped by ws-manager-bridge. Previously in phase ${info.latestInstance.status.phase}`;
        this.prometheusExporter.increaseInstanceMarkedStoppedCounter(info.latestInstance.status.phase);
        info.latestInstance.status.phase = "stopped";
        await this.workspaceDB.trace(ctx).storeInstance(info.latestInstance);

        // cleanup
        // important: call this after the DB update
        await this.onStopped(ctx, info.workspace.ownerId, info.latestInstance);

        await this.publisher.publishInstanceUpdate({
            ownerID: info.workspace.ownerId,
            instanceID: info.latestInstance.id,
            workspaceID: info.workspace.id,
        });

        await this.prebuildUpdater.stopPrebuildInstance(ctx, info.latestInstance);
    }

    /**
     * Cleanup hook for a stopped instance: deletes the owner's Gitpod tokens whose name matches
     * `<instance.id>-%` and tracks a "workspace_stopped" analytics event with scrubbed properties.
     *
     * @param ctx parent trace context
     * @param ownerUserID id of the user owning the workspace
     * @param instance the stopped instance; its stoppingTime and stoppedTime are read without a null check
     * @throws rethrows any error after recording it on the span
     */
    async onStopped(ctx: TraceContext, ownerUserID: string, instance: WorkspaceInstance): Promise<void> {
        const span = TraceContext.startSpan("onInstanceStopped", ctx);

        try {
            await this.userDB.trace({ span }).deleteGitpodTokensNamedLike(ownerUserID, `${instance.id}-%`);
            // Scrub properties that might contain sensitive data like URLs
            const scrubbedProperties = scrubber.scrub({
                instanceId: instance.id,
                workspaceId: instance.workspaceId,
                stoppingTime: new Date(instance.stoppingTime!),
                conditions: instance.status.conditions,
                timeout: instance.status.timeout,
            });

            this.analytics.track({
                userId: ownerUserID,
                event: "workspace_stopped",
                messageId: `bridge-wsstopped-${instance.id}`,
                properties: scrubbedProperties,
                timestamp: new Date(instance.stoppedTime!),
            });
        } catch (err) {
            TraceContext.setError({ span }, err);
            throw err;
        } finally {
            span.finish();
        }
    }

    /** Disposes everything registered via `start`. */
    public dispose(): void {
        this.disposables.dispose();
    }
}
318
319