// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 *
 * Lifetime: allocated on the stack of the thread calling
 * landlock_restrict_sibling_threads() and only valid for the duration of that
 * call; sibling task_works must all have signalled "all_finished" before it
 * goes out of scope.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after preparation step in restrict_one_thread.
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after commit step (used by syscall impl to wait for
	 * completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

/*
 * Per-sibling-thread work item: binds one queued task_work to its task and to
 * the shared lockstep context.  Holds a reference on @task (see
 * tsync_works_provide()/tsync_works_trim()).
 */
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this is run, the same function gets run in all other threads in the same
 * process (except for the calling thread which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread coordinate
 * through the shared ctx object to do their work in lockstep to implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials.  The commit operation can not fail any
 * more.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the same
		 * struct cred, this optimization avoids creating separate redundant
		 * credentials objects for each, which would all have the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier here, because
		 * it is required by commit_creds() and abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and coordinate
			 * with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for signal from calling thread that it's safe to read the
	 * preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
	 * kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done. */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread with the struct tsync_shared_context that was
 * attached to this work item when it was queued.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 *
 * Invariant: entries [0, size) have been handed out by tsync_works_provide()
 * and hold a task reference; entries [size, capacity) are preallocated,
 * zeroed, and unused.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity.  This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Returns:
 *   A pointer to the preallocated context struct, with task filled in.
 *
 *   NULL, if we ran out of preallocated context structs.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/**
 * tsync_works_trim - Put the last tsync_work element
 *
 * @s: TSYNC works to trim.
 *
 * Put the last task and decrement the size of @s.
 *
 * This helper does not cancel a running task, but just reset the last element
 * to zero.
 */
static void tsync_works_trim(struct tsync_works *s)
{
	struct tsync_work *ctx;

	if (WARN_ON_ONCE(s->size <= 0))
		return;

	ctx = s->works[s->size - 1];

	/*
	 * For consistency, remove the task from ctx so that it does not look like
	 * we handed it a task_work.
	 */
	put_task_struct(ctx->task);
	*ctx = (typeof(*ctx)){};

	/*
	 * Cancel the tsync_works_provide() change to recycle the reserved memory
	 * for the next thread, if any.  This also ensures that cancel_tsync_works()
	 * and tsync_works_release() do not see any NULL task pointers.
	 */
	s->size--;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed.  (size + n <= capacity)
 *
 * Returns:
 *   -ENOMEM if the (re)allocation fails
 *
 *   0 if the allocation succeeds, partially succeeds, or no reallocation
 *   was needed
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc_obj(*work, flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      const struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;

	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	for (i = 0; i < s->size; i++) {
		if (WARN_ON_ONCE(!s->works[i]->task))
			continue;

		put_task_struct(s->works[i]->task);
	}

	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);

	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 *
 * The returned count is only a snapshot: threads may be created or exit
 * immediately after the RCU read section ends, which is why the caller keeps
 * looping until no new threads are discovered.
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	const struct task_struct *caller, *thread;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen. */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 * which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * Returns:
 *   true, if at least one eligible sibling thread was found
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	const struct task_struct *caller;
	struct task_struct *thread;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work yet, and
		 * which might spawn new threads before our task work runs, so we need
		 * at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to try again with
			 * this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (unlikely(err)) {
			/*
			 * task_work_add() only fails if the task is about to exit.  We
			 * checked that earlier, but it can happen as a race.  Resume
			 * without setting an error, as the task is probably gone in the
			 * next loop iteration.
			 */
			tsync_works_trim(works);

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancel all task works where it is possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running.  If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(const struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		if (WARN_ON_ONCE(!works->works[i]->task))
			continue;

		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 * sibling threads
 *
 * @old_cred: the calling thread's credentials before the Landlock update
 * @new_cred: the tentative new credentials carrying the new Landlock domain
 *
 * Returns: 0 on success, or a negative error code; -ERESTARTNOINTR if the
 * operation was interrupted by a signal, and the syscall restart value if the
 * exec_update_lock could not be taken immediately.
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
	/* num_unfinished starts at 1 to account for the calling thread. */
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * Serialize concurrent TSYNC operations to prevent deadlocks when
	 * multiple threads call landlock_restrict_self() simultaneously.
	 * If the lock is already held, we gracefully yield by restarting the
	 * syscall.  This allows the current thread to process pending
	 * task_works before retrying.
	 */
	if (!down_write_trylock(&current->signal->exec_update_lock))
		return restart_syscall();

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads.  In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task.
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit".  This is sent by the
	 *    calling task after ensuring that all sibling threads have done
	 *    with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
	 *    or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished")
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
	 * acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - As in our case, all threads are themselves exchanging their own struct
	 *   cred through the credentials API, no locks are needed for that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads stable
	 *   between our for_each_thread loops.  If the list of available sibling
	 *   threads changes between these for_each_thread loops, we make up for
	 *   that by continuing to look for threads until they are all discovered
	 *   and have entered their task_work, where they are unable to spawn new
	 *   threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is used locally to the loop body, this use
		 * of for_each_thread().  We can reset it on each loop iteration because
		 * all previous loop iterations are done with it already.
		 *
		 * num_preparing is initialized to 1 so that the counter can not go to 0
		 * and mark the completion as done before all task works are registered.
		 * We decrement it at the end of the loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered sibling
		 * tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, to undo that we initialized it
		 * to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/* In case of interruption, we need to retry the system call. */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Opportunistic improvement: try to cancel task
				 * works for tasks that did not start running
				 * yet.  We do not have a guarantee that it
				 * cancels any of the enqueued task works
				 * because task_work_run() might already have
				 * dequeued them.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * Break the loop with error.  The cleanup code
				 * after the loop unblocks the remaining
				 * task_works.
				 */
				break;
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * We now have either (a) all sibling threads blocking and in "prepared"
	 * state in the task work, or (b) the preparation error is set.  Ask all
	 * threads to commit (or abort).
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, to undo that we initialized it to 1
	 * at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);
	up_write(&current->signal->exec_update_lock);
	return atomic_read(&shared_ctx.preparation_error);
}