Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/security/landlock/tsync.c
121797 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Landlock - Cross-thread ruleset enforcement
4
*
5
* Copyright © 2025 Google LLC
6
*/
7
8
#include <linux/atomic.h>
9
#include <linux/cleanup.h>
10
#include <linux/completion.h>
11
#include <linux/cred.h>
12
#include <linux/errno.h>
13
#include <linux/overflow.h>
14
#include <linux/rcupdate.h>
15
#include <linux/sched.h>
16
#include <linux/sched/signal.h>
17
#include <linux/sched/task.h>
18
#include <linux/slab.h>
19
#include <linux/task_work.h>
20
21
#include "cred.h"
22
#include "tsync.h"
23
24
/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 *
 * One instance lives on the stack of the thread that called
 * landlock_restrict_self() and is shared with every sibling thread's
 * task_work (see struct tsync_work).
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after preparation step in restrict_one_thread.
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after commit step (used by syscall impl to wait for
	 * completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};
58
59
/*
 * Per-thread bookkeeping for one scheduled task_work.
 */
struct tsync_work {
	/* Callback head handed to task_work_add(). */
	struct callback_head work;
	/* The sibling thread this work is scheduled on; a reference is held. */
	struct task_struct *task;
	/* Synchronization state shared by all threads of the process. */
	struct tsync_shared_context *shared_ctx;
};
64
65
/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this is run, the same function gets run in all other threads in the same
 * process (except for the calling thread which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread coordinate
 * through the shared ctx object to do their work in lockstep to implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials. The commit operation can not fail any
 * more.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the same
		 * struct cred, this optimization avoids creating separate redundant
		 * credentials objects for each, which would all have the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier here, because
		 * it is required by commit_creds() and abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and coordinate
			 * with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for signal from calling thread that it's safe to read the
	 * preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
	 * kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	/* Point of no return: committing the prepared creds can not fail. */
	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done. */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}
152
153
/*
154
* restrict_one_thread_callback - task_work callback for restricting a thread
155
*
156
* Calls restrict_one_thread with the struct landlock_shared_tsync_context.
157
*/
158
static void restrict_one_thread_callback(struct callback_head *work)
159
{
160
struct tsync_work *ctx = container_of(work, struct tsync_work, work);
161
162
restrict_one_thread(ctx->shared_ctx);
163
}
164
165
/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	/* Array of pointers to individually allocated tsync_work objects. */
	struct tsync_work **works;
	/* Number of entries handed out by tsync_works_provide(). */
	size_t size;
	/* Number of preallocated entries; size <= capacity. */
	size_t capacity;
};
175
176
/*
177
* tsync_works_provide - provides a preallocated tsync_work for the given task
178
*
179
* This also stores a task pointer in the context and increments the reference
180
* count of the task.
181
*
182
* This function may fail in the case where we did not preallocate sufficient
183
* capacity. This can legitimately happen if new threads get started after we
184
* grew the capacity.
185
*
186
* Returns:
187
* A pointer to the preallocated context struct, with task filled in.
188
*
189
* NULL, if we ran out of preallocated context structs.
190
*/
191
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
192
struct task_struct *task)
193
{
194
struct tsync_work *ctx;
195
196
if (s->size >= s->capacity)
197
return NULL;
198
199
ctx = s->works[s->size];
200
s->size++;
201
202
ctx->task = get_task_struct(task);
203
return ctx;
204
}
205
206
/**
207
* tsync_works_trim - Put the last tsync_work element
208
*
209
* @s: TSYNC works to trim.
210
*
211
* Put the last task and decrement the size of @s.
212
*
213
* This helper does not cancel a running task, but just reset the last element
214
* to zero.
215
*/
216
static void tsync_works_trim(struct tsync_works *s)
217
{
218
struct tsync_work *ctx;
219
220
if (WARN_ON_ONCE(s->size <= 0))
221
return;
222
223
ctx = s->works[s->size - 1];
224
225
/*
226
* For consistency, remove the task from ctx so that it does not look like
227
* we handed it a task_work.
228
*/
229
put_task_struct(ctx->task);
230
*ctx = (typeof(*ctx)){};
231
232
/*
233
* Cancel the tsync_works_provide() change to recycle the reserved memory
234
* for the next thread, if any. This also ensures that cancel_tsync_works()
235
* and tsync_works_release() do not see any NULL task pointers.
236
*/
237
s->size--;
238
}
239
240
/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed. (size + n <= capacity)
 *
 * Returns:
 * -EOVERFLOW if size + n overflows
 *
 * -ENOMEM if the (re)allocation fails, including when only part of the
 * requested contexts could be preallocated (s stays in a consistent state,
 * but the guarantee above does not hold)
 *
 * 0 if the allocation succeeds or no reallocation was needed
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

	/* Preallocate the new pointer slots [old capacity, new_capacity). */
	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc_obj(*work, flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}
288
289
/*
290
* tsync_works_contains - checks for presence of task in s
291
*/
292
static bool tsync_works_contains_task(const struct tsync_works *s,
293
const struct task_struct *task)
294
{
295
size_t i;
296
297
for (i = 0; i < s->size; i++)
298
if (s->works[i]->task == task)
299
return true;
300
301
return false;
302
}
303
304
/*
305
* tsync_works_release - frees memory held by s and drops all task references
306
*
307
* This does not free s itself, only the data structures held by it.
308
*/
309
static void tsync_works_release(struct tsync_works *s)
310
{
311
size_t i;
312
313
for (i = 0; i < s->size; i++) {
314
if (WARN_ON_ONCE(!s->works[i]->task))
315
continue;
316
317
put_task_struct(s->works[i]->task);
318
}
319
320
for (i = 0; i < s->capacity; i++)
321
kfree(s->works[i]);
322
323
kfree(s->works);
324
s->works = NULL;
325
s->size = 0;
326
s->capacity = 0;
327
}
328
329
/*
330
* count_additional_threads - counts the sibling threads that are not in works
331
*/
332
static size_t count_additional_threads(const struct tsync_works *works)
333
{
334
const struct task_struct *caller, *thread;
335
size_t n = 0;
336
337
caller = current;
338
339
guard(rcu)();
340
341
for_each_thread(caller, thread) {
342
/* Skip current, since it is initiating the sync. */
343
if (thread == caller)
344
continue;
345
346
/* Skip exited threads. */
347
if (thread->flags & PF_EXITING)
348
continue;
349
350
/* Skip threads that we have already seen. */
351
if (tsync_works_contains_task(works, thread))
352
continue;
353
354
n++;
355
}
356
return n;
357
}
358
359
/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 * which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished before the work is registered, so that the
 * counters can never drop to zero while works are still being added.
 *
 * The thread-list walk is protected by RCU.
 *
 * Returns:
 * true, if at least one eligible sibling thread was found
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	const struct task_struct *caller;
	struct task_struct *thread;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work yet, and
		 * which might spawn new threads before our task work runs, so we need
		 * at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to try again with
			 * this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (unlikely(err)) {
			/*
			 * task_work_add() only fails if the task is about to exit. We
			 * checked that earlier, but it can happen as a race. Resume
			 * without setting an error, as the task is probably gone in the
			 * next loop iteration.
			 */
			tsync_works_trim(works);

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}
435
436
/*
 * cancel_tsync_works - cancel all task works where it is possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running. If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(const struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		if (WARN_ON_ONCE(!works->works[i]->task))
			continue;

		/*
		 * task_work_cancel() returns false when the work already ran
		 * (or is running); in that case the work itself does (or did)
		 * the bookkeeping below.
		 */
		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}
466
467
/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 * sibling threads
 *
 * @old_cred: the calling thread's current credentials
 * @new_cred: the prepared credentials carrying the new Landlock domain
 *
 * Returns:
 * 0 on success, or a negative error code if any thread failed to prepare,
 * in which case no thread committed the new credentials (all-or-nothing).
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	/* num_unfinished starts at 1 to account for the calling thread. */
	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * Serialize concurrent TSYNC operations to prevent deadlocks when
	 * multiple threads call landlock_restrict_self() simultaneously.
	 * If the lock is already held, we gracefully yield by restarting the
	 * syscall. This allows the current thread to process pending
	 * task_works before retrying.
	 */
	if (!down_write_trylock(&current->signal->exec_update_lock))
		return restart_syscall();

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads. In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task.
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit". This is sent by the
	 *    calling task after ensuring that all sibling threads have done
	 *    with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
	 *    or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished")
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
	 * acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - As in our case, all threads are themselves exchanging their own struct
	 *   cred through the credentials API, no locks are needed for that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads stable
	 *   between our for_each_thread loops. If the list of available sibling
	 *   threads changes between these for_each_thread loops, we make up for
	 *   that by continuing to look for threads until they are all discovered
	 *   and have entered their task_work, where they are unable to spawn new
	 *   threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is used locally to the loop body, this use
		 * of for_each_thread(). We can reset it on each loop iteration because
		 * all previous loop iterations are done with it already.
		 *
		 * num_preparing is initialized to 1 so that the counter can not go to 0
		 * and mark the completion as done before all task works are registered.
		 * We decrement it at the end of the loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered sibling
		 * tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, to undo that we initialized it
		 * to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/* In case of interruption, we need to retry the system call. */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Opportunistic improvement: try to cancel task
				 * works for tasks that did not start running
				 * yet. We do not have a guarantee that it
				 * cancels any of the enqueued task works
				 * because task_work_run() might already have
				 * dequeued them.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * Break the loop with error. The cleanup code
				 * after the loop unblocks the remaining
				 * task_works.
				 */
				break;
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * We now have either (a) all sibling threads blocking and in "prepared"
	 * state in the task work, or (b) the preparation error is set. Ask all
	 * threads to commit (or abort).
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, to undo that we initialized it to 1
	 * at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);
	up_write(&current->signal->exec_update_lock);
	return atomic_read(&shared_ctx.preparation_error);
}
616
617