// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 *
 * Lifetime: allocated on the stack of the thread calling
 * landlock_restrict_sibling_threads() and only valid for the duration of that
 * call; sibling task_works must all have signalled "all_finished" before it
 * goes out of scope.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after preparation step in restrict_one_thread.
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after commit step (used by syscall impl to wait for
	 * completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

/*
 * Per-sibling-thread work item: binds one queued task_work to its task and to
 * the shared lockstep context.  Holds a reference on @task (see
 * tsync_works_provide()/tsync_works_trim()).
 */
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this is run, the same function gets run in all other threads in the same
 * process (except for the calling thread which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread coordinate
 * through the shared ctx object to do their work in lockstep to implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials.  The commit operation can not fail any
 * more.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the same
		 * struct cred, this optimization avoids creating separate redundant
		 * credentials objects for each, which would all have the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier here, because
		 * it is required by commit_creds() and abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and coordinate
			 * with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for signal from calling thread that it's safe to read the
	 * preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
	 * kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done. */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread with the struct tsync_shared_context that was
 * attached to this work item when it was queued.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 *
 * Invariant: entries [0, size) have been handed out by tsync_works_provide()
 * and hold a task reference; entries [size, capacity) are preallocated,
 * zeroed, and unused.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity.  This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Returns:
 *   A pointer to the preallocated context struct, with task filled in.
 *
 *   NULL, if we ran out of preallocated context structs.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/**
 * tsync_works_trim - Put the last tsync_work element
 *
 * @s: TSYNC works to trim.
 *
 * Put the last task and decrement the size of @s.
 *
 * This helper does not cancel a running task, but just reset the last element
 * to zero.
 */
static void tsync_works_trim(struct tsync_works *s)
{
	struct tsync_work *ctx;

	if (WARN_ON_ONCE(s->size <= 0))
		return;

	ctx = s->works[s->size - 1];

	/*
	 * For consistency, remove the task from ctx so that it does not look like
	 * we handed it a task_work.
	 */
	put_task_struct(ctx->task);
	*ctx = (typeof(*ctx)){};

	/*
	 * Cancel the tsync_works_provide() change to recycle the reserved memory
	 * for the next thread, if any.  This also ensures that cancel_tsync_works()
	 * and tsync_works_release() do not see any NULL task pointers.
	 */
	s->size--;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed.  (size + n <= capacity)
 *
 * Returns:
 *   -ENOMEM if the (re)allocation fails
 *
 *   0 if the allocation succeeds, partially succeeds, or no reallocation
 *   was needed
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc_obj(*work, flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      const struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;

	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	for (i = 0; i < s->size; i++) {
		if (WARN_ON_ONCE(!s->works[i]->task))
			continue;

		put_task_struct(s->works[i]->task);
	}

	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);

	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 *
 * The returned count is only a snapshot: threads may be created or exit
 * immediately after the RCU read section ends, which is why the caller keeps
 * looping until no new threads are discovered.
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	const struct task_struct *caller, *thread;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen. */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 * which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * Returns:
 *   true, if at least one eligible sibling thread was found
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	const struct task_struct *caller;
	struct task_struct *thread;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work yet, and
		 * which might spawn new threads before our task work runs, so we need
		 * at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to try again with
			 * this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (unlikely(err)) {
			/*
			 * task_work_add() only fails if the task is about to exit.  We
			 * checked that earlier, but it can happen as a race.  Resume
			 * without setting an error, as the task is probably gone in the
			 * next loop iteration.
			 */
			tsync_works_trim(works);

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancel all task works where it is possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running.  If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(const struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		if (WARN_ON_ONCE(!works->works[i]->task))
			continue;

		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 * sibling threads
 *
 * @old_cred: the calling thread's credentials before the Landlock update
 * @new_cred: the tentative new credentials carrying the new Landlock domain
 *
 * Returns: 0 on success, or a negative error code; -ERESTARTNOINTR if the
 * operation was interrupted by a signal, and the syscall restart value if the
 * exec_update_lock could not be taken immediately.
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
	/* num_unfinished starts at 1 to account for the calling thread. */
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * Serialize concurrent TSYNC operations to prevent deadlocks when
	 * multiple threads call landlock_restrict_self() simultaneously.
	 * If the lock is already held, we gracefully yield by restarting the
	 * syscall.  This allows the current thread to process pending
	 * task_works before retrying.
	 */
	if (!down_write_trylock(&current->signal->exec_update_lock))
		return restart_syscall();

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads.  In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task.
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit".  This is sent by the
	 *    calling task after ensuring that all sibling threads have done
	 *    with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
	 *    or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished")
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
	 * acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - As in our case, all threads are themselves exchanging their own struct
	 *   cred through the credentials API, no locks are needed for that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads stable
	 *   between our for_each_thread loops.  If the list of available sibling
	 *   threads changes between these for_each_thread loops, we make up for
	 *   that by continuing to look for threads until they are all discovered
	 *   and have entered their task_work, where they are unable to spawn new
	 *   threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is used locally to the loop body, this use
		 * of for_each_thread().  We can reset it on each loop iteration because
		 * all previous loop iterations are done with it already.
		 *
		 * num_preparing is initialized to 1 so that the counter can not go to 0
		 * and mark the completion as done before all task works are registered.
		 * We decrement it at the end of the loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered sibling
		 * tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, to undo that we initialized it
		 * to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/* In case of interruption, we need to retry the system call. */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Opportunistic improvement: try to cancel task
				 * works for tasks that did not start running
				 * yet.  We do not have a guarantee that it
				 * cancels any of the enqueued task works
				 * because task_work_run() might already have
				 * dequeued them.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * Break the loop with error.  The cleanup code
				 * after the loop unblocks the remaining
				 * task_works.
				 */
				break;
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * We now have either (a) all sibling threads blocking and in "prepared"
	 * state in the task work, or (b) the preparation error is set.  Ask all
	 * threads to commit (or abort).
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, to undo that we initialized it to 1
	 * at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);
	up_write(&current->signal->exec_update_lock);
	return atomic_read(&shared_ctx.preparation_error);
}