Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
google
GitHub Repository: google/crosvm
Path: blob/main/jail/src/helpers.rs
5392 views
1
// Copyright 2017 The ChromiumOS Authors
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file.
4
5
#![deny(missing_docs)]
6
#![allow(dead_code)]
7
8
use std::path::Path;
9
use std::str;
10
use std::sync::LazyLock;
11
12
use anyhow::bail;
13
use anyhow::Context;
14
use anyhow::Result;
15
#[cfg(feature = "seccomp_trace")]
16
use base::debug;
17
use base::getegid;
18
use base::geteuid;
19
#[cfg(feature = "seccomp_trace")]
20
use base::warn;
21
use libc::c_ulong;
22
use minijail::Minijail;
23
#[cfg(feature = "seccomp_trace")]
24
use static_assertions::const_assert;
25
#[cfg(feature = "seccomp_trace")]
26
use zerocopy::Immutable;
27
#[cfg(feature = "seccomp_trace")]
28
use zerocopy::IntoBytes;
29
30
use crate::config::JailConfig;
31
32
static EMBEDDED_BPFS: LazyLock<std::collections::HashMap<&str, Vec<u8>>> =
33
LazyLock::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in")));
34
35
/// Default limit on the number of open file descriptors for a jailed process.
///
/// Most devices don't need to open many fds. However, an implementation detail of minijail is
/// that after applying this limit, it opens an additional file descriptor to scan the
/// /proc/self/fd directory to choose which file descriptors to close in the child process. The
/// open files limit therefore has to be higher than the number of file descriptors that the
/// parent thread holds open before the jail is started.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 4 * 1024;
/// The max open files for gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32 * 1024;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 64 * 1024;
45
46
/// The user in the jail to run as.
///
/// The variants carry only plain integers, so the type is cheap to copy and can be compared and
/// debug-printed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user.
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid, in that order.
    ///
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
58
59
/// Config for the sandbox to be created by [Minijail].
60
pub struct SandboxConfig<'a> {
61
/// Whether or not to drop all capabilities in the sandbox.
62
pub limit_caps: bool,
63
log_failures: bool,
64
seccomp_policy_dir: Option<&'a Path>,
65
seccomp_policy_name: &'a str,
66
/// The pair of `uid_map` and `gid_map`.
67
pub ugid_map: Option<(&'a str, &'a str)>,
68
/// The remount mode instead of default MS_PRIVATE.
69
pub remount_mode: Option<c_ulong>,
70
/// Whether to use empty net namespace. Enabled by default.
71
pub namespace_net: bool,
72
/// Whether or not to configure the jail to support bind-mounts.
73
///
74
/// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
75
/// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
76
/// sandbox.
77
pub bind_mounts: bool,
78
/// Specify the user in the jail to run as.
79
pub run_as: RunAsUser,
80
}
81
82
impl<'a> SandboxConfig<'a> {
83
/// Creates [SandboxConfig].
84
pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
85
Self {
86
limit_caps: true,
87
log_failures: jail_config.seccomp_log_failures,
88
seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
89
seccomp_policy_name: policy,
90
ugid_map: None,
91
remount_mode: None,
92
namespace_net: true,
93
bind_mounts: false,
94
run_as: RunAsUser::Unspecified,
95
}
96
}
97
}
98
99
/// Wrapper that cleans up a [Minijail] when it is dropped
100
pub struct ScopedMinijail(pub Minijail);
101
102
impl Drop for ScopedMinijail {
103
fn drop(&mut self) {
104
let _ = self.0.kill();
105
}
106
}
107
108
/// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
109
/// `max_open_files` using `RLIMIT_NOFILE`.
110
///
111
/// If `root` path is "/", the minijail don't change the root.
112
///
113
/// # Arguments
114
///
115
/// * `root` - The root path to be changed to by minijail.
116
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
117
#[allow(clippy::unnecessary_cast)]
118
pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
119
// Validate new root directory. Path::is_dir() also checks the existence.
120
if !root.is_dir() {
121
bail!("{:?} is not a directory, cannot create jail", root);
122
}
123
// chroot accepts absolute path only.
124
if !root.is_absolute() {
125
bail!("{:?} is not absolute path", root);
126
}
127
128
let mut jail = Minijail::new().context("failed to jail device")?;
129
130
// Only pivot_root if we are not re-using the current root directory.
131
if root != Path::new("/") {
132
// Run in a new mount namespace.
133
jail.namespace_vfs();
134
jail.enter_pivot_root(root)
135
.context("failed to pivot root device")?;
136
}
137
138
jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
139
.context("error setting max open files")?;
140
141
Ok(jail)
142
}
143
144
/// Creates a [Minijail] instance which just invokes a jail process and sets
145
/// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
146
/// runs as a non-root user without SYS_ADMIN capabilities.
147
///
148
/// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
149
/// and `mount namespace`. So, it runs as a non-root user without
150
/// SYS_ADMIN capabilities.
151
///
152
/// Note that since there is no file system isolation provided by this function,
153
/// caller of this function should enforce other security mechanisum such as selinux
154
/// on the host to protect directories.
155
///
156
/// # Arguments
157
///
158
/// * `root` - The root path to checked before the process is jailed
159
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
160
#[allow(clippy::unnecessary_cast)]
161
pub fn create_base_minijail_without_pivot_root(
162
root: &Path,
163
max_open_files: u64,
164
) -> Result<Minijail> {
165
// Validate new root directory. Path::is_dir() also checks the existence.
166
if !root.is_dir() {
167
bail!("{:?} is not a directory, cannot create jail", root);
168
}
169
if !root.is_absolute() {
170
bail!("{:?} is not absolute path", root);
171
}
172
173
let mut jail = Minijail::new().context("failed to jail device")?;
174
jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
175
.context("error setting max open files")?;
176
177
Ok(jail)
178
}
179
180
/// Creates a [Minijail] instance which creates a sandbox.
181
///
182
/// # Arguments
183
///
184
/// * `root` - The root path to be changed to by minijail.
185
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
186
/// * `config` - The [SandboxConfig] to control details of the sandbox.
187
pub fn create_sandbox_minijail(
188
root: &Path,
189
max_open_files: u64,
190
config: &SandboxConfig,
191
) -> Result<Minijail> {
192
let mut jail = create_base_minijail(root, max_open_files)?;
193
194
jail.namespace_pids();
195
jail.namespace_user();
196
jail.namespace_user_disable_setgroups();
197
if config.limit_caps {
198
// Don't need any capabilities.
199
jail.use_caps(0);
200
}
201
match config.run_as {
202
RunAsUser::Unspecified => {
203
if config.bind_mounts && config.ugid_map.is_none() {
204
// Minijail requires to set user/group map to mount extra directories.
205
add_current_user_to_jail(&mut jail)?;
206
}
207
}
208
RunAsUser::CurrentUser => {
209
add_current_user_to_jail(&mut jail)?;
210
}
211
RunAsUser::Root => {
212
// Add the current user as root in the jail.
213
let crosvm_uid = geteuid();
214
let crosvm_gid = getegid();
215
jail.uidmap(&format!("0 {crosvm_uid} 1"))
216
.context("error setting UID map")?;
217
jail.gidmap(&format!("0 {crosvm_gid} 1"))
218
.context("error setting GID map")?;
219
}
220
RunAsUser::Specified(uid, gid) => {
221
if uid != 0 {
222
jail.change_uid(uid)
223
}
224
if gid != 0 {
225
jail.change_gid(gid)
226
}
227
}
228
}
229
if config.bind_mounts {
230
// Create a tmpfs in the device's root directory so that we can bind mount files.
231
// The size=67108864 is size=64*1024*1024 or size=64MB.
232
// TODO(b/267581374): Use appropriate size for tmpfs.
233
jail.mount_with_data(
234
Path::new("none"),
235
Path::new("/"),
236
"tmpfs",
237
(libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
238
"size=67108864",
239
)?;
240
}
241
if let Some((uid_map, gid_map)) = config.ugid_map {
242
jail.uidmap(uid_map).context("error setting UID map")?;
243
jail.gidmap(gid_map).context("error setting GID map")?;
244
}
245
// Run in a new mount namespace.
246
jail.namespace_vfs();
247
248
if config.namespace_net {
249
// Run in an empty network namespace.
250
jail.namespace_net();
251
}
252
253
// Don't allow the device to gain new privileges.
254
jail.no_new_privs();
255
256
#[cfg(feature = "seccomp_trace")]
257
{
258
#[repr(C)]
259
#[derive(Immutable, IntoBytes)]
260
struct sock_filter {
261
/* Filter block */
262
code: u16, /* Actual filter code */
263
jt: u8, /* Jump true */
264
jf: u8, /* Jump false */
265
k: u32, /* Generic multiuse field */
266
}
267
268
// BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
269
// BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
270
const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
271
const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
272
const BPF_RET: u16 = 0x06;
273
const BPF_K: u16 = 0x00;
274
275
// return SECCOMP_RET_LOG for all syscalls
276
const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
277
code: BPF_RET | BPF_K,
278
jt: 0,
279
jf: 0,
280
k: SECCOMP_RET_LOG,
281
};
282
283
warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
284
debug!(
285
"seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
286
config.seccomp_policy_name,
287
read_jail_addr(&jail),
288
);
289
jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
290
.unwrap();
291
}
292
293
#[cfg(not(feature = "seccomp_trace"))]
294
if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
295
let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
296
// By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
297
// is expected to be compiled using "trap" as the failure behavior instead of the default
298
// "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
299
// the built-in pre-compiled policies will be used.
300
// Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
301
// explanation about why the |log_failures| flag forces the use of .policy files (and the
302
// build-time alternative to this run-time flag).
303
let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
304
if bpf_policy_file.exists() && !config.log_failures {
305
jail.parse_seccomp_program(&bpf_policy_file)
306
.with_context(|| {
307
format!(
308
"failed to parse precompiled seccomp policy: {}",
309
bpf_policy_file.display()
310
)
311
})?;
312
} else {
313
// Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
314
// kill the entire device process if a worker thread commits a seccomp violation.
315
jail.set_seccomp_filter_tsync();
316
if config.log_failures {
317
jail.log_seccomp_filter_failures();
318
}
319
let bpf_policy_file = seccomp_policy_path.with_extension("policy");
320
jail.parse_seccomp_filters(&bpf_policy_file)
321
.with_context(|| {
322
format!(
323
"failed to parse seccomp policy: {}",
324
bpf_policy_file.display()
325
)
326
})?;
327
}
328
} else {
329
set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
330
}
331
332
jail.use_seccomp_filter();
333
// Don't do init setup.
334
jail.run_as_init();
335
// Set up requested remount mode instead of default MS_PRIVATE.
336
if let Some(mode) = config.remount_mode {
337
jail.set_remount_mode(mode);
338
}
339
340
Ok(jail)
341
}
342
343
/// Creates a basic [Minijail] if `jail_config` is present.
344
///
345
/// Returns `None` if `jail_config` is none.
346
pub fn simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>> {
347
if let Some(jail_config) = jail_config {
348
let config = SandboxConfig::new(jail_config, policy);
349
Ok(Some(create_sandbox_minijail(
350
&jail_config.pivot_root,
351
MAX_OPEN_FILES_DEFAULT,
352
&config,
353
)?))
354
} else {
355
Ok(None)
356
}
357
}
358
359
/// Creates [Minijail] for gpu processes.
360
pub fn create_gpu_minijail(
361
root: &Path,
362
config: &SandboxConfig,
363
render_node_only: bool,
364
snapshot_scratch_directory: Option<&Path>,
365
) -> Result<Minijail> {
366
let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
367
368
// Device nodes required for DRM.
369
let sys_dev_char_path = Path::new("/sys/dev/char");
370
jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
371
372
// Necessary for CGROUP control of the vGPU threads
373
// This is not necessary UNLESS one wants to make use
374
// of the gpu cgroup command line options.
375
let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
376
if sys_cpuset_path.exists() {
377
jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
378
}
379
380
let sys_devices_path = Path::new("/sys/devices");
381
jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
382
383
jail_mount_bind_drm(&mut jail, render_node_only)?;
384
385
// If the ARM specific devices exist on the host, bind mount them in.
386
let mali0_path = Path::new("/dev/mali0");
387
if mali0_path.exists() {
388
jail.mount_bind(mali0_path, mali0_path, true)?;
389
}
390
391
let pvr_sync_path = Path::new("/dev/pvr_sync");
392
if pvr_sync_path.exists() {
393
jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
394
}
395
396
// If the udmabuf driver exists on the host, bind mount it in.
397
let udmabuf_path = Path::new("/dev/udmabuf");
398
if udmabuf_path.exists() {
399
jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
400
}
401
402
// Libraries that are required when mesa drivers are dynamically loaded.
403
jail_mount_bind_if_exists(
404
&mut jail,
405
&[
406
"/usr/lib",
407
"/usr/lib64",
408
"/lib",
409
"/lib64",
410
"/usr/share/drirc.d",
411
"/usr/share/glvnd",
412
"/usr/share/libdrm",
413
"/usr/share/vulkan",
414
],
415
)?;
416
417
// pvr driver requires read access to /proc/self/task/*/comm.
418
mount_proc(&mut jail)?;
419
420
// To enable perfetto tracing, we need to give access to the perfetto service IPC
421
// endpoints.
422
let perfetto_path = Path::new("/run/perfetto");
423
if perfetto_path.exists() {
424
jail.mount_bind(perfetto_path, perfetto_path, true)?;
425
}
426
427
// Provide scratch space for the GPU device to build or unpack snapshots.
428
if let Some(snapshot_scratch_directory) = snapshot_scratch_directory {
429
jail.mount_with_data(
430
Path::new("none"),
431
snapshot_scratch_directory,
432
"tmpfs",
433
(libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
434
"size=4294967296",
435
)?;
436
}
437
438
Ok(jail)
439
}
440
441
/// Selectively bind mount drm nodes into `jail` based on `render_node_only`
442
///
443
/// This function will not return an error if drm nodes don't exist
444
pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
445
if render_node_only {
446
const DRM_NUM_NODES: u32 = 63;
447
const DRM_RENDER_NODE_START: u32 = 128;
448
for offset in 0..DRM_NUM_NODES {
449
let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
450
let drm_dri_path = Path::new(&path_str);
451
if !drm_dri_path.exists() {
452
break;
453
}
454
jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
455
}
456
} else {
457
let drm_dri_path = Path::new("/dev/dri");
458
if drm_dri_path.exists() {
459
jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
460
}
461
}
462
463
Ok(())
464
}
465
466
/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
467
///
468
/// This function will not return an error if any of the directories in `dirs` is missing.
469
pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
470
jail: &mut Minijail,
471
dirs: &[P],
472
) -> Result<()> {
473
for dir in dirs {
474
let dir_path = Path::new(dir);
475
if dir_path.exists() {
476
jail.mount_bind(dir_path, dir_path, false)?;
477
}
478
}
479
480
Ok(())
481
}
482
483
/// Mount proc in the sandbox.
484
pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
485
jail.mount(
486
Path::new("proc"),
487
Path::new("/proc"),
488
"proc",
489
(libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
490
)?;
491
Ok(())
492
}
493
494
/// Read minijail internal struct address for uniquely identifying and tracking jail's lifetime
///
/// The value is the first `usize`-sized word of the [Minijail] object, which is only presumed to
/// be the pointer to the C jail struct.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // We can only hope minijail's rust object will always contain a pointer to C jail struct as the
    // first field.
    const_assert!(std::mem::size_of::<Minijail>() >= std::mem::size_of::<usize>());
    // Dereferencing a `*const usize` also requires `usize` alignment, so check that at compile
    // time as well.
    const_assert!(std::mem::align_of::<Minijail>() >= std::mem::align_of::<usize>());
    // SAFETY: `jail` is a valid reference, and the static assertions above guarantee that a
    // `usize`-wide read at its address stays within the object and is suitably aligned.
    unsafe { *(jail as *const Minijail as *const usize) }
}
503
504
/// Set the uid/gid for the jailed process and give a basic id map. This is
505
/// required for bind mounts to work.
506
fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
507
let crosvm_uid = geteuid();
508
let crosvm_gid = getegid();
509
510
jail.uidmap(&format!("{crosvm_uid} {crosvm_uid} 1"))
511
.context("error setting UID map")?;
512
jail.gidmap(&format!("{crosvm_gid} {crosvm_gid} 1"))
513
.context("error setting GID map")?;
514
515
if crosvm_uid != 0 {
516
jail.change_uid(crosvm_uid);
517
}
518
if crosvm_gid != 0 {
519
jail.change_gid(crosvm_gid);
520
}
521
Ok(())
522
}
523
524
/// Set the seccomp policy for a jail from embedded bpfs
525
pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
526
let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
527
format!("failed to find embedded seccomp policy: {seccomp_policy_name}")
528
})?;
529
jail.parse_seccomp_bytes(bpf_program).with_context(|| {
530
format!("failed to parse embedded seccomp policy: {seccomp_policy_name}")
531
})?;
532
Ok(())
533
}
534
535