CoCalc -- helpers.rs

GitHub Repository: google/crosvm
Path: blob/main/jail/src/helpers.rs
⁵³⁹² views
1
// Copyright 2017 The ChromiumOS Authors
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file.
4

5
#![deny(missing_docs)]
6
#![allow(dead_code)]
7

8
use std::path::Path;
9
use std::str;
10
use std::sync::LazyLock;
11

12
use anyhow::bail;
13
use anyhow::Context;
14
use anyhow::Result;
15
#[cfg(feature = "seccomp_trace")]
16
use base::debug;
17
use base::getegid;
18
use base::geteuid;
19
#[cfg(feature = "seccomp_trace")]
20
use base::warn;
21
use libc::c_ulong;
22
use minijail::Minijail;
23
#[cfg(feature = "seccomp_trace")]
24
use static_assertions::const_assert;
25
#[cfg(feature = "seccomp_trace")]
26
use zerocopy::Immutable;
27
#[cfg(feature = "seccomp_trace")]
28
use zerocopy::IntoBytes;
29

30
use crate::config::JailConfig;
31

32
static EMBEDDED_BPFS: LazyLock<std::collections::HashMap<&str, Vec<u8>>> =
33
    LazyLock::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in")));
34

35
/// Default open-file limit for jailed device processes.
///
/// Most devices only need a few fds. However, minijail opens one extra file descriptor after the
/// limit has been applied so that it can scan `/proc/self/fd` and pick the descriptors to close in
/// the child process. The limit therefore has to be larger than the number of file descriptors
/// the parent thread already holds open before the jail is started.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 4 * 1024;
/// Open-file limit for gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32 * 1024;
/// Open-file limit for the jail warden, matching FD_RAW_FAILURE.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 64 * 1024;
45

46
/// Selects which user the jailed process runs as.
pub enum RunAsUser {
    /// No user is requested explicitly.
    Unspecified,
    /// Run in the jail as the same user that is currently running.
    CurrentUser,
    /// Run in the jail as the root user.
    Root,
    /// Run as the given `(uid, gid)` pair.
    ///
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
58

59
/// Config for the sandbox to be created by [Minijail].
60
pub struct SandboxConfig<'a> {
61
    /// Whether or not to drop all capabilities in the sandbox.
62
    pub limit_caps: bool,
63
    log_failures: bool,
64
    seccomp_policy_dir: Option<&'a Path>,
65
    seccomp_policy_name: &'a str,
66
    /// The pair of `uid_map` and `gid_map`.
67
    pub ugid_map: Option<(&'a str, &'a str)>,
68
    /// The remount mode instead of default MS_PRIVATE.
69
    pub remount_mode: Option<c_ulong>,
70
    /// Whether to use empty net namespace. Enabled by default.
71
    pub namespace_net: bool,
72
    /// Whether or not to configure the jail to support bind-mounts.
73
    ///
74
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
75
    /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
76
    /// sandbox.
77
    pub bind_mounts: bool,
78
    /// Specify the user in the jail to run as.
79
    pub run_as: RunAsUser,
80
}
81

82
impl<'a> SandboxConfig<'a> {
83
    /// Creates [SandboxConfig].
84
    pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
85
        Self {
86
            limit_caps: true,
87
            log_failures: jail_config.seccomp_log_failures,
88
            seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
89
            seccomp_policy_name: policy,
90
            ugid_map: None,
91
            remount_mode: None,
92
            namespace_net: true,
93
            bind_mounts: false,
94
            run_as: RunAsUser::Unspecified,
95
        }
96
    }
97
}
98

99
/// Wrapper that cleans up a [Minijail] when it is dropped
100
pub struct ScopedMinijail(pub Minijail);
101

102
impl Drop for ScopedMinijail {
103
    fn drop(&mut self) {
104
        let _ = self.0.kill();
105
    }
106
}
107

108
/// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
109
/// `max_open_files` using `RLIMIT_NOFILE`.
110
///
111
/// If `root` path is "/", the minijail don't change the root.
112
///
113
/// # Arguments
114
///
115
/// * `root` - The root path to be changed to by minijail.
116
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
117
#[allow(clippy::unnecessary_cast)]
118
pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
119
    // Validate new root directory. Path::is_dir() also checks the existence.
120
    if !root.is_dir() {
121
        bail!("{:?} is not a directory, cannot create jail", root);
122
    }
123
    // chroot accepts absolute path only.
124
    if !root.is_absolute() {
125
        bail!("{:?} is not absolute path", root);
126
    }
127

128
    let mut jail = Minijail::new().context("failed to jail device")?;
129

130
    // Only pivot_root if we are not re-using the current root directory.
131
    if root != Path::new("/") {
132
        // Run in a new mount namespace.
133
        jail.namespace_vfs();
134
        jail.enter_pivot_root(root)
135
            .context("failed to pivot root device")?;
136
    }
137

138
    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
139
        .context("error setting max open files")?;
140

141
    Ok(jail)
142
}
143

144
/// Creates a [Minijail] instance which just invokes a jail process and sets
145
/// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
146
/// runs as a non-root user without SYS_ADMIN capabilities.
147
///
148
/// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
149
/// and `mount namespace`. So, it runs as a non-root user without
150
/// SYS_ADMIN capabilities.
151
///
152
/// Note that since there is no file system isolation provided by this function,
153
/// caller of this function should enforce other security mechanisum such as selinux
154
/// on the host to protect directories.
155
///
156
/// # Arguments
157
///
158
/// * `root` - The root path to checked before the process is jailed
159
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
160
#[allow(clippy::unnecessary_cast)]
161
pub fn create_base_minijail_without_pivot_root(
162
    root: &Path,
163
    max_open_files: u64,
164
) -> Result<Minijail> {
165
    // Validate new root directory. Path::is_dir() also checks the existence.
166
    if !root.is_dir() {
167
        bail!("{:?} is not a directory, cannot create jail", root);
168
    }
169
    if !root.is_absolute() {
170
        bail!("{:?} is not absolute path", root);
171
    }
172

173
    let mut jail = Minijail::new().context("failed to jail device")?;
174
    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
175
        .context("error setting max open files")?;
176

177
    Ok(jail)
178
}
179

180
/// Creates a [Minijail] instance which creates a sandbox.
181
///
182
/// # Arguments
183
///
184
/// * `root` - The root path to be changed to by minijail.
185
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
186
/// * `config` - The [SandboxConfig] to control details of the sandbox.
187
pub fn create_sandbox_minijail(
188
    root: &Path,
189
    max_open_files: u64,
190
    config: &SandboxConfig,
191
) -> Result<Minijail> {
192
    let mut jail = create_base_minijail(root, max_open_files)?;
193

194
    jail.namespace_pids();
195
    jail.namespace_user();
196
    jail.namespace_user_disable_setgroups();
197
    if config.limit_caps {
198
        // Don't need any capabilities.
199
        jail.use_caps(0);
200
    }
201
    match config.run_as {
202
        RunAsUser::Unspecified => {
203
            if config.bind_mounts && config.ugid_map.is_none() {
204
                // Minijail requires to set user/group map to mount extra directories.
205
                add_current_user_to_jail(&mut jail)?;
206
            }
207
        }
208
        RunAsUser::CurrentUser => {
209
            add_current_user_to_jail(&mut jail)?;
210
        }
211
        RunAsUser::Root => {
212
            // Add the current user as root in the jail.
213
            let crosvm_uid = geteuid();
214
            let crosvm_gid = getegid();
215
            jail.uidmap(&format!("0 {crosvm_uid} 1"))
216
                .context("error setting UID map")?;
217
            jail.gidmap(&format!("0 {crosvm_gid} 1"))
218
                .context("error setting GID map")?;
219
        }
220
        RunAsUser::Specified(uid, gid) => {
221
            if uid != 0 {
222
                jail.change_uid(uid)
223
            }
224
            if gid != 0 {
225
                jail.change_gid(gid)
226
            }
227
        }
228
    }
229
    if config.bind_mounts {
230
        // Create a tmpfs in the device's root directory so that we can bind mount files.
231
        // The size=67108864 is size=64*1024*1024 or size=64MB.
232
        // TODO(b/267581374): Use appropriate size for tmpfs.
233
        jail.mount_with_data(
234
            Path::new("none"),
235
            Path::new("/"),
236
            "tmpfs",
237
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
238
            "size=67108864",
239
        )?;
240
    }
241
    if let Some((uid_map, gid_map)) = config.ugid_map {
242
        jail.uidmap(uid_map).context("error setting UID map")?;
243
        jail.gidmap(gid_map).context("error setting GID map")?;
244
    }
245
    // Run in a new mount namespace.
246
    jail.namespace_vfs();
247

248
    if config.namespace_net {
249
        // Run in an empty network namespace.
250
        jail.namespace_net();
251
    }
252

253
    // Don't allow the device to gain new privileges.
254
    jail.no_new_privs();
255

256
    #[cfg(feature = "seccomp_trace")]
257
    {
258
        #[repr(C)]
259
        #[derive(Immutable, IntoBytes)]
260
        struct sock_filter {
261
            /* Filter block */
262
            code: u16, /* Actual filter code */
263
            jt: u8,    /* Jump true */
264
            jf: u8,    /* Jump false */
265
            k: u32,    /* Generic multiuse field */
266
        }
267

268
        // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
269
        // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
270
        const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
271
        const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
272
        const BPF_RET: u16 = 0x06;
273
        const BPF_K: u16 = 0x00;
274

275
        // return SECCOMP_RET_LOG for all syscalls
276
        const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
277
            code: BPF_RET | BPF_K,
278
            jt: 0,
279
            jf: 0,
280
            k: SECCOMP_RET_LOG,
281
        };
282

283
        warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
284
        debug!(
285
            "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
286
            config.seccomp_policy_name,
287
            read_jail_addr(&jail),
288
        );
289
        jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
290
            .unwrap();
291
    }
292

293
    #[cfg(not(feature = "seccomp_trace"))]
294
    if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
295
        let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
296
        // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
297
        // is expected to be compiled using "trap" as the failure behavior instead of the default
298
        // "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
299
        // the built-in pre-compiled policies will be used.
300
        // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
301
        // explanation about why the |log_failures| flag forces the use of .policy files (and the
302
        // build-time alternative to this run-time flag).
303
        let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
304
        if bpf_policy_file.exists() && !config.log_failures {
305
            jail.parse_seccomp_program(&bpf_policy_file)
306
                .with_context(|| {
307
                    format!(
308
                        "failed to parse precompiled seccomp policy: {}",
309
                        bpf_policy_file.display()
310
                    )
311
                })?;
312
        } else {
313
            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
314
            // kill the entire device process if a worker thread commits a seccomp violation.
315
            jail.set_seccomp_filter_tsync();
316
            if config.log_failures {
317
                jail.log_seccomp_filter_failures();
318
            }
319
            let bpf_policy_file = seccomp_policy_path.with_extension("policy");
320
            jail.parse_seccomp_filters(&bpf_policy_file)
321
                .with_context(|| {
322
                    format!(
323
                        "failed to parse seccomp policy: {}",
324
                        bpf_policy_file.display()
325
                    )
326
                })?;
327
        }
328
    } else {
329
        set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
330
    }
331

332
    jail.use_seccomp_filter();
333
    // Don't do init setup.
334
    jail.run_as_init();
335
    // Set up requested remount mode instead of default MS_PRIVATE.
336
    if let Some(mode) = config.remount_mode {
337
        jail.set_remount_mode(mode);
338
    }
339

340
    Ok(jail)
341
}
342

343
/// Creates a basic [Minijail] if `jail_config` is present.
344
///
345
/// Returns `None` if `jail_config` is none.
346
pub fn simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>> {
347
    if let Some(jail_config) = jail_config {
348
        let config = SandboxConfig::new(jail_config, policy);
349
        Ok(Some(create_sandbox_minijail(
350
            &jail_config.pivot_root,
351
            MAX_OPEN_FILES_DEFAULT,
352
            &config,
353
        )?))
354
    } else {
355
        Ok(None)
356
    }
357
}
358

359
/// Creates [Minijail] for gpu processes.
360
pub fn create_gpu_minijail(
361
    root: &Path,
362
    config: &SandboxConfig,
363
    render_node_only: bool,
364
    snapshot_scratch_directory: Option<&Path>,
365
) -> Result<Minijail> {
366
    let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
367

368
    // Device nodes required for DRM.
369
    let sys_dev_char_path = Path::new("/sys/dev/char");
370
    jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
371

372
    // Necessary for CGROUP control of the vGPU threads
373
    // This is not necessary UNLESS one wants to make use
374
    // of the gpu cgroup command line options.
375
    let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
376
    if sys_cpuset_path.exists() {
377
        jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
378
    }
379

380
    let sys_devices_path = Path::new("/sys/devices");
381
    jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
382

383
    jail_mount_bind_drm(&mut jail, render_node_only)?;
384

385
    // If the ARM specific devices exist on the host, bind mount them in.
386
    let mali0_path = Path::new("/dev/mali0");
387
    if mali0_path.exists() {
388
        jail.mount_bind(mali0_path, mali0_path, true)?;
389
    }
390

391
    let pvr_sync_path = Path::new("/dev/pvr_sync");
392
    if pvr_sync_path.exists() {
393
        jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
394
    }
395

396
    // If the udmabuf driver exists on the host, bind mount it in.
397
    let udmabuf_path = Path::new("/dev/udmabuf");
398
    if udmabuf_path.exists() {
399
        jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
400
    }
401

402
    // Libraries that are required when mesa drivers are dynamically loaded.
403
    jail_mount_bind_if_exists(
404
        &mut jail,
405
        &[
406
            "/usr/lib",
407
            "/usr/lib64",
408
            "/lib",
409
            "/lib64",
410
            "/usr/share/drirc.d",
411
            "/usr/share/glvnd",
412
            "/usr/share/libdrm",
413
            "/usr/share/vulkan",
414
        ],
415
    )?;
416

417
    // pvr driver requires read access to /proc/self/task/*/comm.
418
    mount_proc(&mut jail)?;
419

420
    // To enable perfetto tracing, we need to give access to the perfetto service IPC
421
    // endpoints.
422
    let perfetto_path = Path::new("/run/perfetto");
423
    if perfetto_path.exists() {
424
        jail.mount_bind(perfetto_path, perfetto_path, true)?;
425
    }
426

427
    // Provide scratch space for the GPU device to build or unpack snapshots.
428
    if let Some(snapshot_scratch_directory) = snapshot_scratch_directory {
429
        jail.mount_with_data(
430
            Path::new("none"),
431
            snapshot_scratch_directory,
432
            "tmpfs",
433
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
434
            "size=4294967296",
435
        )?;
436
    }
437

438
    Ok(jail)
439
}
440

441
/// Selectively bind mount drm nodes into `jail` based on `render_node_only`
442
///
443
/// This function will not return an error if drm nodes don't exist
444
pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
445
    if render_node_only {
446
        const DRM_NUM_NODES: u32 = 63;
447
        const DRM_RENDER_NODE_START: u32 = 128;
448
        for offset in 0..DRM_NUM_NODES {
449
            let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
450
            let drm_dri_path = Path::new(&path_str);
451
            if !drm_dri_path.exists() {
452
                break;
453
            }
454
            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
455
        }
456
    } else {
457
        let drm_dri_path = Path::new("/dev/dri");
458
        if drm_dri_path.exists() {
459
            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
460
        }
461
    }
462

463
    Ok(())
464
}
465

466
/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
467
///
468
/// This function will not return an error if any of the directories in `dirs` is missing.
469
pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
470
    jail: &mut Minijail,
471
    dirs: &[P],
472
) -> Result<()> {
473
    for dir in dirs {
474
        let dir_path = Path::new(dir);
475
        if dir_path.exists() {
476
            jail.mount_bind(dir_path, dir_path, false)?;
477
        }
478
    }
479

480
    Ok(())
481
}
482

483
/// Mount proc in the sandbox.
484
pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
485
    jail.mount(
486
        Path::new("proc"),
487
        Path::new("/proc"),
488
        "proc",
489
        (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
490
    )?;
491
    Ok(())
492
}
493

494
/// Reads the first pointer-sized word of `jail`, used as an id to uniquely identify the jail and
/// track its lifetime.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // This relies on minijail's rust object keeping a pointer to the C jail struct as its first
    // field, which we can only hope stays true.
    const_assert!(std::mem::size_of::<Minijail>() >= std::mem::size_of::<usize>());
    let first_word = (jail as *const Minijail).cast::<usize>();
    // SAFETY: the read stays inside the `Minijail` object, whose size is checked by the static
    // assert above to be at least one `usize`.
    unsafe { first_word.read() }
}
503

504
/// Set the uid/gid for the jailed process and give a basic id map. This is
505
/// required for bind mounts to work.
506
fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
507
    let crosvm_uid = geteuid();
508
    let crosvm_gid = getegid();
509

510
    jail.uidmap(&format!("{crosvm_uid} {crosvm_uid} 1"))
511
        .context("error setting UID map")?;
512
    jail.gidmap(&format!("{crosvm_gid} {crosvm_gid} 1"))
513
        .context("error setting GID map")?;
514

515
    if crosvm_uid != 0 {
516
        jail.change_uid(crosvm_uid);
517
    }
518
    if crosvm_gid != 0 {
519
        jail.change_gid(crosvm_gid);
520
    }
521
    Ok(())
522
}
523

524
/// Set the seccomp policy for a jail from embedded bpfs
525
pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
526
    let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
527
        format!("failed to find embedded seccomp policy: {seccomp_policy_name}")
528
    })?;
529
    jail.parse_seccomp_bytes(bpf_program).with_context(|| {
530
        format!("failed to parse embedded seccomp policy: {seccomp_policy_name}")
531
    })?;
532
    Ok(())
533
}
534

535
Product

Resources

Company