linux.rs: unify jail creation

This change unifies two substantially similar segments of code used to
create a jail.

BUG=none
TEST=Ran 'build_test'.  Local build, deployed to DUT, and verified that
termina VM could still be used.

Change-Id: Ib1f2f9bc5cfe1e6c9f3633af7e23f52e5eafe3c7
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2057744
Tested-by: Matt Delco <delco@chromium.org>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Dylan Reid <dgreid@chromium.org>
Commit-Queue: Matt Delco <delco@chromium.org>
This commit is contained in:
Matt Delco 2020-02-14 13:24:36 -08:00 committed by Commit Bot
parent de92ad05c7
commit c24ad78624

View file

@ -304,55 +304,78 @@ fn get_max_open_files() -> Result<libc::rlim64_t> {
}
}
/// Sandbox settings applied on top of the base jail when sandboxing is
/// enabled. Passing `None` for the config to `create_base_minijail` skips
/// all of these (no new user/pid/net namespaces, no seccomp).
struct SandboxConfig<'a> {
// Drop all capabilities inside the jail when true. (The fs device keeps
// its caps; most devices do not need any.)
limit_caps: bool,
// Force use of the .policy file and log seccomp failures instead of
// loading the pre-compiled .bpf (see the "seccomp-log-failures" flag).
log_failures: bool,
// Path to the seccomp policy, without extension; ".bpf" / ".policy" is
// appended when the filter is loaded.
seccomp_policy: &'a Path,
// Optional uid map for the jail's user namespace; unmapped runs as nobody.
uid_map: Option<&'a str>,
// Optional gid map for the jail's user namespace; unmapped runs as nobody.
gid_map: Option<&'a str>,
}
/// Creates the base minijail used by all jailed devices.
///
/// * `root` - Directory to pivot-root into; becomes the jail's empty root FS.
/// * `r_limit` - Optional RLIMIT_NOFILE value; defaults to 1024, which is
///   enough for most devices (the fs device passes a much larger value).
/// * `config` - When `Some`, full sandboxing (user/pid/net namespaces and a
///   seccomp filter) is applied; when `None`, only the mount namespace,
///   pivot root, and fd limit are set up.
///
/// Returns the configured jail, or an `Error` variant naming the minijail
/// operation that failed.
fn create_base_minijail(
    root: &Path,
    r_limit: Option<u64>,
    config: Option<&SandboxConfig>,
) -> Result<Minijail> {
    // All child jails run in a new user namespace without any users mapped,
    // they run as nobody unless otherwise configured.
    let mut j = Minijail::new().map_err(Error::DeviceJail)?;

    if let Some(config) = config {
        j.namespace_pids();
        j.namespace_user();
        j.namespace_user_disable_setgroups();
        if config.limit_caps {
            // Don't need any capabilities.
            j.use_caps(0);
        }
        if let Some(uid_map) = config.uid_map {
            j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
        }
        if let Some(gid_map) = config.gid_map {
            j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
        }
        // Run in an empty network namespace.
        j.namespace_net();
        // Apply the device's seccomp policy.
        j.no_new_privs();
        // By default we'll prioritize using the pre-compiled .bpf over the
        // .policy file (the .bpf is expected to be compiled using "trap" as
        // the failure behavior instead of the default "kill" behavior).
        // Refer to the code comment for the "seccomp-log-failures"
        // command-line parameter for an explanation about why the
        // |log_failures| flag forces the use of .policy files (and the
        // build-time alternative to this run-time flag).
        let bpf_policy_file = config.seccomp_policy.with_extension("bpf");
        if bpf_policy_file.exists() && !config.log_failures {
            j.parse_seccomp_program(&bpf_policy_file)
                .map_err(Error::DeviceJail)?;
        } else {
            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
            // which will correctly kill the entire device process if a worker
            // thread commits a seccomp violation.
            j.set_seccomp_filter_tsync();
            if config.log_failures {
                j.log_seccomp_filter_failures();
            }
            j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy"))
                .map_err(Error::DeviceJail)?;
        }
        j.use_seccomp_filter();
        // Don't do init setup.
        j.run_as_init();
    }

    // Create a new mount namespace with an empty root FS. This is done even
    // without a sandbox config so the device cannot see the host FS.
    j.namespace_vfs();
    j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;

    // Most devices don't need to open many fds.
    let limit = r_limit.unwrap_or(1024u64);
    j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)
        .map_err(Error::SettingMaxOpenFiles)?;

    Ok(j)
}
@ -365,11 +388,14 @@ fn simple_jail(cfg: &Config, policy: &str) -> Result<Option<Minijail>> {
return Err(Error::PivotRootDoesntExist(pivot_root));
}
let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy);
Ok(Some(create_base_minijail(
root_path,
cfg.seccomp_log_failures,
&policy_path,
)?))
let config = SandboxConfig {
limit_caps: true,
log_failures: cfg.seccomp_log_failures,
seccomp_policy: &policy_path,
uid_map: None,
gid_map: None,
};
Ok(Some(create_base_minijail(root_path, None, Some(&config))?))
} else {
Ok(None)
}
@ -774,45 +800,20 @@ fn create_fs_device(
tag: &str,
fs_cfg: virtio::fs::passthrough::Config,
) -> DeviceResult {
let mut j = Minijail::new().map_err(Error::DeviceJail)?;
if cfg.sandbox {
j.namespace_pids();
j.namespace_user();
j.namespace_user_disable_setgroups();
j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
// Run in an empty network namespace.
j.namespace_net();
j.no_new_privs();
// Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill
// the entire device process if a worker thread commits a seccomp violation.
let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
j.set_seccomp_filter_tsync();
if cfg.seccomp_log_failures {
j.log_seccomp_filter_failures();
}
j.parse_seccomp_filters(&seccomp_policy)
.map_err(Error::DeviceJail)?;
j.use_seccomp_filter();
// Don't do init setup.
j.run_as_init();
}
// Create a new mount namespace with the source directory as the root. We need this even when
// sandboxing is disabled as the server relies on the host kernel to prevent path traversals
// from leaking out of the shared directory.
j.namespace_vfs();
j.enter_pivot_root(src).map_err(Error::DevicePivotRoot)?;
// The file server opens a lot of fds and needs a really high open file limit.
let max_open_files = get_max_open_files()?;
j.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
.map_err(Error::SettingMaxOpenFiles)?;
let j = if cfg.sandbox {
let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
let config = SandboxConfig {
limit_caps: false,
uid_map: Some(uid_map),
gid_map: Some(gid_map),
log_failures: cfg.seccomp_log_failures,
seccomp_policy: &seccomp_policy,
};
create_base_minijail(src, Some(max_open_files), Some(&config))?
} else {
create_base_minijail(src, Some(max_open_files), None)?
};
// TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic
// when num_queues > 1.