mirror of
https://github.com/michael-yuji/xc.git
synced 2026-03-18 06:45:39 +01:00
609 lines
23 KiB
Rust
609 lines
23 KiB
Rust
// Copyright (c) 2023 Yan Ka, Chiu.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions
|
|
// are met:
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions, and the following disclaimer,
|
|
// without modification, immediately at the beginning of the file.
|
|
// 2. The name of the author may not be used to endorse or promote products
|
|
// derived from this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
|
|
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
// SUCH DAMAGE.
|
|
|
|
use crate::auth::Credential;
|
|
use crate::devfs_store::DevfsRulesetStore;
|
|
use crate::ipc::InstantiateRequest;
|
|
use crate::resources::volume::Volume;
|
|
use crate::resources::Resources;
|
|
|
|
use anyhow::Context;
|
|
use freebsd::event::EventFdNotify;
|
|
use freebsd::libc::{EINVAL, EIO, ENOENT, EPERM};
|
|
use ipc::packet::codec::Maybe;
|
|
use oci_util::image_reference::ImageReference;
|
|
use std::collections::HashMap;
|
|
use std::net::IpAddr;
|
|
use std::os::fd::{AsRawFd, RawFd};
|
|
use varutil::string_interpolation::InterpolatedString;
|
|
use xc::container::request::{CopyFileReq, Mount, NetworkAllocRequest};
|
|
use xc::errx;
|
|
use xc::format::devfs_rules::DevfsRule;
|
|
use xc::models::exec::{Jexec, StdioMode};
|
|
use xc::models::jail_image::JailImage;
|
|
use xc::models::network::{DnsSetting, IpAssign, MainAddressSelector, PortRedirection};
|
|
use xc::models::EnforceStatfs;
|
|
|
|
pub struct CheckedInstantiateRequest {
|
|
pub(crate) request: InstantiateRequest,
|
|
pub(crate) devfs_rules: Vec<DevfsRule>,
|
|
allowing: Vec<String>,
|
|
copies: Vec<xc::container::request::CopyFileReq>,
|
|
enforce_statfs: EnforceStatfs,
|
|
pub(crate) image: JailImage,
|
|
}
|
|
|
|
impl CheckedInstantiateRequest {
|
|
pub(crate) fn new(
|
|
mut request: InstantiateRequest,
|
|
oci_config: &JailImage,
|
|
cred: &Credential,
|
|
resources: &mut Resources,
|
|
) -> anyhow::Result<CheckedInstantiateRequest> {
|
|
let existing_ifaces = freebsd::net::ifconfig::interfaces()?;
|
|
let available_allows = xc::util::jail_allowables();
|
|
let config = oci_config.jail_config();
|
|
|
|
let mut envs = request.envs.clone();
|
|
|
|
if let Some(ifaces) = request.tun_interfaces.as_ref() {
|
|
for tun in ifaces.iter() {
|
|
envs.insert(tun.to_string(), "dummy".to_string());
|
|
}
|
|
}
|
|
|
|
if let Some(ifaces) = request.tap_interfaces.as_ref() {
|
|
for tun in ifaces.iter() {
|
|
envs.insert(tun.to_string(), "dummy".to_string());
|
|
}
|
|
}
|
|
|
|
for (key, env_spec) in config.envs.iter() {
|
|
let key_string = key.to_string();
|
|
if !request.envs.contains_key(&key_string) {
|
|
if let Some(value) = &env_spec.default_value {
|
|
envs.insert(key_string, value.clone());
|
|
} else if env_spec.required {
|
|
let extra_info = env_spec
|
|
.description
|
|
.as_ref()
|
|
.map(|d| format!(" - {d}"))
|
|
.unwrap_or_default();
|
|
errx!(
|
|
ENOENT,
|
|
"missing required environment variable: {key}{extra_info}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
for assign in request.ips.iter() {
|
|
let iface = &assign.interface;
|
|
if !existing_ifaces.contains(iface) {
|
|
errx!(ENOENT, "missing network interface {iface}");
|
|
}
|
|
}
|
|
|
|
let mut allowing = {
|
|
let mut allows = Vec::new();
|
|
for allow in config.allow.iter() {
|
|
if available_allows.contains(allow) {
|
|
allows.push(allow.to_string());
|
|
} else if let Some(("mount", fs)) = allow.split_once('.') {
|
|
errx!(ENOENT, "{allow} is not available; maybe try kldload {fs}");
|
|
} else {
|
|
errx!(EIO, "allow.{allow} is not available on this system");
|
|
}
|
|
}
|
|
allows
|
|
};
|
|
|
|
if !request.jail_datasets.is_empty() {
|
|
allowing.push("mount".to_string());
|
|
allowing.push("mount.zfs".to_string());
|
|
}
|
|
|
|
let enforce_statfs = if request.jail_datasets.is_empty() {
|
|
EnforceStatfs::Strict
|
|
} else {
|
|
EnforceStatfs::BelowRoot
|
|
};
|
|
|
|
let copies: Vec<xc::container::request::CopyFileReq> = request
|
|
.copies
|
|
.move_to_vec()
|
|
.iter()
|
|
.map(|c| xc::container::request::CopyFileReq {
|
|
source: c.source.as_raw_fd(),
|
|
destination: c.destination.clone(),
|
|
})
|
|
.collect();
|
|
|
|
let mut mount_specs = oci_config.jail_config().mounts;
|
|
|
|
for req in request.mount_req.clone().to_vec().iter() {
|
|
let source_path = std::path::Path::new(&req.source);
|
|
|
|
if !source_path.is_absolute() {
|
|
let name = source_path.to_string_lossy().to_string();
|
|
match resources.query_volume(&name) {
|
|
None => {
|
|
errx!(ENOENT, "no such volume {name}")
|
|
}
|
|
Some(volume) => {
|
|
if !volume.can_mount(cred.uid()) {
|
|
errx!(EPERM, "this user is not allowed to mount the volume")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
mount_specs.remove(req.dest.to_str().unwrap());
|
|
}
|
|
|
|
for (key, spec) in mount_specs.iter() {
|
|
if spec.required {
|
|
errx!(ENOENT, "Required volume {key:?} is not mounted");
|
|
}
|
|
}
|
|
|
|
for req in request.ipreq.iter() {
|
|
let network = req.network();
|
|
if !resources.has_network(network) {
|
|
errx!(ENOENT, "no such network: {network}");
|
|
}
|
|
}
|
|
|
|
'iter_groups: for group in request.netgroups.iter() {
|
|
for req in request.ipreq.iter() {
|
|
if req.network() == group {
|
|
continue 'iter_groups;
|
|
}
|
|
}
|
|
errx!(
|
|
ENOENT,
|
|
"cannot add container to netgroup {group} as network {group} does not exist"
|
|
)
|
|
}
|
|
|
|
let mut devfs_rules = Vec::new();
|
|
for rule in config.devfs_rules.iter() {
|
|
let applied = rule.apply(&envs);
|
|
match applied.parse::<xc::format::devfs_rules::DevfsRule>() {
|
|
Err(error) => {
|
|
errx!(EINVAL, "invaild devfs rule: [{applied}], {error}")
|
|
}
|
|
Ok(rule) => devfs_rules.push(rule),
|
|
}
|
|
}
|
|
|
|
Ok(CheckedInstantiateRequest {
|
|
request,
|
|
copies,
|
|
devfs_rules,
|
|
allowing,
|
|
enforce_statfs,
|
|
image: oci_config.clone(),
|
|
})
|
|
}
|
|
}
|
|
|
|
pub struct InstantiateBlueprint {
|
|
pub id: String,
|
|
pub name: String,
|
|
pub hostname: String,
|
|
pub image_reference: Option<ImageReference>,
|
|
pub vnet: bool,
|
|
pub mount_req: Vec<Mount>,
|
|
pub copies: Vec<CopyFileReq>,
|
|
pub main_norun: bool,
|
|
pub init_norun: bool,
|
|
pub deinit_norun: bool,
|
|
pub extra_layers: Vec<RawFd>,
|
|
pub persist: bool,
|
|
pub no_clean: bool,
|
|
pub dns: DnsSetting,
|
|
pub origin_image: Option<JailImage>,
|
|
pub allowing: Vec<String>,
|
|
pub linux: bool,
|
|
pub init: Vec<Jexec>,
|
|
pub deinit: Vec<Jexec>,
|
|
pub main: Option<Jexec>,
|
|
pub ips: Vec<IpAssign>,
|
|
pub ipreq: Vec<NetworkAllocRequest>,
|
|
pub envs: HashMap<String, String>,
|
|
pub devfs_ruleset_id: u16,
|
|
pub ip_alloc: Vec<IpAssign>,
|
|
pub default_router: Option<IpAddr>,
|
|
pub main_started_notify: Option<EventFdNotify>,
|
|
pub create_only: bool,
|
|
pub linux_no_create_sys_dir: bool,
|
|
pub linux_no_create_proc_dir: bool,
|
|
pub linux_no_mount_sys: bool,
|
|
pub linux_no_mount_proc: bool,
|
|
pub override_props: HashMap<String, String>,
|
|
pub enforce_statfs: EnforceStatfs,
|
|
pub jailed_datasets: Vec<std::path::PathBuf>,
|
|
pub children_max: u32,
|
|
pub main_ip_selector: Option<MainAddressSelector>,
|
|
pub created_interfaces: Vec<String>,
|
|
pub port_redirections: Vec<PortRedirection>,
|
|
}
|
|
|
|
impl InstantiateBlueprint {
|
|
pub(crate) fn new(
|
|
id: &str,
|
|
request: CheckedInstantiateRequest,
|
|
devfs_store: &mut DevfsRulesetStore,
|
|
cred: &Credential,
|
|
resources: &mut Resources,
|
|
) -> anyhow::Result<InstantiateBlueprint> {
|
|
let oci_config = &request.image;
|
|
let existing_ifaces = freebsd::net::ifconfig::interfaces()?;
|
|
let config = oci_config.jail_config();
|
|
let name = match request.request.name {
|
|
None => format!("xc-{id}"),
|
|
Some(name) => {
|
|
if name.parse::<isize>().is_ok() {
|
|
errx!(EINVAL, "Jail name cannot be numeric")
|
|
} else if name.contains('.') {
|
|
errx!(EINVAL, "Jail name cannot contain dot (.)")
|
|
} else {
|
|
name
|
|
}
|
|
}
|
|
};
|
|
|
|
let hostname = request.request.hostname.unwrap_or_else(|| name.to_string());
|
|
let vnet = request.request.vnet || config.vnet;
|
|
let mut tuntap_ifaces = Vec::new();
|
|
let mut envs = request.request.envs.clone();
|
|
|
|
for (key, env_spec) in config.envs.iter() {
|
|
let key_string = key.to_string();
|
|
if !request.request.envs.contains_key(&key_string) {
|
|
if let Some(value) = &env_spec.default_value {
|
|
envs.insert(key_string, value.clone());
|
|
} else if env_spec.required {
|
|
let extra_info = env_spec
|
|
.description
|
|
.as_ref()
|
|
.map(|d| format!(" - {d}"))
|
|
.unwrap_or_default();
|
|
errx!(
|
|
ENOENT,
|
|
"missing required environment variable: {key}{extra_info}"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
if config.linux {
|
|
if !freebsd::exists_kld("linux64") {
|
|
errx!(
|
|
EIO,
|
|
"Linux image require linux64 kmod but it is missing from the system"
|
|
);
|
|
} else if xc::util::elf_abi_fallback_brand() != "3" {
|
|
errx!(EIO, "kern.elf64.fallback_brand did not set to 3 (Linux)");
|
|
}
|
|
}
|
|
|
|
let main_started_notify = match request.request.main_started_notify {
|
|
ipc::packet::codec::Maybe::None => None,
|
|
ipc::packet::codec::Maybe::Some(x) => Some(EventFdNotify::from_fd(x.as_raw_fd())),
|
|
};
|
|
|
|
let main_exited_notify = match request.request.main_exited_fd {
|
|
ipc::packet::codec::Maybe::None => None,
|
|
ipc::packet::codec::Maybe::Some(x) => Some(x.as_raw_fd()),
|
|
};
|
|
|
|
let mut ip_alloc = request.request.ips.clone();
|
|
|
|
let mut default_router = None;
|
|
|
|
for req in request.request.ipreq.iter() {
|
|
match resources.allocate(vnet, req, id) {
|
|
Ok((alloc, router)) => {
|
|
if !existing_ifaces.contains(&alloc.interface) {
|
|
errx!(ENOENT, "missing network interface {}", &alloc.interface);
|
|
}
|
|
if let Some(router) = router {
|
|
if default_router.is_none() {
|
|
default_router = Some(router);
|
|
}
|
|
}
|
|
ip_alloc.push(alloc);
|
|
}
|
|
Err(error) => match error {
|
|
crate::resources::network::Error::Sqlite(error) => {
|
|
Err(error).context("sqlite error on address allocation")?;
|
|
}
|
|
crate::resources::network::Error::AllocationFailure(network) => {
|
|
errx!(ENOENT, "cannot allocate address from network {network}")
|
|
}
|
|
crate::resources::network::Error::AddressUsed(addr) => {
|
|
errx!(ENOENT, "address {addr} already consumed")
|
|
}
|
|
crate::resources::network::Error::InvalidAddress(addr, network) => {
|
|
errx!(EINVAL, "{addr} is not in the subnet of {network}")
|
|
}
|
|
crate::resources::network::Error::NoSuchNetwork(network) => {
|
|
errx!(ENOENT, "network {network} is missing from config file")
|
|
}
|
|
crate::resources::network::Error::Other(error) => {
|
|
Err(error).context("error occured during address allocation")?;
|
|
}
|
|
crate::resources::network::Error::Unsupported => {
|
|
Err(error).context("this network does not support such operation")?;
|
|
}
|
|
},
|
|
};
|
|
}
|
|
|
|
for tap in request.request.tap_interfaces.unwrap_or_default() {
|
|
let interface = freebsd::net::ifconfig::create_tap()?;
|
|
tuntap_ifaces.push(interface.to_string());
|
|
envs.insert(tap, interface.clone());
|
|
ip_alloc.push(IpAssign {
|
|
network: None,
|
|
addresses: Vec::new(),
|
|
interface,
|
|
});
|
|
}
|
|
|
|
for tun in request.request.tun_interfaces.unwrap_or_default() {
|
|
let interface = freebsd::net::ifconfig::create_tap()?;
|
|
tuntap_ifaces.push(interface.to_string());
|
|
envs.insert(tun, interface.clone());
|
|
ip_alloc.push(IpAssign {
|
|
network: None,
|
|
addresses: Vec::new(),
|
|
interface,
|
|
});
|
|
}
|
|
|
|
let mut mount_req = Vec::new();
|
|
|
|
for special_mount in config.special_mounts.iter() {
|
|
if special_mount.mount_type.as_str() == "procfs" {
|
|
mount_req.push(Mount::procfs(&special_mount.mount_point));
|
|
} else if special_mount.mount_type.as_str() == "fdescfs" {
|
|
mount_req.push(Mount::fdescfs(&special_mount.mount_point));
|
|
}
|
|
}
|
|
|
|
let mut mount_specs = oci_config.jail_config().mounts;
|
|
let mut added_mount_specs = HashMap::new();
|
|
|
|
for req in request.request.mount_req.clone().to_vec().iter() {
|
|
let source_path = std::path::Path::new(&req.source);
|
|
|
|
let volume = if !source_path.is_absolute() {
|
|
let name = source_path.to_string_lossy().to_string();
|
|
match resources.query_volume(&name) {
|
|
None => {
|
|
errx!(ENOENT, "no such volume {name}")
|
|
}
|
|
Some(volume) => {
|
|
if !volume.can_mount(cred.uid()) {
|
|
errx!(EPERM, "this user is not allowed to mount the volume")
|
|
} else {
|
|
volume
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
match &req.evid {
|
|
Maybe::None => errx!(ENOENT, "missing evidence"),
|
|
Maybe::Some(fd) => {
|
|
let Ok(stat) = freebsd::nix::sys::stat::fstat(fd.as_raw_fd()) else {
|
|
errx!(ENOENT, "cannot stat evidence")
|
|
};
|
|
let check_stat = freebsd::nix::sys::stat::stat(source_path).unwrap();
|
|
if stat.st_ino != check_stat.st_ino {
|
|
errx!(ENOENT, "evidence inode mismatch")
|
|
}
|
|
_ = freebsd::nix::unistd::close(fd.as_raw_fd());
|
|
Volume::adhoc(source_path)
|
|
}
|
|
}
|
|
};
|
|
|
|
let mount_spec = mount_specs.remove(req.dest.to_str().unwrap());
|
|
|
|
if mount_spec.is_some() {
|
|
added_mount_specs.insert(&req.dest, mount_spec.clone().unwrap());
|
|
}
|
|
|
|
let mount = resources.mount(id, cred, req, mount_spec.as_ref(), &volume)?;
|
|
mount_req.push(mount);
|
|
}
|
|
|
|
for dataset in request.request.jail_datasets.iter() {
|
|
if resources.dataset_tracker.is_jailed(dataset) {
|
|
errx!(
|
|
EPERM,
|
|
"another container is using this dataset: {dataset:?}"
|
|
)
|
|
} else {
|
|
resources.dataset_tracker.set_jailed(id, dataset)
|
|
}
|
|
}
|
|
|
|
let mut devfs_rules = vec![
|
|
"include 1".to_string(),
|
|
"include 2".to_string(),
|
|
"include 3".to_string(),
|
|
"include 4".to_string(),
|
|
"include 5".to_string(),
|
|
];
|
|
|
|
if request.request.enable_usdt {
|
|
devfs_rules.push("path dtrace unhide".to_string());
|
|
devfs_rules.push("path dtrace/helper unhide".to_string());
|
|
}
|
|
|
|
if request.request.enable_pf {
|
|
devfs_rules.push("path pf unhide".to_string());
|
|
}
|
|
|
|
for rule in request.devfs_rules.iter() {
|
|
devfs_rules.push(rule.to_string());
|
|
}
|
|
|
|
for name in tuntap_ifaces.iter() {
|
|
devfs_rules.push(format!("path {name} unhide"));
|
|
}
|
|
|
|
let devfs_ruleset_id = devfs_store.get_ruleset_id(&devfs_rules);
|
|
|
|
envs.insert("XC_DEVFS_RULESET".to_string(), devfs_ruleset_id.to_string());
|
|
|
|
let main = match &request.request.entry_point {
|
|
Some(spec) => {
|
|
let args = {
|
|
let mut args = Vec::new();
|
|
for arg in spec.entry_point_args.iter() {
|
|
args.push(arg.parse::<InterpolatedString>().context("invalid arg")?);
|
|
}
|
|
args
|
|
};
|
|
|
|
let selected_entry = match &spec.entry_point {
|
|
Some(name) => name.to_string(),
|
|
None => config
|
|
.default_entry_point
|
|
.unwrap_or_else(|| "main".to_string()),
|
|
};
|
|
|
|
let mut entry_point =
|
|
if let Some(entry_point) = config.entry_points.get(&selected_entry) {
|
|
entry_point.clone()
|
|
} else {
|
|
xc::models::exec::Exec {
|
|
exec: selected_entry,
|
|
args,
|
|
default_args: Vec::new(),
|
|
environ: HashMap::new(),
|
|
work_dir: None,
|
|
required_envs: Vec::new(),
|
|
clear_env: false,
|
|
user: request.request.user.clone(),
|
|
group: request.request.group.clone(),
|
|
}
|
|
};
|
|
|
|
if request.request.user.is_some() {
|
|
entry_point.user = request.request.user.clone();
|
|
}
|
|
|
|
if request.request.group.is_some() {
|
|
entry_point.group = request.request.group.clone();
|
|
}
|
|
|
|
let mut jexec = entry_point.resolve_args(&envs, &spec.entry_point_args)?;
|
|
if request.request.use_tty {
|
|
jexec.output_mode = StdioMode::Terminal;
|
|
} else {
|
|
jexec.output_mode = StdioMode::Forward {
|
|
stdin: request.request.stdin.to_option().map(|fd| fd.as_raw_fd()),
|
|
stdout: request.request.stdout.to_option().map(|fd| fd.as_raw_fd()),
|
|
stderr: request.request.stderr.to_option().map(|fd| fd.as_raw_fd()),
|
|
};
|
|
}
|
|
jexec.notify = main_exited_notify.map(|a| a.as_raw_fd());
|
|
tracing::warn!("jexec: {jexec:#?}");
|
|
Some(jexec)
|
|
}
|
|
None => None,
|
|
};
|
|
|
|
let init = config
|
|
.init
|
|
.clone()
|
|
.into_iter()
|
|
.map(|s| s.resolve_args(&envs, &[]))
|
|
.collect::<Result<Vec<_>, _>>()?;
|
|
|
|
let deinit = config
|
|
.deinit
|
|
.clone()
|
|
.into_iter()
|
|
.map(|s| s.resolve_args(&envs, &[]))
|
|
.collect::<Result<Vec<_>, _>>()?;
|
|
|
|
let extra_layers = request
|
|
.request
|
|
.extra_layers
|
|
.to_vec()
|
|
.into_iter()
|
|
.map(|fd| fd.as_raw_fd())
|
|
.collect::<Vec<_>>();
|
|
|
|
Ok(InstantiateBlueprint {
|
|
name,
|
|
hostname,
|
|
id: id.to_string(),
|
|
vnet,
|
|
init,
|
|
deinit,
|
|
extra_layers,
|
|
main,
|
|
ips: request.request.ips,
|
|
ipreq: request.request.ipreq,
|
|
mount_req,
|
|
linux: config.linux,
|
|
deinit_norun: request.request.deinit_norun,
|
|
init_norun: request.request.init_norun,
|
|
main_norun: request.request.main_norun,
|
|
persist: request.request.persist,
|
|
no_clean: request.request.no_clean,
|
|
dns: request.request.dns,
|
|
origin_image: Some(oci_config.clone()),
|
|
allowing: request.allowing,
|
|
image_reference: Some(request.request.image_reference),
|
|
copies: request.copies,
|
|
envs,
|
|
ip_alloc,
|
|
devfs_ruleset_id,
|
|
default_router,
|
|
main_started_notify,
|
|
create_only: request.request.create_only,
|
|
linux_no_create_sys_dir: request.request.linux_no_create_sys_dir,
|
|
linux_no_create_proc_dir: request.request.linux_no_create_proc_dir,
|
|
linux_no_mount_sys: request.request.linux_no_mount_sys,
|
|
linux_no_mount_proc: request.request.linux_no_mount_proc,
|
|
override_props: request.request.override_props,
|
|
enforce_statfs: request.enforce_statfs,
|
|
jailed_datasets: request.request.jail_datasets,
|
|
children_max: request.request.children_max,
|
|
main_ip_selector: request.request.main_ip_selector,
|
|
created_interfaces: tuntap_ifaces,
|
|
port_redirections: request.request.port_redirections,
|
|
})
|
|
}
|
|
}
|