Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for systemd managed cgroups
Browse files Browse the repository at this point in the history
nimrodshn committed Jun 16, 2021

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent 2f14718 commit 46edd7f
Showing 13 changed files with 529 additions and 15 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- run: rustup component add clippy
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev
- uses: actions-rs/clippy-check@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
@@ -30,6 +31,7 @@ jobs:
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev
- run: cargo install cargo-when
- name: Build
run: ./build.sh
120 changes: 114 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -22,6 +22,8 @@ once_cell = "1.6.0"
futures = { version = "0.3", features = ["thread-pool"] }
regex = "1.5"
oci_spec = { version = "0.1.0", path = "./oci_spec" }
systemd = { version = "0.8", default-features = false }
dbus = "0.9.2"

[dev-dependencies]
oci_spec = { version = "0.1.0", path = "./oci_spec", features = ["proptests"] }
21 changes: 19 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -40,12 +40,29 @@ For other platforms, please use the devcontainer that we prepared.
- Rust(See [here](https://www.rust-lang.org/tools/install))
- Docker(See [here](https://docs.docker.com/engine/install))

## Building
## Dependencies
```sh
$ cargo install cargo-when
```

### Debian, Ubuntu and related distributions
```sh
$ sudo apt-get install \
pkg-config \
libsystemd-dev \
libdbus-glib-1-dev
```


### Fedora, Centos, RHEL and related distributions
```sh
$ cargo install cargo-when # installs prerequisite for building youki
$ sudo dnf install \
pkg-config \
systemd-devel \
dbus-devel
```

## Build
```sh
$ git clone [email protected]:containers/youki.git
$ cd youki
27 changes: 25 additions & 2 deletions src/cgroups/common.rs
Original file line number Diff line number Diff line change
@@ -6,11 +6,11 @@ use std::{
path::{Path, PathBuf},
};


use anyhow::{bail, Context, Result};
use nix::unistd::Pid;
use oci_spec::LinuxResources;
use procfs::process::Process;
use systemd::daemon::booted;

use crate::cgroups::v1;
use crate::cgroups::v2;
@@ -91,7 +91,10 @@ pub fn get_supported_cgroup_fs() -> Result<Vec<Cgroup>> {
Ok(cgroups)
}

pub fn create_cgroup_manager<P: Into<PathBuf>>(cgroup_path: P) -> Result<Box<dyn CgroupManager>> {
pub fn create_cgroup_manager<P: Into<PathBuf>>(
cgroup_path: P,
systemd_cgroup: bool,
) -> Result<Box<dyn CgroupManager>> {
let cgroup_mount = Process::myself()?
.mountinfo()?
.into_iter()
@@ -109,6 +112,16 @@ pub fn create_cgroup_manager<P: Into<PathBuf>>(cgroup_path: P) -> Result<Box<dyn
}
(None, Some(cgroup2)) => {
log::info!("cgroup manager V2 will be used");
if systemd_cgroup {
if !booted()? {
bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available");
}
log::info!("systemd cgroup manager will be used");
return Ok(Box::new(v2::SystemDCGroupManager::new(
cgroup2.mount_point,
cgroup_path.into(),
)?));
}
Ok(Box::new(v2::manager::Manager::new(
cgroup2.mount_point,
cgroup_path.into(),
@@ -119,6 +132,16 @@ pub fn create_cgroup_manager<P: Into<PathBuf>>(cgroup_path: P) -> Result<Box<dyn
match cgroup_override {
Ok(v) if v == "true" => {
log::info!("cgroup manager V2 will be used");
if systemd_cgroup {
if !booted()? {
bail!("systemd cgroup flag passed, but systemd support for managing cgroups is not available");
}
log::info!("systemd cgroup manager will be used");
return Ok(Box::new(v2::SystemDCGroupManager::new(
cgroup2.mount_point,
cgroup_path.into(),
)?));
}
Ok(Box::new(v2::manager::Manager::new(
cgroup2.mount_point,
cgroup_path.into(),
2 changes: 1 addition & 1 deletion src/cgroups/v1/util.rs
Original file line number Diff line number Diff line change
@@ -45,4 +45,4 @@ pub fn get_subsystem_mount_points(subsystem: &str) -> Result<PathBuf> {
})
.map(|m| m.mount_point)
.ok_or_else(|| anyhow!("could not find mountpoint for {}", subsystem))
}
}
2 changes: 2 additions & 0 deletions src/cgroups/v2/mod.rs
Original file line number Diff line number Diff line change
@@ -7,4 +7,6 @@ mod io;
pub mod manager;
mod memory;
mod pids;
pub mod systemd_manager;
pub mod util;
pub use systemd_manager::SystemDCGroupManager;
308 changes: 308 additions & 0 deletions src/cgroups/v2/systemd_manager.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
use std::{
fs::{self},
os::unix::fs::PermissionsExt,
};

use anyhow::{anyhow, bail, Result};
use nix::unistd::Pid;
use oci_spec::LinuxResources;
use std::path::{Path, PathBuf};

use super::{cpu::Cpu, cpuset::CpuSet, hugetlb::HugeTlb, io::Io, memory::Memory, pids::Pids};
use crate::cgroups::common;
use crate::cgroups::common::{write_cgroup_file, CgroupManager};
use crate::cgroups::v2::controller::Controller;
use crate::cgroups::v2::controller_type::ControllerType;
use crate::utils::PathBufExt;

// Name of the cgroup file the container pid is written to.
const CGROUP_PROCS: &str = "cgroup.procs";
// Name of the cgroup file that is read to discover the available controllers.
const CGROUP_CONTROLLERS: &str = "cgroup.controllers";
// Name of the cgroup file the `+<controller>` entries are written to.
const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control";

// v2 systemd only supports cpu, io, memory and pids.
// `apply` iterates over this list to decide which controllers receive the
// resource limits.
const CONTROLLER_TYPES: &[ControllerType] = &[
ControllerType::Cpu,
ControllerType::Io,
ControllerType::Memory,
ControllerType::Pids,
];

/// SystemDCGroupManager is a driver for managing cgroups via systemd.
pub struct SystemDCGroupManager {
// Path that generated cgroup paths are joined onto
// (`/sys/fs/cgroup` in the unit tests of this file).
root_path: PathBuf,
// Parsed form of the `cgroupsPath` value the manager was constructed with.
cgroups_path: CgroupsPath,
}

/// Represents the systemd cgroups path:
/// It should be of the form [slice]:[scope_prefix]:[name].
/// The slice is the "parent" and should be expanded properly,
/// see expand_slice below.
struct CgroupsPath {
// Parent slice; when empty, `get_cgroups_path` falls back to `/machine.slice`.
parent: String,
// Scope prefix, used as the first half of the generated `<scope>-<name>.scope` unit name.
scope: String,
// Unit name; a value ending in `.slice` is used verbatim as the unit name.
name: String,
}

impl SystemDCGroupManager {
pub fn new(root_path: PathBuf, cgroups_path: PathBuf) -> Result<Self> {
// cgroups path may never be empty as it is defaulted to `/youki`
// see 'get_cgroup_path' under utils.rs.
// if cgroups_path was provided it should be of the form [slice]:[scope_prefix]:[name],
// for example: "system.slice:docker:1234".
let mut parent = "";
let scope;
let name;
if cgroups_path.starts_with("/youki") {
scope = "youki";
name = cgroups_path
.strip_prefix("/youki/")?
.to_str()
.ok_or_else(|| anyhow!("Failed to parse cgroupsPath field."))?;
} else {
let parts = cgroups_path
.to_str()
.ok_or_else(|| anyhow!("Failed to parse cgroupsPath field."))?
.split(':')
.collect::<Vec<&str>>();
parent = parts[0];
scope = parts[1];
name = parts[2];
}

// TODO: create the systemd unit using a dbus client.

Ok(SystemDCGroupManager {
root_path,
cgroups_path: CgroupsPath {
parent: parent.to_string(),
scope: scope.to_string(),
name: name.to_string(),
},
})
}

/// get_unit_name returns the unit (scope) name from the path provided by the user
/// for example: foo:docker:bar returns in '/docker-bar.scope'
fn get_unit_name(&self) -> String {
// By default we create a scope unless specified explicitly.
if !self.cgroups_path.name.ends_with(".slice") {
return format!(
"{}-{}.scope",
self.cgroups_path.scope, self.cgroups_path.name
);
}
self.cgroups_path.name.clone()
}

// systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of slice. For example, 'test-a-b.slice' becomes
// '/test.slice/test-a.slice/test-a-b.slice'.
fn expand_slice(&self, slice: String) -> Result<PathBuf> {
let suffix = ".slice";
if slice.len() <= suffix.len() || !slice.ends_with(suffix) {
anyhow!("invalid slice name: {}", slice);
}
if slice.contains('/') {
anyhow!("invalid slice name: {}", slice);
}
let mut path = "".to_owned();
let mut prefix = "".to_owned();
let slice_name = slice.trim_end_matches(suffix);
// if input was -.slice, we should just return root now
if slice_name == "-" {
return Ok(Path::new("/").to_path_buf());
}
for component in slice_name.split('-') {
if component.is_empty() {
anyhow!("Invalid slice name: {}", slice);
}
// Append the component to the path and to the prefix.
path = format!("{}/{}{}{}", path, prefix, component, suffix);
prefix = format!("{}{}-", prefix, component);
}
Ok(Path::new(&path).to_path_buf())
}

// get_cgroups_path generates a cgroups path from the one provided by the user via cgroupsPath.
// an example of the final path in rootless:
// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
fn get_cgroups_path(&self) -> Result<PathBuf> {
// the root slice is under 'machine.slice'.
let mut slice = Path::new("/machine.slice").to_path_buf();
// if the user provided a '.slice' (as in a branch of a tree)
// we need to "unpack it".
if !self.cgroups_path.parent.is_empty() {
slice = self.expand_slice(self.cgroups_path.parent.clone())?;
}
let unit_name = self.get_unit_name();
let cgroups_path = slice.join(unit_name);
// an example of the final path:
// "/sys/fs/cgroup/system.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
let full_path = self.root_path.join_absolute_path(&cgroups_path)?;
Ok(full_path)
}

/// create_unified_cgroup makes sure that *each level* in the downward path from
/// the root cgroup down to the cgroup path provided by the user exists and has
/// the available controllers written to its `cgroup.subtree_control`, and then
/// writes the container pid to the final cgroup's `cgroup.procs`.
///
/// Returns the full path of the container's cgroup.
fn create_unified_cgroup(&self, pid: Pid) -> Result<PathBuf> {
    let cgroups_path = self.get_cgroups_path()?;
    let controllers: Vec<String> = self
        .get_available_controllers(common::DEFAULT_CGROUP_ROOT)?
        .into_iter()
        .map(|c| format!("+{}", c.to_string()))
        .collect();

    // Write the controllers to the root_path.
    Self::write_controllers(&self.root_path, &controllers)?;

    // `get_cgroups_path` returns a path that already starts with `root_path`, so
    // only the part below the root is walked; walking the full path would append
    // the root's own components to `root_path` a second time.
    let relative_path = cgroups_path.strip_prefix(&self.root_path)?;
    let mut current_path = self.root_path.clone();
    let mut components = relative_path.components().peekable();
    // Verify that *each level* in the downward path from the root cgroup
    // down to the cgroup path provided by the user is a valid cgroup hierarchy
    // containing the attached controllers.
    while let Some(component) = components.next() {
        current_path = current_path.join(component);
        if !current_path.exists() {
            fs::create_dir(&current_path)?;
            // `set_mode` only changes the in-memory `Permissions` value; it must
            // be written back with `set_permissions` to affect the directory.
            let mut permissions = fs::metadata(&current_path)?.permissions();
            permissions.set_mode(0o755);
            fs::set_permissions(&current_path, permissions)?;
        }

        // last component cannot have subtree_control enabled due to internal process constraint
        // if this were set, writing to the cgroup.procs file will fail with Errno 16 (device or resource busy)
        if components.peek().is_some() {
            Self::write_controllers(&current_path, &controllers)?;
        }
    }

    write_cgroup_file(cgroups_path.join(CGROUP_PROCS), &pid.to_string())?;
    Ok(cgroups_path)
}

/// Reads the `cgroup.controllers` file found under `cgroups_path` (joined onto
/// the manager's root path) and returns the controllers listed there that this
/// manager handles: cpu, io, memory and pids. Any other entry is ignored.
///
/// Fails when the file does not exist or cannot be read.
fn get_available_controllers<P: AsRef<Path>>(
    &self,
    cgroups_path: P,
) -> Result<Vec<ControllerType>> {
    let controllers_path = self.root_path.join(cgroups_path).join(CGROUP_CONTROLLERS);
    if !controllers_path.exists() {
        bail!(
            "cannot get available controllers. {:?} does not exist",
            controllers_path
        );
    }

    let listed = fs::read_to_string(&controllers_path)?;
    let available = listed
        .split_whitespace()
        .filter_map(|entry| match entry {
            "cpu" => Some(ControllerType::Cpu),
            "io" => Some(ControllerType::Io),
            "memory" => Some(ControllerType::Memory),
            "pids" => Some(ControllerType::Pids),
            _ => None,
        })
        .collect();

    Ok(available)
}

/// Writes every entry of `controllers` to the `cgroup.subtree_control` file of
/// the cgroup directory at `path`, one write per entry, stopping at the first
/// failure.
fn write_controllers(path: &Path, controllers: &[String]) -> Result<()> {
    for controller in controllers {
        common::write_cgroup_file_str(path.join(CGROUP_SUBTREE_CONTROL), controller)?;
    }

    Ok(())
}
}

impl CgroupManager for SystemDCGroupManager {
    /// Creates the cgroup hierarchy for the container, attaches `pid` to it and
    /// applies `linux_resources` through every controller listed in
    /// `CONTROLLER_TYPES`.
    ///
    /// A pid of `-1` means "do not attach": the call returns immediately
    /// without creating or writing anything.
    fn apply(&self, linux_resources: &LinuxResources, pid: Pid) -> Result<()> {
        // Dont attach any pid to the cgroup if -1 is specified as a pid
        if pid.as_raw() == -1 {
            return Ok(());
        }
        let full_cgroup_path = self.create_unified_cgroup(pid)?;

        for controller in CONTROLLER_TYPES {
            match controller {
                ControllerType::Cpu => Cpu::apply(linux_resources, &full_cgroup_path)?,
                ControllerType::CpuSet => CpuSet::apply(linux_resources, &full_cgroup_path)?,
                ControllerType::HugeTlb => HugeTlb::apply(linux_resources, &full_cgroup_path)?,
                ControllerType::Io => Io::apply(linux_resources, &full_cgroup_path)?,
                ControllerType::Memory => Memory::apply(linux_resources, &full_cgroup_path)?,
                ControllerType::Pids => Pids::apply(linux_resources, &full_cgroup_path)?,
            }
        }

        Ok(())
    }

    /// Currently a no-op that always succeeds: nothing created by `apply` is
    /// removed here.
    fn remove(&self) -> Result<()> {
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a manager rooted at `/sys/fs/cgroup` for the given cgroups path.
    fn manager_for(cgroups_path: &str) -> Result<SystemDCGroupManager> {
        SystemDCGroupManager::new(
            PathBuf::from("/sys/fs/cgroup"),
            PathBuf::from(cgroups_path),
        )
    }

    /// A dashed slice name expands into one directory per level.
    #[test]
    fn expand_slice_works() -> Result<()> {
        let manager = manager_for("test-a-b.slice:docker:foo")?;
        let expanded = manager.expand_slice("test-a-b.slice".to_string())?;

        assert_eq!(
            expanded,
            PathBuf::from("/test.slice/test-a.slice/test-a-b.slice"),
        );

        Ok(())
    }

    /// A nested parent slice is expanded before the scope unit is appended.
    #[test]
    fn get_cgroups_path_works_with_a_complex_slice() -> Result<()> {
        let manager = manager_for("test-a-b.slice:docker:foo")?;
        let expected = PathBuf::from(
            "/sys/fs/cgroup/test.slice/test-a.slice/test-a-b.slice/docker-foo.scope",
        );

        assert_eq!(manager.get_cgroups_path()?, expected);

        Ok(())
    }

    /// A single-level parent slice is used as is.
    #[test]
    fn get_cgroups_path_works_with_a_simple_slice() -> Result<()> {
        let manager = manager_for("machine.slice:libpod:foo")?;
        let expected = PathBuf::from("/sys/fs/cgroup/machine.slice/libpod-foo.scope");

        assert_eq!(manager.get_cgroups_path()?, expected);

        Ok(())
    }

    /// An empty parent falls back to `machine.slice`.
    #[test]
    fn get_cgroups_path_works_with_scope() -> Result<()> {
        let manager = manager_for(":docker:foo")?;
        let expected = PathBuf::from("/sys/fs/cgroup/machine.slice/docker-foo.scope");

        assert_eq!(manager.get_cgroups_path()?, expected);

        Ok(())
    }
}
11 changes: 9 additions & 2 deletions src/create.rs
Original file line number Diff line number Diff line change
@@ -45,7 +45,12 @@ pub struct Create {
// associated with it like any other process.
impl Create {
/// Starts a new container process
pub fn exec(&self, root_path: PathBuf, command: impl Command) -> Result<()> {
pub fn exec(
&self,
root_path: PathBuf,
systemd_cgroup: bool,
command: impl Command,
) -> Result<()> {
// create a directory for the container to store state etc.
// if already present, return error
let bundle_canonicalized = fs::canonicalize(&self.bundle)
@@ -101,6 +106,7 @@ impl Create {
rootfs,
spec,
csocketfd,
systemd_cgroup,
container,
command,
)?;
@@ -120,6 +126,7 @@ fn run_container<P: AsRef<Path>>(
rootfs: PathBuf,
spec: oci_spec::Spec,
csocketfd: Option<FileDescriptor>,
systemd_cgroup: bool,
container: Container,
command: impl Command,
) -> Result<Process> {
@@ -132,7 +139,7 @@ fn run_container<P: AsRef<Path>>(
let namespaces: Namespaces = linux.namespaces.clone().into();

let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, container.id());
let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path)?;
let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path, systemd_cgroup)?;

// first fork, which creates process, which will later create actual container process
match fork::fork_first(
33 changes: 33 additions & 0 deletions src/dbus/client.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
use anyhow::Result;
use dbus::blocking::Connection;
use std::time::Duration;
use std::vec::Vec;

/// Client is a wrapper providing a higher level API and abstraction around dbus.
/// For more information see https://www.freedesktop.org/wiki/Software/systemd/dbus/
pub struct Client {
// Blocking dbus connection that every method call of this client goes through.
conn: Connection,
}

impl Client {
    /// Creates a client backed by a new connection to the D-Bus session bus.
    ///
    /// Fails when the connection cannot be established.
    pub fn new() -> Result<Self> {
        let conn = Connection::new_session()?;
        Ok(Client { conn })
    }

    /// start_unit starts a specific unit under systemd. See https://www.freedesktop.org/wiki/Software/systemd/dbus
    /// for more details.
    ///
    /// `_properties` is accepted but not sent yet. The call blocks for at most
    /// five seconds and fails when the method call returns an error.
    pub fn start_unit(&self, unit_name: &str, _properties: Vec<&str>) -> Result<()> {
        // The systemd manager object is exported by the `org.freedesktop.systemd1`
        // service at `/org/freedesktop/systemd1`;
        // `org.freedesktop.systemd1.Manager` is the name of the *interface* and is
        // only valid as the first argument of `method_call` below.
        let proxy = self.conn.with_proxy(
            "org.freedesktop.systemd1",
            "/org/freedesktop/systemd1",
            Duration::from_millis(5000),
        );
        // NOTE(review): the systemd D-Bus documentation describes
        // StartTransientUnit as taking (name, mode, properties, aux) and returning
        // the job's object path, whereas only (name, mode) is sent here and an
        // i32 is read back — confirm and extend before wiring this method up.
        let (_job_id,): (i32,) = proxy.method_call(
            "org.freedesktop.systemd1.Manager",
            "StartTransientUnit",
            (unit_name, "replace"),
        )?;
        Ok(())
    }
}
2 changes: 2 additions & 0 deletions src/dbus/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mod client;
pub use client::Client;
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -7,6 +7,7 @@ pub mod cgroups;
pub mod command;
pub mod container;
pub mod create;
pub mod dbus;
pub mod logger;
pub mod namespaces;
pub mod notify_socket;
13 changes: 11 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -31,6 +31,9 @@ struct Opts {
log: Option<PathBuf>,
#[clap(long)]
log_format: Option<String>,
/// Enable systemd cgroup manager, rather than use the cgroupfs directly.
#[clap(short, long)]
systemd_cgroup: bool,
/// command to actually manage container
#[clap(subcommand)]
subcmd: SubCommand,
@@ -45,6 +48,9 @@ pub struct Kill {
#[derive(Clap, Debug)]
pub struct Delete {
container_id: String,
// forces deletion of the container.
#[clap(short, long)]
force: bool,
}

#[derive(Clap, Debug)]
@@ -82,8 +88,10 @@ fn main() -> Result<()> {
let root_path = PathBuf::from(&opts.root);
fs::create_dir_all(&root_path)?;

let systemd_cgroup = opts.systemd_cgroup;

match opts.subcmd {
SubCommand::Create(create) => create.exec(root_path, LinuxCommand),
SubCommand::Create(create) => create.exec(root_path, systemd_cgroup, LinuxCommand),
SubCommand::Start(start) => start.exec(root_path),
SubCommand::Kill(kill) => {
// resolves relative paths, symbolic links etc. and get complete path
@@ -145,7 +153,8 @@ fn main() -> Result<()> {
// remove the cgroup created for the container
// check https://man7.org/linux/man-pages/man7/cgroups.7.html
// creating and removing cgroups section for more information on cgroups
let cmanager = cgroups::common::create_cgroup_manager(cgroups_path)?;
let cmanager =
cgroups::common::create_cgroup_manager(cgroups_path, systemd_cgroup)?;
cmanager.remove()?;
}
std::process::exit(0)

0 comments on commit 46edd7f

Please sign in to comment.