Add documentation that clarifies systemd usage in runsc.

This change also transforms empty paths to match the spec described in the new
docs and logs a warning if runsc detects a systemd-like path without
`--systemd-cgroup`.

Fixes #10264

PiperOrigin-RevId: 638743361
This commit is contained in:
Lucas Manning
2024-05-30 12:23:39 -07:00
committed by gVisor bot
parent a4349dab65
commit 8c3abba800
3 changed files with 148 additions and 15 deletions
+92
View File
@@ -0,0 +1,92 @@
# Systemd cgroup driver
By default, runsc creates cgroups and sets cgroup limits on its own (this mode
is known as the fs cgroup driver). When the `--systemd-cgroup` global option is
given (e.g. `runsc --systemd-cgroup run ...`), runsc switches to the systemd
cgroup driver. This document describes its features and peculiarities. Runsc
requires the host systemd version to be at least 244 and unified cgroups (aka
cgroupv2) to be enabled.
### systemd unit name and placement
When creating a container, runsc requests systemd (over dbus) to create a
transient unit for the container, and place it into a specified slice.
The name of the unit and the containing slice are derived from the container
runtime spec in the following way:
1. If `Linux.CgroupsPath` is set, it is expected to be in the form
`[slice]:[prefix]:[name]`.
Here `slice` is a systemd slice under which the container is placed. If
empty, it defaults to `system.slice`, except when cgroup v2 is used and a
rootless container is created, in which case it defaults to `user.slice`.
Note that `slice` can contain dashes to denote a sub-slice (e.g.
`user-1000.slice` is a correct notation, meaning a subslice of
`user.slice`), but it must not contain slashes (e.g.
`user.slice/user-1000.slice` is invalid).
A `slice` of `-` represents a root slice.
Next, `prefix` and `name` are used to compose the unit name, which is
`<prefix>-<name>.scope`, unless `name` has `.slice` suffix, in which case
`prefix` is ignored and the `name` is used as is.
2. If `Linux.CgroupsPath` is not set or empty, it works the same way as if it
were set to `:runsc:<container-id>`. See the description above to see
what it transforms to.
As described above, a unit will be created as a systemd scope. For a scope,
runsc specifies its parent slice via a *Slice=* systemd property, and also sets
*Delegate=true*.
### Resource limits
runsc always enables accounting for all controllers, regardless of any limits
being set. This means it unconditionally sets the following properties for the
systemd unit being created:
* *CPUAccounting=true*
* *IOAccounting=true*
* *MemoryAccounting=true*
* *TasksAccounting=true*
The resource limits of the systemd unit are set by runsc by translating the
runtime spec resources to systemd unit properties.
Such translation is by no means complete, as there are some cgroup properties
that cannot be set via systemd. Therefore, the runsc systemd cgroup driver is
backed by the fs driver (in other words, cgroup limits are first set via
systemd unit properties, and then by writing to cgroupfs files).
The set of runtime spec resources which is translated by runsc to systemd unit
properties depends on kernel cgroup version being used (v1 or v2), and on the
systemd version being run. If an older systemd version (which does not support
some resources) is used, runsc does not set those resources.
The following table summarizes which properties are translated.
runtime spec resource | systemd property name | min systemd version
----------------------- | --------------------------- | -------------------
memory.limit | MemoryMax |
memory.reservation | MemoryLow |
memory.swap | MemorySwapMax |
cpu.shares | CPUWeight |
pids.limit | TasksMax |
cpu.cpus | AllowedCPUs |
cpu.mems | AllowedMemoryNodes |
unified.cpu.max | CPUQuota, CPUQuotaPeriodSec |
unified.cpu.weight | CPUWeight |
unified.cpu.idle | CPUWeight | v252
unified.cpuset.cpus | AllowedCPUs |
unified.cpuset.mems | AllowedMemoryNodes |
unified.memory.high | MemoryHigh |
unified.memory.low | MemoryLow |
unified.memory.min | MemoryMin |
unified.memory.max | MemoryMax |
unified.memory.swap.max | MemorySwapMax |
unified.pids.max | TasksMax |
For documentation on systemd unit resource properties, see
`systemd.resource-control(5)` man page.
+30 -12
View File
@@ -342,18 +342,6 @@ type cgroupV1 struct {
Own map[string]bool `json:"own"`
}
// NewFromSpec builds a Cgroup from the cgroup path found in spec. When the
// spec has no Linux section, or its CgroupsPath is empty, it returns
// (nil, nil). Otherwise the path is handed to NewFromPath together with
// useSystemd.
//
// If useSystemd is true, the Cgroup will be created and managed with systemd.
// This requires systemd (>=v244) to be running on the host and the cgroup
// path to be in the form `slice:prefix:name`.
func NewFromSpec(spec *specs.Spec, useSystemd bool) (Cgroup, error) {
	if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
		return NewFromPath(spec.Linux.CgroupsPath, useSystemd)
	}
	// No cgroup path was requested by the spec.
	return nil, nil
}
// NewFromPath creates a new Cgroup instance from the specified relative path.
// Cgroup paths are loaded based on the current process.
// If useSystemd is true, the Cgroup will be created and managed with
@@ -371,6 +359,36 @@ func NewFromPid(pid int, useSystemd bool) (Cgroup, error) {
return new(strconv.Itoa(pid), "", useSystemd)
}
// LikelySystemdPath reports whether path has the shape of a systemd cgroup
// path, i.e. exactly three colon-separated components. This is a heuristic
// rather than a validity check; it is only meant as a proxy for deciding
// whether to log a warning.
func LikelySystemdPath(path string) bool {
	// Three components means exactly two separators.
	return strings.Count(path, ":") == 2
}
// TransformSystemdPath normalizes a systemd cgroup path into the fully
// specified `slice:prefix:name` form.
//
// An empty path is treated as `:runsc:<cid>`. An empty slice component is
// replaced with `user.slice` when rootless is true and with `system.slice`
// otherwise. An error is returned if the path does not consist of exactly
// three colon-separated components.
func TransformSystemdPath(path, cid string, rootless bool) (string, error) {
	if path == "" {
		path = ":runsc:" + cid
	}

	fields := strings.Split(path, ":")
	if len(fields) != 3 {
		return "", fmt.Errorf("invalid systemd path: %q", path)
	}

	// Only the slice (first component) has a default; prefix and name are
	// passed through untouched.
	if fields[0] == "" {
		fields[0] = "system.slice"
		if rootless {
			fields[0] = "user.slice"
		}
	}
	return strings.Join(fields, ":"), nil
}
func new(pid, cgroupsPath string, useSystemd bool) (Cgroup, error) {
var (
parents map[string]string
+26 -3
View File
@@ -1615,6 +1615,23 @@ func (c *Container) populateStats(event *boot.EventOut) {
return
}
// createParentCgroup builds a cgroup.Cgroup for parentPath.
//
// When conf.SystemdCgroup is set, the path is first normalized with
// cgroup.TransformSystemdPath using the container ID and conf.Rootless, and
// any parsing error is returned. When it is not set but the path looks like a
// systemd path, a warning is logged and the path is used unchanged.
func (c *Container) createParentCgroup(parentPath string, conf *config.Config) (cgroup.Cgroup, error) {
	switch {
	case conf.SystemdCgroup:
		normalized, err := cgroup.TransformSystemdPath(parentPath, c.ID, conf.Rootless)
		if err != nil {
			return nil, err
		}
		parentPath = normalized
	case cgroup.LikelySystemdPath(parentPath):
		log.Warningf("cgroup parent path is set to %q which looks like a systemd path. Please set --systemd-cgroup=true if you intend to use systemd to manage container cgroups", parentPath)
	}

	cg, err := cgroup.NewFromPath(parentPath, conf.SystemdCgroup)
	if err != nil {
		return nil, err
	}
	return cg, nil
}
// setupCgroupForRoot configures and returns cgroup for the sandbox and the
// root container. If `cgroupParentAnnotation` is set, use that path as the
// sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup.
@@ -1622,13 +1639,16 @@ func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (c
var parentCgroup cgroup.Cgroup
if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok {
var err error
parentCgroup, err = cgroup.NewFromPath(parentPath, conf.SystemdCgroup)
parentCgroup, err = c.createParentCgroup(parentPath, conf)
if err != nil {
return nil, nil, err
}
} else {
var err error
parentCgroup, err = cgroup.NewFromSpec(spec, conf.SystemdCgroup)
if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
return nil, nil, nil
}
parentCgroup, err = c.createParentCgroup(spec.Linux.CgroupsPath, conf)
if parentCgroup == nil || err != nil {
return nil, nil, err
}
@@ -1659,7 +1679,10 @@ func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.
}
}
cg, err := cgroup.NewFromSpec(spec, conf.SystemdCgroup)
if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
return nil, nil
}
cg, err := c.createParentCgroup(spec.Linux.CgroupsPath, conf)
if cg == nil || err != nil {
return nil, err
}