mirror of
https://github.com/netbirdio/gvisor.git
synced 2026-05-22 17:12:49 -07:00
Add documentation that clarifies systemd usage in runsc.
This change also transforms empty paths to match the spec described in the new docs and logs a warning if runsc detects a systemd-like path without `--systemd-cgroup`. Fixes #10264 PiperOrigin-RevId: 638743361
This commit is contained in:
committed by
gVisor bot
parent
a4349dab65
commit
8c3abba800
@@ -0,0 +1,92 @@
|
||||
# Systemd cgroup driver
|
||||
|
||||
By default, runsc creates cgroups and sets cgroup limits on its own (this mode
|
||||
is known as the fs cgroup driver). When the `--systemd-cgroup` global option is given
|
||||
(as in e.g. `runsc --systemd-cgroup run ...`), runsc switches to systemd cgroup
|
||||
driver. This document describes its features and peculiarities. Runsc requires
|
||||
the host systemd version to be at least 244 and for unified cgroups (aka
|
||||
cgroupv2) to be enabled.
|
||||
|
||||
### systemd unit name and placement
|
||||
|
||||
When creating a container, runsc requests systemd (over dbus) to create a
|
||||
transient unit for the container, and place it into a specified slice.
|
||||
|
||||
The names of the unit and the containing slice are derived from the container
|
||||
runtime spec in the following way:
|
||||
|
||||
1. If `Linux.CgroupsPath` is set, it is expected to be in the form
|
||||
`[slice]:[prefix]:[name]`.
|
||||
|
||||
Here `slice` is a systemd slice under which the container is placed. If
|
||||
empty, it defaults to `system.slice`, except when cgroup v2 is used and
|
||||
a rootless container is created, in which case it defaults to `user.slice`.
|
||||
|
||||
Note that `slice` can contain dashes to denote a sub-slice (e.g.
|
||||
`user-1000.slice` is a correct notation, meaning a subslice of
|
||||
`user.slice`), but it must not contain slashes (e.g.
|
||||
`user.slice/user-1000.slice` is invalid).
|
||||
|
||||
A `slice` of `-` represents a root slice.
|
||||
|
||||
Next, `prefix` and `name` are used to compose the unit name, which is
|
||||
`<prefix>-<name>.scope`, unless `name` has `.slice` suffix, in which case
|
||||
`prefix` is ignored and the `name` is used as is.
|
||||
|
||||
2. If `Linux.CgroupsPath` is not set or empty, it works the same way as if it
|
||||
were set to `:runsc:<container-id>`. See the description above to see
|
||||
what it transforms to.
|
||||
|
||||
As described above, a unit will be created as a systemd scope. For a scope,
|
||||
runsc specifies its parent slice via a *Slice=* systemd property, and also sets
|
||||
*Delegate=true*.
|
||||
|
||||
### Resource limits
|
||||
|
||||
runsc always enables accounting for all controllers, regardless of any limits
|
||||
being set. This means it unconditionally sets the following properties for the
|
||||
systemd unit being created:
|
||||
|
||||
* *CPUAccounting=true*
|
||||
* *IOAccounting=true*
|
||||
* *MemoryAccounting=true*
|
||||
* *TasksAccounting=true*
|
||||
|
||||
The resource limits of the systemd unit are set by runsc by translating the
|
||||
runtime spec resources to systemd unit properties.
|
||||
|
||||
Such translation is by no means complete, as there are some cgroup properties
|
||||
that cannot be set via systemd. Therefore, the runsc systemd cgroup driver is
|
||||
backed by fs driver (in other words, cgroup limits are first set via systemd
|
||||
unit properties, and then by writing to cgroupfs files).
|
||||
|
||||
The set of runtime spec resources which is translated by runsc to systemd unit
|
||||
properties depends on kernel cgroup version being used (v1 or v2), and on the
|
||||
systemd version being run. If an older systemd version (which does not support
|
||||
some resources) is used, runsc does not set those resources.
|
||||
|
||||
The following table summarizes which properties are translated.
|
||||
|
||||
runtime spec resource | systemd property name | min systemd version
|
||||
----------------------- | --------------------------- | -------------------
|
||||
memory.limit | MemoryMax |
|
||||
memory.reservation | MemoryLow |
|
||||
memory.swap | MemorySwapMax |
|
||||
cpu.shares | CPUWeight |
|
||||
pids.limit | TasksMax |
|
||||
cpu.cpus | AllowedCPUs |
|
||||
cpu.mems | AllowedMemoryNodes |
|
||||
unified.cpu.max | CPUQuota, CPUQuotaPeriodSec |
|
||||
unified.cpu.weight | CPUWeight |
|
||||
unified.cpu.idle | CPUWeight | v252
|
||||
unified.cpuset.cpus | AllowedCPUs |
|
||||
unified.cpuset.mems | AllowedMemoryNodes |
|
||||
unified.memory.high | MemoryHigh |
|
||||
unified.memory.low | MemoryLow |
|
||||
unified.memory.min | MemoryMin |
|
||||
unified.memory.max | MemoryMax |
|
||||
unified.memory.swap.max | MemorySwapMax |
|
||||
unified.pids.max | TasksMax |
|
||||
|
||||
For documentation on systemd unit resource properties, see
|
||||
`systemd.resource-control(5)` man page.
|
||||
+30
-12
@@ -342,18 +342,6 @@ type cgroupV1 struct {
|
||||
Own map[string]bool `json:"own"`
|
||||
}
|
||||
|
||||
// NewFromSpec creates a new Cgroup instance if the spec includes a cgroup path.
|
||||
// Returns nil otherwise. Cgroup paths are loaded based on the current process.
|
||||
// If useSystemd is true, the Cgroup will be created and managed with
|
||||
// systemd. This requires systemd (>=v244) to be running on the host and the
|
||||
// cgroup path to be in the form `slice:prefix:name`.
|
||||
func NewFromSpec(spec *specs.Spec, useSystemd bool) (Cgroup, error) {
|
||||
if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
|
||||
return nil, nil
|
||||
}
|
||||
return NewFromPath(spec.Linux.CgroupsPath, useSystemd)
|
||||
}
|
||||
|
||||
// NewFromPath creates a new Cgroup instance from the specified relative path.
|
||||
// Cgroup paths are loaded based on the current process.
|
||||
// If useSystemd is true, the Cgroup will be created and managed with
|
||||
@@ -371,6 +359,36 @@ func NewFromPid(pid int, useSystemd bool) (Cgroup, error) {
|
||||
return new(strconv.Itoa(pid), "", useSystemd)
|
||||
}
|
||||
|
||||
// LikelySystemdPath reports whether path has the shape of a systemd cgroup
// path, i.e. exactly three colon-separated components (two ":" separators).
// The components themselves are not inspected, so this is only a heuristic
// meant for deciding whether to log a warning.
func LikelySystemdPath(path string) bool {
	// Three components means exactly two separators.
	return strings.Count(path, ":") == 2
}
|
||||
|
||||
// TransformSystemdPath normalizes a systemd cgroup path into the form
// `slice:prefix:name`.
//
// An empty path is treated as `:runsc:<cid>`. An empty slice component is
// replaced with `user.slice` when rootless is true and with `system.slice`
// otherwise; the prefix and name components are passed through unchanged.
// An error is returned if path does not consist of exactly three
// colon-separated components.
func TransformSystemdPath(path, cid string, rootless bool) (string, error) {
	if path == "" {
		path = ":runsc:" + cid
	}

	fields := strings.SplitN(path, ":", 4)
	if len(fields) != 3 {
		return "", fmt.Errorf("invalid systemd path: %q", path)
	}

	// Fill in the default slice when none was given.
	if fields[0] == "" {
		fields[0] = "system.slice"
		if rootless {
			fields[0] = "user.slice"
		}
	}
	return strings.Join(fields, ":"), nil
}
|
||||
|
||||
func new(pid, cgroupsPath string, useSystemd bool) (Cgroup, error) {
|
||||
var (
|
||||
parents map[string]string
|
||||
|
||||
@@ -1615,6 +1615,23 @@ func (c *Container) populateStats(event *boot.EventOut) {
|
||||
return
|
||||
}
|
||||
|
||||
func (c *Container) createParentCgroup(parentPath string, conf *config.Config) (cgroup.Cgroup, error) {
|
||||
var err error
|
||||
if conf.SystemdCgroup {
|
||||
parentPath, err = cgroup.TransformSystemdPath(parentPath, c.ID, conf.Rootless)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else if cgroup.LikelySystemdPath(parentPath) {
|
||||
log.Warningf("cgroup parent path is set to %q which looks like a systemd path. Please set --systemd-cgroup=true if you intend to use systemd to manage container cgroups", parentPath)
|
||||
}
|
||||
parentCgroup, err := cgroup.NewFromPath(parentPath, conf.SystemdCgroup)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parentCgroup, nil
|
||||
}
|
||||
|
||||
// setupCgroupForRoot configures and returns cgroup for the sandbox and the
|
||||
// root container. If `cgroupParentAnnotation` is set, use that path as the
|
||||
// sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup.
|
||||
@@ -1622,13 +1639,16 @@ func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (c
|
||||
var parentCgroup cgroup.Cgroup
|
||||
if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok {
|
||||
var err error
|
||||
parentCgroup, err = cgroup.NewFromPath(parentPath, conf.SystemdCgroup)
|
||||
parentCgroup, err = c.createParentCgroup(parentPath, conf)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
} else {
|
||||
var err error
|
||||
parentCgroup, err = cgroup.NewFromSpec(spec, conf.SystemdCgroup)
|
||||
if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
|
||||
return nil, nil, nil
|
||||
}
|
||||
parentCgroup, err = c.createParentCgroup(spec.Linux.CgroupsPath, conf)
|
||||
if parentCgroup == nil || err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
@@ -1659,7 +1679,10 @@ func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.
|
||||
}
|
||||
}
|
||||
|
||||
cg, err := cgroup.NewFromSpec(spec, conf.SystemdCgroup)
|
||||
if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
|
||||
return nil, nil
|
||||
}
|
||||
cg, err := c.createParentCgroup(spec.Linux.CgroupsPath, conf)
|
||||
if cg == nil || err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user