Merge pull request #10425 from anonymouse64/bugfix/quota-groups-pour-one-out-for-xenial

o/servicestate, snap/quota: eliminate workaround for buggy systemds, add spread test
This commit is contained in:
Michael Vogt
2021-06-21 09:37:25 +02:00
committed by GitHub
13 changed files with 120 additions and 98 deletions

View File

@@ -66,8 +66,8 @@ func MockSystemdVersion(vers int) (restore func()) {
func quotaGroupsAvailable(st *state.State) error {
// check if the systemd version is too old
if systemdVersion < 205 {
return fmt.Errorf("systemd version too old: snap quotas requires systemd 205 and newer (currently have %d)", systemdVersion)
if systemdVersion < 230 {
return fmt.Errorf("systemd version too old: snap quotas requires systemd 230 and newer (currently have %d)", systemdVersion)
}
tr := config.NewTransaction(st)

View File

@@ -208,14 +208,14 @@ func (s *quotaControlSuite) TestCreateQuotaSystemdTooOld(c *C) {
s.state.Lock()
defer s.state.Unlock()
r := s.mockSystemctlCalls(c, systemctlCallsVersion(204))
r := s.mockSystemctlCalls(c, systemctlCallsVersion(229))
defer r()
err := servicestate.CheckSystemdVersion()
c.Assert(err, IsNil)
err = servicestate.CreateQuota(s.state, "foo", "", nil, quantity.SizeGiB)
c.Assert(err, ErrorMatches, `systemd version too old: snap quotas requires systemd 205 and newer \(currently have 204\)`)
c.Assert(err, ErrorMatches, `systemd version too old: snap quotas requires systemd 230 and newer \(currently have 229\)`)
}
func (s *quotaControlSuite) TestRemoveQuotaPreseeding(c *C) {

View File

@@ -30,8 +30,9 @@ if [ "$1" = "purge" ]; then
units=$(systemctl list-unit-files --full | grep '^snap[-.]' | cut -f1 -d ' ' | grep -vF snap.mount.service || true)
mounts=$(echo "$units" | grep '^snap[-.].*\.mount$' || true)
services=$(echo "$units" | grep '^snap[-.].*\.service$' || true)
slices=$(echo "$units" | grep '^snap[-.].*\.slice$' || true)
for unit in $services $mounts; do
for unit in $services $mounts $slices; do
# ensure its really a snap mount unit or systemd unit
if ! grep -q 'What=/var/lib/snapd/snaps/' "/etc/systemd/system/$unit" && ! grep -q 'X-Snappy=yes' "/etc/systemd/system/$unit"; then
echo "Skipping non-snapd systemd unit $unit"

View File

@@ -30,8 +30,9 @@ if [ "$1" = "purge" ]; then
units=$(systemctl list-unit-files --full | grep '^snap[-.]' | cut -f1 -d ' ' | grep -vF snap.mount.service || true)
mounts=$(echo "$units" | grep '^snap[-.].*\.mount$' || true)
services=$(echo "$units" | grep '^snap[-.].*\.service$' || true)
slices=$(echo "$units" | grep '^snap[-.].*\.slice$' || true)
for unit in $services $mounts; do
for unit in $services $mounts $slices; do
# ensure its really a snap mount unit or systemd unit
if ! grep -q 'What=/var/lib/snapd/snaps/' "/etc/systemd/system/$unit" && ! grep -q 'X-Snappy=yes' "/etc/systemd/system/$unit"; then
echo "Skipping non-snapd systemd unit $unit"

View File

@@ -86,9 +86,6 @@ func NewGroup(name string, memLimit quantity.Size) (*Group, error) {
return grp, nil
}
// special value that systemd sometimes returns in buggy situations
const sixteenExb = quantity.Size(1<<64 - 1)
// CurrentMemoryUsage returns the current memory usage of the quota group. For
// quota groups which do not yet have a backing systemd slice on the system (
// i.e. quota groups without any snaps in them), the memory usage is reported as
@@ -106,33 +103,11 @@ func (grp *Group) CurrentMemoryUsage() (quantity.Size, error) {
return 0, nil
}
// we also check how many tasks are in the group as this could indicate a
// bug we need to work around on older systems - specifically if there are
// no tasks in the cgroup, we could erroneously get the current memory usage
// from systemd back as 16 exbibytes (which is actually the max size of a
// 64-bit unsigned integer), and in this case we report the usage as 0
// instead.
// note that we specifically _don't_ treat 0 tasks in the group as implying
// that the usage is 0, since on some newer systems, a cgroup with 0 tasks
// but still active will have 4K memory usage always, so we only correct
// the memory usage when it is 16 exb and when there are no tasks in the
// group
numTasks, err := sysd.CurrentTasksCount(grp.SliceFileName())
if err != nil {
return 0, err
}
mem, err := sysd.CurrentMemoryUsage(grp.SliceFileName())
if err != nil {
return 0, err
}
if mem == sixteenExb && numTasks == 0 {
// this is the bug situation, return the memory as 0
mem = 0
}
return mem, nil
}

View File

@@ -671,57 +671,33 @@ func (ts *quotaTestSuite) TestCurrentMemoryUsage(c *C) {
c.Assert(args, DeepEquals, []string{"is-active", "snap.group.slice"})
return []byte("active"), nil
case 3:
// since it is active, we will query the tasks count too
c.Assert(args, DeepEquals, []string{"show", "--property", "TasksCurrent", "snap.group.slice"})
return []byte("TasksCurrent=0"), nil
case 4:
// and the memory count can be non-zero like
c.Assert(args, DeepEquals, []string{"show", "--property", "MemoryCurrent", "snap.group.slice"})
return []byte("MemoryCurrent=4096"), nil
// normal case, has tasks + memory usage
case 5:
// now pretend it is active again - third time is the charm
case 4:
// now pretend it is active
c.Assert(args, DeepEquals, []string{"is-active", "snap.group.slice"})
return []byte("active"), nil
case 6:
// since it is active, we will query the tasks count too
c.Assert(args, DeepEquals, []string{"show", "--property", "TasksCurrent", "snap.group.slice"})
return []byte("TasksCurrent=1"), nil
case 7:
// since it is active, we will query the current memory usage
case 5:
// and the memory count can be zero too
c.Assert(args, DeepEquals, []string{"show", "--property", "MemoryCurrent", "snap.group.slice"})
return []byte("MemoryCurrent=1024"), nil
return []byte("MemoryCurrent=0"), nil
// bug case where 16 exb is erroneous
case 8:
// bug case where 16 exb is erroneous - this is left in for posterity,
// but we don't handle this differently, previously we had a workaround
// for this sort of case, but it ended up not being tenable but still
// test that a huge value just gets reported as-is
case 6:
// the cgroup is active, has no tasks and has 16 exb usage
c.Assert(args, DeepEquals, []string{"is-active", "snap.group.slice"})
return []byte("active"), nil
case 9:
// since it is active, we will query the tasks count too
c.Assert(args, DeepEquals, []string{"show", "--property", "TasksCurrent", "snap.group.slice"})
return []byte("TasksCurrent=0"), nil
case 10:
case 7:
// since it is active, we will query the current memory usage,
// this time return an obviously wrong number
c.Assert(args, DeepEquals, []string{"show", "--property", "MemoryCurrent", "snap.group.slice"})
return []byte("MemoryCurrent=18446744073709551615"), nil
// not bug case where 16 exb is real usage
case 11:
// not the bug case, the cgroup is active, has tasks and has 16 exb
// usage but we treat it as normal and report 16 exb
c.Assert(args, DeepEquals, []string{"is-active", "snap.group.slice"})
return []byte("active"), nil
case 12:
// since it is active, we will query the tasks count too
c.Assert(args, DeepEquals, []string{"show", "--property", "TasksCurrent", "snap.group.slice"})
return []byte("TasksCurrent=1"), nil
case 13:
// since it is active, we will query the current memory usage
c.Assert(args, DeepEquals, []string{"show", "--property", "MemoryCurrent", "snap.group.slice"})
return []byte("MemoryCurrent=18446744073709551615"), nil
default:
c.Errorf("too many systemctl calls (%d) (current call is %+v)", systemctlCalls, args)
return []byte("broken test"), fmt.Errorf("broken test")
@@ -737,24 +713,17 @@ func (ts *quotaTestSuite) TestCurrentMemoryUsage(c *C) {
c.Assert(err, IsNil)
c.Assert(currentMem, Equals, quantity.Size(0))
// now with systemctl mocked as active but no tasks in the group, it still
// can have memory usage
// now with the slice mocked as active it has real usage
currentMem, err = grp1.CurrentMemoryUsage()
c.Assert(err, IsNil)
c.Assert(currentMem, Equals, 4*quantity.SizeKiB)
// with tasks in the group and a normal memory value, we just report back
// the memory value
currentMem, err = grp1.CurrentMemoryUsage()
c.Assert(err, IsNil)
c.Assert(currentMem, Equals, quantity.SizeKiB)
// with no tasks and the special 16 exb value reported by systemd we get 0
// but it can also have 0 usage
currentMem, err = grp1.CurrentMemoryUsage()
c.Assert(err, IsNil)
c.Assert(currentMem, Equals, quantity.Size(0))
// but the special value with tasks in the group reports the huge value
// and it can also be an incredibly huge value too
currentMem, err = grp1.CurrentMemoryUsage()
c.Assert(err, IsNil)
const sixteenExb = quantity.Size(1<<64 - 1)

View File

@@ -4,7 +4,7 @@ show_help() {
echo "usage: is-core, is-classic"
echo " is-core16, is-core18, is-core20"
echo " is-trusty, is-xenial, is-bionic, is-focal, is-groovy, is-hirsute"
echo " is-ubuntu, is-debian, is-fedora, is-amazon-linux, is-arch-linux, is-centos, is-opensuse"
echo " is-ubuntu, is-debian, is-fedora, is-amazon-linux, is-arch-linux, is-centos, is-centos-7, is-opensuse"
echo " is-opensuse-tumbleweed, is-debian-sid, is-debian-10"
echo " is-pc-amd64, is-pc-i386, is-arm, is-armhf, is-arm64"
echo ""
@@ -83,6 +83,10 @@ is_centos() {
grep -qFx 'ID="centos"' /etc/os-release
}
is_centos_7() {
grep -qFx 'ID="centos"' /etc/os-release && grep -qFx 'VERSION_ID="7"' /etc/os-release
}
is_arch_linux() {
grep -qFx 'ID=arch' /etc/os-release
}

View File

@@ -129,4 +129,7 @@ execute: |
debian-sid-*)
os.query is-debian-sid
;;
centos-7-*)
os.query is-centos-7
;;
esac

View File

@@ -33,7 +33,7 @@ prepare: |
snap install --edge test-snapd-dbus-provider
snap list | MATCH test-snapd-dbus-provider
if ! os.query is-trusty; then
if ! os.query is-trusty && ! os.query is-amazon-linux && ! os.query is-centos-7 && ! os.query is-xenial; then
# create a quota with a service in it
snap set-quota group1 test-snapd-service --memory=100MB
fi
@@ -51,6 +51,10 @@ prepare: |
test "$(find /etc/systemd/user -name 'snap.*.socket' | wc -l)" -gt 0
fi
if ! os.query is-trusty && ! os.query is-amazon-linux && ! os.query is-centos-7 && ! os.query is-xenial; then
test "$(find /etc/systemd -name 'snap.*.slice' | wc -l)" -gt 0
fi
restore: |
#shellcheck source=tests/lib/pkgdb.sh
. "$TESTSLIB/pkgdb.sh"
@@ -119,3 +123,4 @@ execute: |
test "$(find /etc/systemd/user -name 'snap.*.timer' | wc -l)" -eq 0
test "$(find /etc/systemd/user -name 'snap.*.socket' | wc -l)" -eq 0
fi
test "$(find /etc/systemd -name 'snap.*.slice' | wc -l)" -eq 0

View File

@@ -0,0 +1,59 @@
summary: Test to ensure that quota groups and systemd accounting is effective.
details: |
Some older systemd's such as that of xenial, Amazon Linux 2, and Centos 7
suffer from an unfortunate bug wherein the accounting for memory and tasks
becomes broken and returns the maximum value for a 64-bit integer instead of
the real value. This test checks that supported systems work and does not
trigger the accounting bug. For systems that have the bug, we check that they
are not supported in the tests/main/snap-quota test.
# these systems do not support quota groups due to their old systemd versions,
# we do have a check to do for these systems, but that is just to ensure that
# we don't allow using quota groups, and that check is done in the snap-quota
# spread instead
systems:
- -ubuntu-14.04-*
- -centos-7-*
- -amazon-linux-2-*
- -ubuntu-16.04-*
- -ubuntu-core-16-*
prepare: |
snap install hello-world go-example-webserver remarshal jq
snap set system experimental.quota-groups=true
tests.cleanup defer snap unset system experimental.quota-groups
execute: |
# the bug mainly happens when we create a quota group with nothing in it,
# then systemd reports using a huge amount of memory or tasks
# create a root group that will be the top of the tree for all sub-groups
snap set-quota --memory=400MB top
# create a sub-group which will actually have a service in it and thus usage
snap set-quota --memory=100MB --parent=top sub go-example-webserver
# create another sub-group which will serve as the poisoned branch on buggy
# systems
snap set-quota --memory=5000B --parent=top sub2
# now create a quota group under sub2 with no services in it, but still has a
# snap in it so that the systemd slice underlying the quota group is generated
# and exists
snap set-quota --memory=4900B --parent=sub2 sub21 hello-world
# check that usage is still sensible for the groups that have real usage -
# here 18.4EB is how we format the maximum size of a 64-bit unsigned integer
# that systemd returns
# TODO: change this to use the no unit flag when that is a thing so we can
# compare to actual value
snap quota sub | yaml2json | jq -r '.current.memory' | NOMATCH "18.4EB"
snap quota top | yaml2json | jq -r '.current.memory' | NOMATCH "18.4EB"
# now trigger the bug by removing the sub21 group
snap remove-quota sub21
# usage should still be sensible
snap quota sub | yaml2json | jq -r '.current.memory' | NOMATCH "18.4EB"
snap quota top | yaml2json | jq -r '.current.memory' | NOMATCH "18.4EB"

View File

@@ -22,7 +22,7 @@ prepare: |
test-snapd-user-timer-service test-snapd-tools \
test-snapd-control-consumer test-snapd-auto-aliases \
test-snapd-kvm ; do
if echo "$name" | grep -q user && echo "$SPREAD_SYSTEM" | grep -qF ubuntu-14.04; then
if echo "$name" | grep -q user && os.query is-trusty; then
# None of the "user" snaps work on 14.04
continue
fi
@@ -36,10 +36,12 @@ prepare: |
snap install --edge test-snapd-dbus-provider
snap list | MATCH test-snapd-dbus-provider
if echo "$SPREAD_SYSTEM" | grep -vqF ubuntu-14.04; then
if ! os.query is-trusty; then
snap install --edge test-snapd-dbus-service
snap list | MATCH test-snapd-dbus-service
fi
if ! os.query is-trusty && ! os.query is-amazon-linux && ! os.query is-centos-7 && ! os.query is-xenial; then
# create a quota with a service in it
snap set-quota group1 test-snapd-service --memory=100MB
fi
@@ -61,12 +63,14 @@ prepare: |
test "$(find /etc/systemd/system -name 'snap.*.service' | wc -l)" -gt 0
test "$(find /etc/systemd/system -name 'snap.*.timer' | wc -l)" -gt 0
test "$(find /etc/systemd/system -name 'snap.*.socket' | wc -l)" -gt 0
if echo "$SPREAD_SYSTEM" | grep -vqF ubuntu-14.04; then
if ! os.query is-trusty; then
test "$(find /etc/systemd/user -name 'snap.*.service' | wc -l)" -gt 0
test "$(find /etc/systemd/user -name 'snap.*.timer' | wc -l)" -gt 0
test "$(find /etc/systemd/user -name 'snap.*.socket' | wc -l)" -gt 0
test "$(find /var/lib/snapd/dbus-1/services -name '*.service' | wc -l)" -gt 0
test "$(find /var/lib/snapd/dbus-1/system-services -name '*.service' | wc -l)" -gt 0
fi
if ! os.query is-trusty && ! os.query is-amazon-linux && ! os.query is-centos-7 && ! os.query is-xenial; then
test "$(find /etc/systemd/system -name 'snap.*.slice' | wc -l)" -gt 0
fi
@@ -118,7 +122,7 @@ execute: |
test -z "$(find /etc/systemd/system/multi-user.target.wants/ -name 'snap.test-snapd-service.*')"
test -z "$(find /etc/systemd/system/sockets.target.wants/ -name 'snap.*')"
test -z "$(find /etc/systemd/system/timers.target.wants/ -name 'snap.*')"
if echo "$SPREAD_SYSTEM" | grep -vqF ubuntu-14.04; then
if ! os.query is-trusty; then
test -z "$(find /etc/systemd/user/default.target.wants/ -name 'snap.*')"
test -z "$(find /etc/systemd/user/sockets.target.wants/ -name 'snap.*')"
test -z "$(find /etc/systemd/user/timers.target.wants/ -name 'snap.*')"
@@ -130,7 +134,7 @@ execute: |
test "$(find /etc/systemd/system -name 'snap.*.service' -a ! -name "snap.mount.service" | wc -l)" -eq 0
test "$(find /etc/systemd/system -name 'snap.*.timer' | wc -l)" -eq 0
test "$(find /etc/systemd/system -name 'snap.*.socket' | wc -l)" -eq 0
if echo "$SPREAD_SYSTEM" | grep -vqF ubuntu-14.04; then
if ! os.query is-trusty; then
test "$(find /etc/systemd/user -name 'snap.*.service' | wc -l)" -eq 0
test "$(find /etc/systemd/user -name 'snap.*.timer' | wc -l)" -eq 0
test "$(find /etc/systemd/user -name 'snap.*.socket' | wc -l)" -eq 0

View File

@@ -4,23 +4,23 @@ details: |
Functional test for snap quota group commands ensuring that they are
effective in practice.
# these systems do not support quota groups due to their old systemd versions,
# we do have a check to do for these systems, but that is just to ensure that
# we don't allow using quota groups, and that check is done in the snap-quota
# spread instead
systems:
- -ubuntu-14.04-*
- -centos-7-*
- -amazon-linux-2-*
- -ubuntu-16.04-*
- -ubuntu-core-16-*
prepare: |
if os.query is-trusty; then
exit 0
fi
snap install go-example-webserver jq remarshal http hello-world
snap set system experimental.quota-groups=true
tests.cleanup defer snap unset system experimental.quota-groups
execute: |
if os.query is-trusty; then
# just check that we can't do anything with quota groups on trusty, systemd
# there is 204, but we need at least 205 for slice units
snap set-quota foobar --memory=1MB 2>&1 | MATCH "systemd version too old: snap quotas requires systemd 205 and newer \(currently have 204\)"
exit 0
fi
echo "Create a group with a snap in it"
snap set-quota group-one --memory=100MB go-example-webserver

View File

@@ -9,11 +9,12 @@ prepare: |
tests.cleanup defer snap unset system experimental.quota-groups
execute: |
if os.query is-trusty; then
# just check that we can't do anything with quota groups on trusty, systemd
# there is 204, but we need at least 205 for slice units
if os.query is-trusty || os.query is-amazon-linux || os.query is-centos-7 || os.query is-xenial || os.query is-core16; then
# just check that we can't do anything with quota groups on systems with
# old systemd versions, we need at least 230 to avoid buggy slice usage
# reporting
snap set-quota foobar --memory=1MB 2>&1 | MATCH "systemd version too old: snap quotas requires systemd 205 and newer \(currently have 204\)"
snap set-quota foobar --memory=1MB 2>&1 | MATCH "systemd version too old: snap quotas requires systemd 230 and newer \(currently have 2[0-2][0-9]\)"
exit 0
fi