Files
snapd/overlord/stateengine.go
Maciej Borzecki 9e2e5e206a o/snapstate: handling of unexpected runtime restart (#14002)
* o/snapstate: when invoked for rollback, check for any changes related to snapd

When the snapd.service unit fails to be activated, an OnFailure handler will
execute snap-failure which in turn starts the snapd process from the previous
revision of the snap with SNAPD_REVERT_TO_REV set in its environment. It may
happen that the snapd unit fails at runtime without an associated change to the
snapd snap, however snap-failure is not able to detect such case, and so the
snapd process started in its context would continue to run. Avoid this by
extending the logic within snapd to check if it has been started by snap-failure
with the intention of handling a rollback, and so whether there is a change
related to snapd snap in the state. When the conditions have not been met, snapd
exits and snap-failure continues to restart the snapd service.

Related issues: SNAPDENG-21605

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* tests/core/snapd-failover: account for improved snap-failure behavior

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* overlord: add Is() to startup error

Add Is() support to startupError, so that error can be introspected at runtime.

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* o/snapstate: return an explicit error from StartUp() when no recovery was detected

When snapd is invoked in a context of recovery but the state does not reflect
this, return an explicit error indicating that further startup should be carried out.

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* daemon: return an explicit error when startup in recovery context was aborted

Return an explicit error when startup in failure recovery context was aborted
due to lack of operations in the state which may have triggered it.

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* cmd/snapd: handle unnecessary failure recovery

Gracefully handle unnecessary failure recovery by exiting with 0 status, so
that snap-failure may continue with cleanup and restart.

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* o/snapstate: improve the check for asserting if restart was warranted

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* overlord: add managers test for handling of runtime restart with failure handling

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* many: tweak names and comments

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* o/snapstate: tweak unit test names

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* cmd/snapd: use fmt.Fprintln

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

* overlord/snapstate: tweak naming

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>

---------

Signed-off-by: Maciej Borzecki <maciej.borzecki@canonical.com>
2024-06-03 15:18:49 +02:00

206 lines
5.2 KiB
Go

// -*- Mode: Go; indent-tabs-mode: t -*-
/*
* Copyright (C) 2016 Canonical Ltd
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package overlord
import (
"errors"
"fmt"
"sync"
"github.com/snapcore/snapd/logger"
"github.com/snapcore/snapd/overlord/state"
)
// StateManager is implemented by types responsible for observing
// the system and manipulating it to reflect the desired state.
//
// Managers are registered with a StateEngine through AddManager and have
// their Ensure method invoked from StateEngine.Ensure.
type StateManager interface {
	// Ensure forces a complete evaluation of the current state.
	// See StateEngine.Ensure for more details.
	//
	// A non-nil error is logged and collected by StateEngine.Ensure, which
	// still goes on to call Ensure on the remaining managers.
	Ensure() error
}
// StateStarterUp is optionally implemented by StateManager that have expensive
// initialization to perform before the main Overlord loop.
//
// StateEngine.StartUp detects the interface with a type assertion, so
// managers that do not implement it are simply skipped.
type StateStarterUp interface {
	// StartUp asks manager to perform any expensive initialization.
	//
	// A non-nil error is collected by StateEngine.StartUp and reported
	// together with the errors of the other managers; it does not prevent
	// the remaining managers from being started up.
	StartUp() error
}
// StateWaiter is optionally implemented by StateManagers that have running
// activities that can be waited.
//
// StateEngine.Wait detects the interface with a type assertion, so
// managers that do not implement it are simply skipped.
type StateWaiter interface {
	// Wait asks manager to wait for all running activities to
	// finish.
	//
	// StateEngine.Wait calls this while holding the engine's manager lock.
	Wait()
}
// StateStopper is optionally implemented by StateManagers that have
// running activities that can be terminated.
//
// StateEngine.Stop detects the interface with a type assertion, so
// managers that do not implement it are simply skipped.
type StateStopper interface {
	// Stop asks the manager to terminate all activities running
	// concurrently. It must not return before these activities
	// are finished.
	//
	// StateEngine.Stop calls this while holding the engine's manager lock.
	Stop()
}
// StateEngine controls the dispatching of state changes to state managers.
//
// The engine itself does very little: the actual work is carried out by
// the managers registered with it. Those managers must tolerate Ensure
// being called in any order and may coordinate with one another only
// through the state.
type StateEngine struct {
	state *state.State

	// stopped is set by Stop; startedUp is set by StartUp.
	stopped, startedUp bool

	// mgrLock is held by StartUp, Ensure, AddManager, Wait and Stop.
	mgrLock sync.Mutex
	// managers in use, in the order they were added
	managers []StateManager
}
// NewStateEngine returns a new state engine operating on the state s.
// The engine starts with no managers registered; add them with AddManager.
func NewStateEngine(s *state.State) *StateEngine {
	se := new(StateEngine)
	se.state = s
	return se
}
// State returns the current system state, i.e. the state object the
// engine was created with by NewStateEngine.
func (se *StateEngine) State() *state.State {
	return se.state
}
// startupError bundles the errors reported by the individual managers
// during StateEngine.StartUp.
type startupError struct {
	errs []error
}

// Error renders all collected startup errors in a single message.
func (e *startupError) Error() string {
	return fmt.Sprintf("state startup errors: %v", e.errs)
}

// Is reports whether errors.Is(err, target) holds for at least one of the
// collected startup errors, which lets callers use errors.Is on the
// aggregate error.
func (e *startupError) Is(target error) bool {
	for i := range e.errs {
		if !errors.Is(e.errs[i], target) {
			continue
		}
		return true
	}
	return false
}
// StartUp gives every registered manager that implements StateStarterUp
// the chance to perform its expensive initialization. Only the first
// invocation does any work; later calls return nil right away.
//
// All managers are visited even if some of them fail. When at least one
// StartUp call failed, the collected errors are returned as a single
// error that supports errors.Is for each of them.
func (se *StateEngine) StartUp() error {
	se.mgrLock.Lock()
	defer se.mgrLock.Unlock()

	if se.startedUp {
		return nil
	}
	// mark as started before running the managers, so that the startup is
	// never repeated regardless of its outcome
	se.startedUp = true

	var failures []error
	for _, mgr := range se.managers {
		starter, ok := mgr.(StateStarterUp)
		if !ok {
			continue
		}
		if err := starter.StartUp(); err != nil {
			failures = append(failures, err)
		}
	}
	if len(failures) == 0 {
		return nil
	}
	return &startupError{failures}
}
// ensureError bundles the errors reported by the individual managers
// during a single StateEngine.Ensure pass.
type ensureError struct {
	errs []error
}

// Error renders all collected ensure errors in a single message.
func (e *ensureError) Error() string {
	return fmt.Sprintf("state ensure errors: %v", e.errs)
}

// Is returns true if errors.Is(target) evaluates to true for any of the
// underlying ensure errors. This mirrors startupError.Is, so that errors
// returned from Ensure can be introspected at runtime the same way as
// those returned from StartUp.
func (e *ensureError) Is(target error) bool {
	for _, err := range e.errs {
		if errors.Is(err, target) {
			return true
		}
	}
	return false
}
// Ensure calls the Ensure method of every registered manager, asking each
// of them to do whatever is needed to put the current desired system state
// in place.
//
// On receiving that request managers must evaluate the desired state
// completely and report any critical issues they found. They must not
// carry out long running activities as part of it, though; such work
// belongs in properly tracked changes and tasks.
//
// Ensure fails without calling any manager if StartUp has not been invoked
// yet or if the engine was already stopped. Otherwise all managers are
// visited; each failure is logged and the failures are returned together
// as a single error.
func (se *StateEngine) Ensure() error {
	se.mgrLock.Lock()
	defer se.mgrLock.Unlock()

	switch {
	case !se.startedUp:
		return fmt.Errorf("state engine skipped startup")
	case se.stopped:
		return fmt.Errorf("state engine already stopped")
	}

	var failures []error
	for _, mgr := range se.managers {
		if err := mgr.Ensure(); err != nil {
			logger.Noticef("state ensure error: %v", err)
			failures = append(failures, err)
		}
	}
	if len(failures) == 0 {
		return nil
	}
	return &ensureError{failures}
}
// AddManager adds the provided manager to take part in state operations.
//
// Managers are appended to the list of managers in use, so StartUp,
// Ensure, Wait and Stop visit them in the order they were added. Note
// that StartUp is a noop after its first invocation, so a manager added
// after that point does not get its StartUp method called by the engine.
func (se *StateEngine) AddManager(m StateManager) {
	se.mgrLock.Lock()
	defer se.mgrLock.Unlock()
	se.managers = append(se.managers, m)
}
// Wait blocks until the current activities of all managers implementing
// StateWaiter have finished. It returns immediately if the engine was
// already stopped.
func (se *StateEngine) Wait() {
	se.mgrLock.Lock()
	defer se.mgrLock.Unlock()

	if se.stopped {
		return
	}
	for _, mgr := range se.managers {
		w, ok := mgr.(StateWaiter)
		if !ok {
			continue
		}
		w.Wait()
	}
}
// Stop asks every manager implementing StateStopper to terminate its
// concurrently running activities and then marks the engine as stopped.
// Calling Stop on an engine that is already stopped does nothing.
func (se *StateEngine) Stop() {
	se.mgrLock.Lock()
	defer se.mgrLock.Unlock()

	if se.stopped {
		return
	}
	for _, mgr := range se.managers {
		s, ok := mgr.(StateStopper)
		if !ok {
			continue
		}
		s.Stop()
	}
	se.stopped = true
}