Skip to content

Commit

Permalink
system.reboot.info
Browse files Browse the repository at this point in the history
  • Loading branch information
themylogin committed Sep 17, 2024
1 parent 43a8a7a commit 79d4b32
Show file tree
Hide file tree
Showing 9 changed files with 215 additions and 158 deletions.
1 change: 1 addition & 0 deletions src/middlewared/middlewared/api/v25_04_0/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@
from .failover_reboot import * # noqa
from .group import * # noqa
from .privilege import * # noqa
from .system_reboot import * # noqa
from .user import * # noqa
from .vendor import * # noqa
25 changes: 12 additions & 13 deletions src/middlewared/middlewared/api/v25_04_0/failover_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,25 @@
# Licensed under the terms of the TrueNAS Enterprise License Agreement
# See the file LICENSE.IX for complete terms and conditions
from middlewared.api.base import BaseModel, single_argument_result
from .system_reboot import SystemRebootInfoResult

__all__ = ["FailoverRebootRequiredArgs", "FailoverRebootRequiredResult"]
__all__ = ["FailoverRebootInfoArgs", "FailoverRebootInfoResult",
"FailoverRebootOtherNodeArgs", "FailoverRebootOtherNodeResult"]


class FailoverRebootRequiredArgs(BaseModel):
class FailoverRebootInfoArgs(BaseModel):
pass


class FailoverRebootRequiredResultThisNode(BaseModel):
id: str
reboot_required: bool
reboot_required_reasons: list[str]
@single_argument_result
class FailoverRebootInfoResult(BaseModel):
this_node: SystemRebootInfoResult
other_node: SystemRebootInfoResult | None


class FailoverRebootRequiredResultOtherNode(FailoverRebootRequiredResultThisNode):
id: str | None
reboot_required: bool | None
class FailoverRebootOtherNodeArgs(BaseModel):
pass


@single_argument_result
class FailoverRebootRequiredResult(BaseModel):
this_node: FailoverRebootRequiredResultThisNode
other_node: FailoverRebootRequiredResultOtherNode
class FailoverRebootOtherNodeResult(BaseModel):
result: None
18 changes: 18 additions & 0 deletions src/middlewared/middlewared/api/v25_04_0/system_reboot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from middlewared.api.base import BaseModel, single_argument_result

__all__ = ["SystemRebootInfoArgs", "SystemRebootInfoResult"]


class SystemRebootInfoArgs(BaseModel):
pass


class RebootRequiredReason(BaseModel):
code: str
reason: str


@single_argument_result
class SystemRebootInfoResult(BaseModel):
boot_id: str
reboot_required_reasons: list[RebootRequiredReason]
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ def get_local_reasons(self, app, ifaces, reasons):
return

reboot_info = self.middleware.call_sync('failover.reboot.info')
if reboot_info['this_node']['reboot_required']:
if reboot_info['this_node']['reboot_required_reasons']:
reasons.add(DisabledReasonsEnum.LOC_FIPS_REBOOT_REQ.name)
if reboot_info['other_node']['reboot_required']:
if reboot_info['other_node'] is not None and reboot_info['other_node']['reboot_required_reasons']:
reasons.add(DisabledReasonsEnum.REM_FIPS_REBOOT_REQ.name)

if self.SYSTEM_DATASET_SETUP_IN_PROGRESS:
Expand Down
162 changes: 92 additions & 70 deletions src/middlewared/middlewared/plugins/failover_/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,23 @@
# Licensed under the terms of the TrueNAS Enterprise License Agreement
# See the file LICENSE.IX for complete terms and conditions
import asyncio
from dataclasses import dataclass
import errno
import time

from middlewared.api import api_method
from middlewared.api.current import FailoverRebootRequiredArgs, FailoverRebootRequiredResult
from middlewared.schema import accepts, Bool, Dict, UUID, returns
from middlewared.api.current import (
FailoverRebootInfoArgs, FailoverRebootInfoResult,
FailoverRebootOtherNodeArgs, FailoverRebootOtherNodeResult,
)
from middlewared.service import CallError, job, private, Service

FIPS_KEY = 'fips_toggled'

@dataclass
class RemoteRebootReason:
# Boot ID for which the reboot is required. `None` means that the system must be rebooted when it comes online.
boot_id: str | None
reason: str


class FailoverRebootService(Service):
Expand All @@ -20,83 +28,74 @@ class Config:
cli_namespace = 'system.failover.reboot'
namespace = 'failover.reboot'

reboot_reasons : dict[str, str] = {}
remote_reboot_reasons: dict[str, RemoteRebootReason] = {}

@private
async def add_reason(self, key: str, value: str):
async def add_remote_reason(self, code: str, reason: str):
"""
Adds a reason on why this system needs a reboot.
:param key: unique identifier for the reason.
:param value: text explanation for the reason.
Adds a reason for why the remote system needs a reboot.
This will be appended to the list of the reasons that the remote node itself returns.
:param code: unique identifier for the reason.
:param reason: text explanation for the reason.
"""
self.reboot_reasons[key] = value
try:
boot_id = await self.middleware.call('failover.call_remote', 'system.boot_id', [], {
'raise_connect_error': False,
'timeout': 2,
'connect_timeout': 2,
})
except Exception:
self.logger.warning('Unexpected error querying remote reboot boot id', exc_info=True)
# Remote system is inaccessible, so, when it comes back, another reboot will be required.
boot_id = None

self.remote_reboot_reasons[code] = RemoteRebootReason(boot_id, reason)

await self.send_event()

@api_method(FailoverRebootInfoArgs, FailoverRebootInfoResult, roles=['FAILOVER_READ'])
async def info(self):
changed = False

@private
async def boot_ids(self):
info = {
'this_node': {'id': await self.middleware.call('system.boot_id')},
'other_node': {'id': None}
}
try:
info['other_node']['id'] = await self.middleware.call(
'failover.call_remote', 'system.boot_id', [], {
'raise_connect_error': False, 'timeout': 2, 'connect_timeout': 2,
}
)
other_node = await self.middleware.call('failover.call_remote', 'system.reboot.info', [], {
'raise_connect_error': False,
'timeout': 2,
'connect_timeout': 2,
})
except Exception:
self.logger.warning('Unexpected error querying remote node boot id', exc_info=True)
self.logger.warning('Unexpected error querying remote reboot info', exc_info=True)
other_node = None

if other_node is not None:
for remote_reboot_reason_code, remote_reboot_reason in list(self.remote_reboot_reasons.items()):
if remote_reboot_reason.boot_id is None:
# This reboot reason was added while the remote node was not functional.
# In that case, when the remote system comes online, an additional reboot is required.
remote_reboot_reason.boot_id = other_node['boot_id']
changed = True

if remote_reboot_reason.boot_id == other_node['boot_id']:
other_node['reboot_required_reasons'].append({
'code': remote_reboot_reason_code,
'reason': remote_reboot_reason.reason,
})
else:
# The system was rebooted, this reason is not valid anymore
self.remote_reboot_reasons.pop(remote_reboot_reason_code)
changed = True

info = {
'this_node': await self.middleware.call('system.reboot.info'),
'other_node': other_node,
}

if changed:
await self.send_event(info)

return info

@private
async def info_impl(self):
# initial state
current_info = await self.boot_ids()
current_info['this_node'].update({'reboot_required': None, 'reboot_required_reasons': []})
current_info['other_node'].update({'reboot_required': None, 'reboot_required_reasons': []})

fips_change_info = await self.middleware.call('keyvalue.get', FIPS_KEY, False)
if not fips_change_info:
for i in current_info:
current_info[i]['reboot_required'] = False
else:
for key in ('this_node', 'other_node'):
current_info[key]['reboot_required'] = all((
fips_change_info[key]['id'] == current_info[key]['id'],
fips_change_info[key]['reboot_required']
))
if current_info[key]['reboot_required']:
current_info[key]['reboot_required_reasons'].append('FIPS configuration was changed.')

if all((
current_info['this_node']['reboot_required'] is False,
current_info['other_node']['reboot_required'] is False,
)):
# no reboot required for either controller so delete
await self.middleware.call('keyvalue.delete', FIPS_KEY)

for reason in self.reboot_reasons.values():
current_info['this_node']['reboot_required'] = True
current_info['this_node']['reboot_required_reasons'].append(reason)

return current_info

@api_method(FailoverRebootRequiredArgs, FailoverRebootRequiredResult, roles=['FAILOVER_READ'])
async def info(self):
"""Returns the local and remote nodes boot_ids along with their
reboot statuses (i.e. does a reboot need to take place)"""
return await self.info_impl()

@accepts(roles=['FAILOVER_READ'])
@returns(Bool())
async def required(self):
"""Returns whether this node needs to be rebooted for failover/security
system configuration changes to take effect."""
# TODO: should we raise Callerror/ValidationError if reboot_required is None?
return (await self.info())['this_node']['reboot_required'] is True

@accepts(roles=['FULL_ADMIN'])
@returns()
@api_method(FailoverRebootOtherNodeArgs, FailoverRebootOtherNodeResult, roles=['FULL_ADMIN'])
@job(lock='reboot_standby')
async def other_node(self, job):
"""
Expand Down Expand Up @@ -145,3 +144,26 @@ async def other_node(self, job):

job.set_progress(100, 'Other controller rebooted successfully')
return True

@private
async def send_event(self, info=None):
if info is None:
info = await self.info()

self.middleware.send_event('failover.reboot.info', 'CHANGED', id=None, fields=info)


async def reboot_info(middleware, *args, **kwargs):
await middleware.call('failover.reboot.send_event')


def remote_reboot_info(middleware, *args, **kwargs):
middleware.call_sync('failover.reboot.send_event', background=True)


async def setup(middleware):
middleware.event_register('failover.reboot.info', 'Sent when a system reboot is required.', roles=['FAILOVER_READ'])

middleware.event_subscribe('system.reboot.info', remote_reboot_info)
await middleware.call('failover.remote_on_connect', remote_reboot_info)
await middleware.call('failover.remote_subscribe', 'system.reboot.info', remote_reboot_info)
41 changes: 10 additions & 31 deletions src/middlewared/middlewared/plugins/security/update.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import middlewared.sqlalchemy as sa

from middlewared.plugins.failover_.disabled_reasons import DisabledReasonsEnum
from middlewared.plugins.failover_.reboot import FIPS_KEY
from middlewared.schema import accepts, Bool, Dict, Int, Patch
from middlewared.service import ConfigService, ValidationError, job, private

FIPS_REBOOT_CODE = 'FIPS'
FIPS_REBOOT_REASON = 'FIPS configuration was changed.'


class SystemSecurityModel(sa.Model):
__tablename__ = 'system_security'
Expand Down Expand Up @@ -32,46 +34,23 @@ async def configure_fips_on_ha(self, is_ha, job):
return

await self.middleware.call('failover.call_remote', 'etc.generate', ['fips'])
boot_info = await self.middleware.call('failover.reboot.info')
if boot_info['this_node']['reboot_required']:
# means FIPS is being toggled but this node is already pending a reboot
# so it means the user toggled FIPS twice without a reboot in between
boot_info['this_node']['reboot_required'] = False
await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
else:
# means FIPS is toggled and this node isn't pending a reboot, so mark it
# as such
boot_info['this_node']['reboot_required'] = True
await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)

if boot_info['other_node']['reboot_required']:
# means FIPS is being toggled but other node is already pending a reboot
remote_reboot_reasons = await self.middleware.call('failover.call_remote', 'system.reboot.list_reasons')
if FIPS_REBOOT_CODE in remote_reboot_reasons:
# means FIPS is being toggled but other node is already pending a reboot,
# so it means the user toggled FIPS twice and somehow the other node
# didn't reboot (even though we do this automatically). This is an edge
# case and means someone or something is doing things behind our backs
boot_info['other_node']['reboot_required'] = False
await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
await self.middleware.call('failover.call_remote', 'system.reboot.remove_reason', [FIPS_REBOOT_CODE])
else:
try:
# we automatically reboot (and wait for) the other controller
reboot_job = await self.middleware.call('failover.reboot.other_node')
await job.wrap(reboot_job)
except Exception:
self.logger.error('Unexpected failure rebooting the other node', exc_info=True)
# something extravagant happened so we'll just play it safe and say that
# something extravagant happened, so we'll just play it safe and say that
# another reboot is required
boot_info['other_node']['reboot_required'] = True
else:
new_info = await self.middleware.call('failover.reboot.info')
if boot_info['other_node']['id'] == new_info['other_node']['id']:
# standby "rebooted" but the boot id is the same....not good
self.logger.warning('Other node claims it rebooted but boot id is the same')
boot_info['other_node']['reboot_required'] = True
else:
boot_info['other_node']['id'] = new_info['other_node']['id']
boot_info['other_node']['reboot_required'] = False

await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
await self.middleware.call('failover.reboot.add_remote_reason', FIPS_REBOOT_CODE, FIPS_REBOOT_REASON)

@private
async def validate(self, is_ha, ha_disabled_reasons):
Expand Down Expand Up @@ -125,7 +104,7 @@ async def do_update(self, job, data):
# TODO: We likely need to do some SSH magic as well
# let's investigate the exact configuration there
await self.middleware.call('etc.generate', 'fips')
# TODO: We need to fix this for non-HA iX hardware...
await self.middleware.call('system.reboot.toggle_reason', FIPS_REBOOT_CODE, FIPS_REBOOT_REASON)
await self.configure_fips_on_ha(is_ha, job)

return await self.config()
Loading

0 comments on commit 79d4b32

Please sign in to comment.