Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New NBFT initramfs module #2620

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 75 additions & 10 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ mandir = join_paths(prefixdir, get_option('mandir'))
sbindir = join_paths(prefixdir, get_option('sbindir'))
sysconfdir = join_paths(prefixdir, get_option('sysconfdir'))

udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir'))
dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir'))
systemddir = join_paths(prefixdir, get_option('systemddir'))
rundir = join_paths(prefixdir, get_option('rundir'))
udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir'))
dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir'))
dracutmodulesdir = join_paths(prefixdir, get_option('dracutmodulesdir'))
systemddir = join_paths(prefixdir, get_option('systemddir'))
rundir = join_paths(prefixdir, get_option('rundir'))
networkmanagerdir = join_paths(prefixdir, get_option('networkmanagerdir'))

###############################################################################
conf = configuration_data()
Expand Down Expand Up @@ -199,14 +201,16 @@ substs.set('NAME', meson.project_name())
substs.set('VERSION', meson.project_version())
substs.set('LICENSE', meson.project_license()[0])
substs.set('UDEVRULESDIR', udevrulesdir)
substs.set('DRACUTRILESDIR', dracutrulesdir)
substs.set('DRACUTRULESDIR', dracutrulesdir)
substs.set('DRACUTMODULESDIR', dracutmodulesdir)
substs.set('REQUIRES', requires)
substs.set('DATADIR', datadir)
substs.set('MANDIR', mandir)
substs.set('RUNDIR', rundir)
substs.set('SBINDIR', sbindir)
substs.set('SYSCONFDIR', sysconfdir)
substs.set('SYSTEMDDIR', systemddir)
substs.set('NETWORKMANAGERDIR', networkmanagerdir)
substs.set('SYSTEMCTL', get_option('systemctl'))

configure_file(
Expand All @@ -221,18 +225,59 @@ disc = configure_file(
configuration: substs,
)

dracut_files = [
dracut_conf_files = [
'70-nvmf-autoconnect.conf',
]

foreach file : dracut_files
foreach file : dracut_conf_files
configure_file(
input: 'nvmf-autoconnect/dracut-conf/' + file + '.in',
output: file,
configuration: substs,
)
endforeach

# Optional 95nbft dracut module. When the 'dracut-module' option is enabled,
# generate the dracut module files and the NetworkManager helper files from
# their '.in' templates. The file lists defined here are reused by the
# install step further down.
want_dracut_module = get_option('dracut-module')
if want_dracut_module
  dracut_nbft_files = [
    'module-setup.sh',
    'nbft-boot-pre.service',
    'nbft-boot-connect.service',
  ]
  networkmanager_conf_files = [
    '99-nvme-nbft-no-ignore-carrier.conf',
  ]
  networkmanager_dispatcher_files = [
    '99-nvme-nbft-connect.sh',
  ]

  foreach file : dracut_nbft_files
    configure_file(
      input: 'nvmf-autoconnect/dracut-95nbft/' + file + '.in',
      output: file,
      configuration: substs,
    )
  endforeach

  # Both NetworkManager lists share the same template directory, so they are
  # configured in a single pass.
  foreach file : networkmanager_conf_files + networkmanager_dispatcher_files
    configure_file(
      input: 'nvmf-autoconnect/NetworkManager/' + file + '.in',
      output: file,
      configuration: substs,
    )
  endforeach
endif

systemd_files = [
'nvmefc-boot-connections.service',
'nvmf-autoconnect.service',
Expand Down Expand Up @@ -315,11 +360,28 @@ install_data('completions/bash-nvme-completion.sh',
install_data('completions/_nvme',
install_dir: datadir + '/zsh/site-functions')

foreach file : dracut_files
foreach file : dracut_conf_files
install_data(meson.current_build_dir() + '/' + file,
install_dir: dracutrulesdir)
endforeach

# Install the optional 95nbft dracut module and its NetworkManager helpers.
# Destination directories are built with join_paths() so the result is
# correct whether or not the configured 'dracutmodulesdir' /
# 'networkmanagerdir' values end with a '/'.
if want_dracut_module
  foreach file : dracut_nbft_files
    install_data(meson.current_build_dir() + '/' + file,
                 install_dir: join_paths(dracutmodulesdir, '95nbft'))
  endforeach

  foreach file : networkmanager_conf_files
    install_data(meson.current_build_dir() + '/' + file,
                 install_dir: join_paths(networkmanagerdir, 'conf.d'))
  endforeach

  # Set the mode explicitly so the hook is installed executable regardless
  # of the permissions of the '.in' template in the source tree.
  # NOTE(review): NetworkManager is expected to skip non-executable
  # dispatcher scripts -- confirm against NetworkManager-dispatcher(8).
  foreach file : networkmanager_dispatcher_files
    install_data(meson.current_build_dir() + '/' + file,
                 install_dir: join_paths(networkmanagerdir, 'dispatcher.d'),
                 install_mode: 'rwxr-xr-x')
  endforeach
endif

foreach file : systemd_files
install_data(meson.current_build_dir() + '/' + file,
install_dir: systemddir)
Expand All @@ -343,8 +405,10 @@ if meson.version().version_compare('>=0.53.0')
'mandir': mandir,
'udevrulesdir': udevrulesdir,
'dracutrulesdir': dracutrulesdir,
'dracutmodulesdir': dracutmodulesdir,
'rundir': rundir,
'systemddir': systemddir,
'networkmanagerdir': networkmanagerdir,
'build location': meson.current_build_dir(),
}
summary(path_dict, section: 'Paths')
Expand All @@ -353,8 +417,9 @@ if meson.version().version_compare('>=0.53.0')
}
summary(dep_dict, section: 'Dependencies')
conf_dict = {
'git version': conf.get('GIT_VERSION'),
'pdc enabled': get_option('pdc-enabled'),
'git version': conf.get('GIT_VERSION'),
'pdc enabled': get_option('pdc-enabled'),
'dracut module enabled': want_dracut_module
}
summary(conf_dict, section: 'Configuration')
endif
18 changes: 18 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ option(
value : 'lib/dracut/dracut.conf.d/',
description : 'directory for dracut rules files'
)
option(
'dracutmodulesdir',
type : 'string',
value : 'lib/dracut/modules.d/',
description : 'dracut modules directory'
)
option(
'htmldir',
type : 'string',
Expand Down Expand Up @@ -70,3 +76,15 @@ option(
type : 'string',
description : 'override the git version string'
)
option(
'dracut-module',
type : 'boolean',
value : false,
description : 'Enable the 95nbft dracut module'
)
option(
'networkmanagerdir',
type : 'string',
value : 'lib/NetworkManager/',
description : 'NetworkManager lib directory'
)
2 changes: 1 addition & 1 deletion nvme.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ touch %{buildroot}@SYSCONFDIR@/nvme/hostid
@UDEVRULESDIR@/65-persistent-net-nbft.rules
@UDEVRULESDIR@/70-nvmf-autoconnect.rules
@UDEVRULESDIR@/71-nvmf-netapp.rules
@DRACUTRILESDIR@/70-nvmf-autoconnect.conf
@DRACUTRULESDIR@/70-nvmf-autoconnect.conf
@SYSTEMDDIR@/[email protected]
@SYSTEMDDIR@/nvmefc-boot-connections.service
@SYSTEMDDIR@/nvmf-connect-nbft.service
Expand Down
5 changes: 5 additions & 0 deletions nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

Check failure on line 1 in nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in

View workflow job for this annotation

GitHub Actions / checkpatch review

ERROR: do not set execute permissions for source files

Check failure on line 2 in nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in

View workflow job for this annotation

GitHub Actions / checkpatch review

WARNING: Missing or malformed SPDX-License-Identifier tag in line 2
# Start nvmf-connect-nbft.service when the first argument names an nbft*
# interface and the second argument is the "up" action. Any other
# combination of arguments is ignored.
case "$1" in
    nbft*)
        if [[ "$2" == "up" ]]; then
            systemctl start nvmf-connect-nbft.service
        fi
        ;;
esac
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Boot from NVMe over TCP (NBFT)
#
# For NVMe/TCP connections that provide namespaces containing rootfs
# it is crucial to react to carrier events and reconnect any missing
# NVMe/TCP connections as defined in the ACPI NBFT table. A custom
# /usr/lib/NetworkManager/dispatcher.d/99-nvme-nbft-connect.sh hook
# will respawn nvmf-connect-nbft.service on such occasions.

[device-nbft-no-ignore-carrier]

# only affects nbft0, nbft1, ... interfaces
match-device=interface-name:nbft*

# react on link up/down events
ignore-carrier=no
125 changes: 125 additions & 0 deletions nvmf-autoconnect/dracut-95nbft/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# The dracut 95nbft module

This module is focused solely on providing the Boot from NVMe over TCP
functionality and is intended to replace parts of the existing `95nvmf` dracut
module. At the moment this all depends on the recently added NetworkManager NBFT
support, though the desire is to support more network management frameworks in
the future.

Related nvme-cli meson configure options:
* `-Ddracut-module` (default=false) - enables the 95nbft dracut module
* `-Ddracutmodulesdir` (default=`$prefix/lib/dracut/modules.d/`)
* `-Dnetworkmanagerdir` (default=`$prefix/lib/NetworkManager/`)


# The design

(see [dracut.bootup(7)](https://man7.org/linux/man-pages/man7/dracut.bootup.7.html)
for the overall boot process flow)

The boot process looks roughly as follows:
* `nbft-boot-pre.service` is run, creates udev network link files and tells
dracut to activate networking
* dracut runs `nm-initrd-generator` and starts the NetworkManager daemon
* `systemd-udev-trigger.service` renames the network interfaces
* `nm-wait-online-initrd.service` finishes, indicating networking is up and ready
* `nbft-boot-connect.service` initiates actual NVMe connections
* the dracut initqueue is waiting for specific block devices (rootfs) to appear

Two major packages are responsible for this: the new nvme-cli dracut module and
the added NBFT support in NetworkManager.

## The new dracut 95nbft module

The dracut `module-setup.sh` only installs two systemd unit files sandwiched
between specific dracut phases, nothing else. By default the module is always
included in the initramfs unless _hostonly_ is requested in which case the system
is tested for ACPI NBFT tables presence and the module is only included in such
a case.

The systemd unit files are only run when the ACPI NBFT tables are present and
no `rd.nvmf.nonbft` kernel commandline argument was provided; that argument
instructs the boot process to skip the NBFT machinery.

## nbft-boot-pre.service

Calls the nvme-cli nbft plugin to generate network link files for each interface
found in all NBFT tables. The interface naming in form of `nbftXhY` consists
of an ACPI NBFT table index (defaults to 0) and the specified HFI index.
In a typical scenario only `nbft0h1`, `nbft0h2`, `nbft1h1`, ... interfaces are
present, however it's up to the pre-OS driver to supply arbitrary indexes,
possibly leading to interface names skipping the order to something like
`nbft0h100` and `nbft99h123`. Compared to the old `95nvmf` dracut module
ordering, this new naming scheme is geared towards (semi-)stable predictable
network interface names. Keep in mind that the contents of the NBFT tables
are generated from scratch upon every system start and are not always
persistent between reboots.

The network link files are then picked up by udev on trigger via
`systemd-udev-trigger.service` to apply the new interface names.

For simplicity and for the time being this systemd unit replaces the traditional
dracut cmdline hook and adds the `rd.neednet=1` `cmdline.d` argument.

## nm-initrd-generator NBFT support

https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/merge_requests/2077

Executed before the NetworkManager daemon starts, the added NBFT support parses
the available ACPI NBFT tables and generates system connections. These
connections reference interfaces only by MAC address, relying on udev to
perform the actual interface renaming.

The `nm-initrd-generator` doesn't link to `libnvme.so.1` but opens it through
`dlopen()` at runtime. This allows for smaller hostonly initramfs images in case
the NBFT tables are not present in the system. The library is being pulled in
indirectly through the dracut module's requirement of nvme-cli. The
`rd.nvmf.nonbft` kernel commandline argument is respected as well.

## nbft-boot-connect.service

Modprobes required modules (`nvme-fabrics`) first.

Performs actual NVMe connections by calling `nvme connect-all --nbft`. The
nvme-cli code has been modified to return non-zero return code in case one
or more SSNS records fail to connect (except those marked as _'unavailable'_
by the pre-OS driver), resulting in the service startup failure with defined
respawn of 10 seconds (TBD). This ensures multiple connection attempts while
NetworkManager reacts to link events in the background and the dracut initqueue
eagerly waits for new block devices to appear, to be scanned and mounted. Once
the required block device appears, the wait cycle is ended and the system
continues booting, stopping any queued `nbft-boot-connect.service` respawns
seamlessly.

The difference from the old dracut `95nvmf` module is that the nvme connection
attempts are not driven by network link up events but have a fixed respawn
interval. This may potentially help the cases where the NIC is slow to
initialize, reports link up yet it takes another 5+ seconds before it's fully
able to send/receive packets. We've seen this issue with some 25Gb NICs.


# The post-switchroot boot flow

## nvmf-connect-nbft.service

This unit is supposed to run once the `network-online.target` has been reached
and calls `nvme connect-all --nbft` again. This ensures an additional connection
attempt for records that failed to connect in the initramfs phase. As long as
this call matches existing connections and skips SSNS records that have been
already connected, in an ideal case this would result in a no-op. This is
mostly a one-shot service run in NetworkManager based distros since the target
typically stays reached until reboot.

## NetworkManager dispatcher hooks

The nvme-cli package installs a custom NetworkManager dispatcher service hook
(`99-nvme-nbft-connect.sh`) that just starts `nvmf-connect-nbft.service` again on
_link up_ events on `nbft*` interfaces. At the time the hook runs the interface
in question has been fully configured by NetworkManager. This ensures further
reconnection attempts in multipath scenarios where a network interface just came
alive. This is designed as a secondary measure with the kernel nvme host driver
connection recovery being the primary mechanism.

In order to make link events work properly the `nbft*` interfaces need to be set
not to ignore carrier events. This is done through a custom override snippet
(`99-nvme-nbft-no-ignore-carrier.conf`) as some distributions may opt to follow
legacy server networking behaviour (see the `NetworkManager-config-server` package).
49 changes: 49 additions & 0 deletions nvmf-autoconnect/dracut-95nbft/module-setup.sh.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/bash

Check failure on line 1 in nvmf-autoconnect/dracut-95nbft/module-setup.sh.in

View workflow job for this annotation

GitHub Actions / checkpatch review

ERROR: do not set execute permissions for source files

Check failure on line 2 in nvmf-autoconnect/dracut-95nbft/module-setup.sh.in

View workflow job for this annotation

GitHub Actions / checkpatch review

WARNING: Missing or malformed SPDX-License-Identifier tag in line 2
# Report whether at least one ACPI NBFT table is exposed by the firmware.
# Returns: 0 if a regular file matching /sys/firmware/acpi/tables/NBFT*
#          exists, 1 otherwise.
has_nbft() {
    local table

    for table in /sys/firmware/acpi/tables/NBFT*; do
        # An unmatched glob stays literal, hence the explicit file test.
        if [ -f "$table" ]; then
            return 0
        fi
    done

    return 1
}

# called by dracut
#
# Decide whether this module can be included.
# Globals:  hostonly, mount_needs (read)
# Returns:  1   if the nvme binary is not available,
#           255 if hostonly or mount_needs is set and no NBFT table exists,
#           0   otherwise.
check() {
    if ! require_binaries nvme; then
        return 1
    fi

    # The NBFT presence test only applies when hostonly or mount_needs is set.
    if [[ $hostonly || $mount_needs ]] && ! has_nbft; then
        echo "No ACPI NBFT tables present in the system"
        return 255
    fi

    return 0
}

# called by dracut
#
# Print the space-separated list of modules this module depends on.
# Outputs:  the module names on stdout
# Returns:  0 always
depends() {
    local -r required_modules='bash rootfs-block network'

    printf '%s\n' "$required_modules"
    return 0
}

# called by dracut
#
# Request the kernel modules listed below via instmods.
# hostonly is set to an empty value for the duration of the instmods call
# only (presumably so the drivers are included even in hostonly mode --
# confirm against dracut's instmods).
# Returns:  the exit status of instmods
installkernel() {
    local -a kmods=(nvme_tcp nvme_fabrics 8021q)

    hostonly='' instmods "${kmods[@]}"
}

# called by dracut
#
# Install the nvme binary and the NBFT systemd units, then enable the units
# under $initdir.
# Globals:  moddir, systemdsystemunitdir, initdir, SYSTEMCTL (read)
install() {
    # Keep the loop variable local so it does not clobber a caller's variable.
    local unit

    inst_multiple nvme

    # TODO: /etc/nvme/hostnqn

    for unit in \
        nbft-boot-pre.service \
        nbft-boot-connect.service; do
        inst_simple "${moddir}/${unit}" "${systemdsystemunitdir}/${unit}"
        # $SYSTEMCTL is deliberately left unquoted, as in the original
        # (presumably provided by dracut -- confirm).
        $SYSTEMCTL -q --root "$initdir" enable "$unit"
    done
}
Loading
Loading