From c8534738f1bc359da96b99d9e6c1755c9eca549b Mon Sep 17 00:00:00 2001 From: Martin Sivak Date: Fri, 12 Jan 2024 10:38:55 +0100 Subject: [PATCH 1/2] OCPBUGS-25860: Disable managed interrupts for smartpqi This driver does not obey the isolcpus=managed_irq hint and is causing interference. The smartpqi.disable_managed_interrupts=1 kernel argument makes sure the interrupt affinity can be managed by userspace services. The alternative approach using /etc/modprobe.d/smartpqi.conf with an `options smartpqi ...` line does not work, because the driver is loaded early at the initrd stage and we would have to rebuild the RHCOS initrd. That is much heavier than a simple kernel arg. --- assets/performanceprofile/tuned/openshift-node-performance | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/assets/performanceprofile/tuned/openshift-node-performance b/assets/performanceprofile/tuned/openshift-node-performance index 45e41f95c..5fa8c3466 100644 --- a/assets/performanceprofile/tuned/openshift-node-performance +++ b/assets/performanceprofile/tuned/openshift-node-performance @@ -170,6 +170,12 @@ cmdline_pstate=+intel_pstate=active cmdline_pstate=+intel_pstate=${automatic_pstate} {{end}} +# In case a smartpqi device is present, make sure the interrupt affinity +# can be managed by userspace services. This driver does not obey +# the isolcpus=managed_irq hint and is causing interference. 
+# See: https://issues.redhat.com/browse/OCPBUGS-25860 +cmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\bsmartpqi\b:smartpqi.disable_managed_interrupts=1:} + [rtentsk] {{ if .HardwareTuning }} From f2d6271cdf2193f053f7f1d4e188e384dbc7a272 Mon Sep 17 00:00:00 2001 From: Martin Sivak Date: Mon, 4 Mar 2024 14:27:18 +0100 Subject: [PATCH 2/2] Sync rendering tests --- .../extra-ctrcfg/openshift-bootstrap-master_tuned.yaml | 5 ++++- .../extra-ctrcfg/openshift-bootstrap-worker_tuned.yaml | 5 ++++- .../extra-mcp/openshift-bootstrap-master_tuned.yaml | 5 ++++- .../extra-mcp/openshift-bootstrap-worker_tuned.yaml | 5 ++++- .../bootstrap/no-mcp/openshift-bootstrap-master_tuned.yaml | 5 ++++- .../bootstrap/no-mcp/openshift-bootstrap-worker_tuned.yaml | 5 ++++- .../default/cpuFrequency/manual_tuned.yaml | 5 ++++- .../render-expected-output/default/manual_tuned.yaml | 5 ++++- .../testdata/render-expected-output/no-ref/manual_tuned.yaml | 5 ++++- 9 files changed, 36 insertions(+), 9 deletions(-) diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-master_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-master_tuned.yaml index 8356cb03e..3cb288ec3 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-master_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-master_tuned.yaml @@ -51,7 +51,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n - 
\n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# In case a + smartpqi device is present, make sure the interrupt affinity\n# can be managed + by userspace services. This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-openshift-bootstrap-master - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-worker_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-worker_tuned.yaml index 1d61753f6..24c33cd64 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-worker_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-ctrcfg/openshift-bootstrap-worker_tuned.yaml @@ -51,7 +51,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n - \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# In case a + smartpqi device is present, make sure the interrupt affinity\n# can be managed + by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-openshift-bootstrap-worker - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-master_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-master_tuned.yaml index 8356cb03e..3cb288ec3 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-master_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-master_tuned.yaml @@ -51,7 +51,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n - \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# In case a + smartpqi device is present, make sure the interrupt affinity\n# can be managed + by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-openshift-bootstrap-master - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-worker_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-worker_tuned.yaml index 1d61753f6..24c33cd64 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-worker_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/extra-mcp/openshift-bootstrap-worker_tuned.yaml @@ -51,7 +51,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n - \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# In case a + smartpqi device is present, make sure the interrupt affinity\n# can be managed + by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-openshift-bootstrap-worker - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-master_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-master_tuned.yaml index 8356cb03e..3cb288ec3 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-master_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-master_tuned.yaml @@ -51,7 +51,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n - \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# In case a + smartpqi device is present, make sure the interrupt affinity\n# can be managed + by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-openshift-bootstrap-master - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-worker_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-worker_tuned.yaml index 1d61753f6..24c33cd64 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-worker_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/bootstrap/no-mcp/openshift-bootstrap-worker_tuned.yaml @@ -51,7 +51,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n - \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# In case a + smartpqi device is present, make sure the interrupt affinity\n# can be managed + by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-openshift-bootstrap-worker - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/default/cpuFrequency/manual_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/default/cpuFrequency/manual_tuned.yaml index efeca7163..fd831ec8a 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/default/cpuFrequency/manual_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/default/cpuFrequency/manual_tuned.yaml @@ -49,7 +49,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n\ncmdline_hugepages=+ - default_hugepagesz=1G hugepagesz=2M hugepages=128 \n\n\n\n\ncmdline_pstate=+intel_pstate=active\n\n\n[rtentsk]\n\n\n[sysfs]\n# + default_hugepagesz=1G hugepagesz=2M hugepages=128 \n\n\n\n\ncmdline_pstate=+intel_pstate=active\n\n\n# + In case a smartpqi device is present, make sure the interrupt affinity\n# can + be managed by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n[sysfs]\n# sets provided frequencies to isolated and reserved cpus\n\n/sys/devices/system/cpu/cpufreq/policy2/scaling_max_freq=2500000\n/sys/devices/system/cpu/cpufreq/policy3/scaling_max_freq=2500000\n/sys/devices/system/cpu/cpufreq/policy0/scaling_max_freq=2800000\n/sys/devices/system/cpu/cpufreq/policy1/scaling_max_freq=2800000\n" name: openshift-node-performance-manual - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/default/manual_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/default/manual_tuned.yaml index a3e1da1f2..4ec3b097c 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/default/manual_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/default/manual_tuned.yaml @@ -51,7 +51,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n\ncmdline_hugepages=+ - default_hugepagesz=1G hugepagesz=2M hugepages=128 \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + default_hugepagesz=1G hugepagesz=2M hugepages=128 \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# + In case a smartpqi device is present, make sure the interrupt affinity\n# can + be managed by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-manual - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance diff --git a/test/e2e/performanceprofile/testdata/render-expected-output/no-ref/manual_tuned.yaml b/test/e2e/performanceprofile/testdata/render-expected-output/no-ref/manual_tuned.yaml index 539d47389..f20071e3a 100644 --- a/test/e2e/performanceprofile/testdata/render-expected-output/no-ref/manual_tuned.yaml +++ b/test/e2e/performanceprofile/testdata/render-expected-output/no-ref/manual_tuned.yaml @@ -49,7 +49,10 @@ spec: tuned.non_isolcpus=${not_isolated_cpumask} systemd.cpu_affinity=${not_isolated_cores_expanded} intel_iommu=on iommu=pt\n\n\ncmdline_isolation=+isolcpus=managed_irq,${isolated_cores}\n\n\n\ncmdline_realtime=+nohz_full=${isolated_cores} tsc=reliable nosoftlockup nmi_watchdog=0 mce=off skew_tick=1 rcutree.kthread_prio=11\n\n\n\n\n\n\n\ncmdline_hugepages=+ - default_hugepagesz=1G hugepagesz=2M hugepages=128 \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n[rtentsk]\n\n\n" + default_hugepagesz=1G hugepagesz=2M hugepages=128 \n\n\n\n\ncmdline_pstate=+intel_pstate=${automatic_pstate}\n\n\n# + In case a smartpqi device is present, make sure the interrupt affinity\n# can + be managed by userspace services. 
This driver does not obey\n# the isolcpus=managed_irq + hint and is causing interference.\n# See: https://issues.redhat.com/browse/OCPBUGS-25860\ncmdline_smartpqi=+${f:regex_search_ternary:${f:exec:/usr/sbin/lsmod}:\\bsmartpqi\\b:smartpqi.disable_managed_interrupts=1:}\n\n[rtentsk]\n\n\n" name: openshift-node-performance-manual - data: "[main]\nsummary=Real time profile to override unsupported settings\n\n[sysctl]\n#Real time kernel doesn't support the following kernel parameters.\n#The openshift-node-performance