# Merge pull request #88 from yonch/main #10

name: Test Kernel Module

# Two ways to start this workflow: automatically on pushes to main that touch
# the listed paths, or manually with an optional instance-type override.
on:
  push:
    branches:
      - main
    # Only the kernel module sources and the workflow file listed here
    # trigger an automatic run.
    paths:
      - 'module/**'
      - '.github/workflows/test-kernel-module.yaml'
  # Manual trigger for testing
  workflow_dispatch:
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        type: string
        required: false
        default: 'm7i.metal-24xl'
# Token permissions for every job in this workflow. Declaring any scope here
# sets all scopes that are not listed to `none`, so each one that is needed
# must be spelled out.
permissions:
  id-token: write  # Required for requesting the JWT
  contents: read  # Lets actions/checkout read the repository
# Pipeline: start an EC2 runner, build and test the kernel module on it, then
# stop the runner again regardless of the test outcome.
jobs:
  # Starts an EC2 instance and exposes the runner label and instance id as
  # job outputs for the jobs below.
  start-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session
      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): the action reference had been garbled into an
        # obfuscated e-mail placeholder, which is not a valid `uses:` value.
        # The exact pinned release could not be recovered; confirm that the
        # chosen tag supports the `market-type` input and re-pin if needed.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          ec2-image-id: ami-0884d2865dbe9de4b  # Ubuntu 22.04 LTS in us-east-2
          # m7i.metal-24xl for RDT, c5.9xlarge for perf support.
          # Runs without an `instance-type` input fall back to m7i.xlarge.
          ec2-instance-type: ${{ inputs.instance-type || 'm7i.xlarge' }}
          market-type: spot
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }}
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "github-runner"},
              {"Key": "Repository", "Value": "${{ github.repository }}"},
              {"Key": "Workflow", "Value": "${{ github.workflow }}"},
              {"Key": "RunId", "Value": "${{ github.run_id }}"},
              {"Key": "RunNumber", "Value": "${{ github.run_number }}"},
              {"Key": "SHA", "Value": "${{ github.sha }}"},
              {"Key": "Branch", "Value": "${{ github.ref_name }}"},
              {"Key": "Actor", "Value": "${{ github.actor }}"}
            ]

  # Builds the kernel module on the runner started above and exercises it.
  test-module:
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    # NOTE(review): this limit applies to the whole job (package installs,
    # both builds and every test step); confirm 2 minutes is not too tight.
    timeout-minutes: 2  # Add timeout in case system hangs
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Disable IPv6
        run: |
          # Disable IPv6 via sysctl
          sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1
          # Force apt to use IPv4
          echo 'Acquire::ForceIPv4 "true";' | sudo tee /etc/apt/apt.conf.d/99force-ipv4
      - name: Configure apt to use HTTPS
        run: |
          # Update all archive URLs to use HTTPS
          sudo sed -i 's/http:/https:/g' /etc/apt/sources.list
          # Install apt-transport-https (might fail initially, hence the || true)
          sudo apt-get update || true
          sudo apt-get install -y apt-transport-https ca-certificates
          # Update again with HTTPS now configured
          sudo apt-get update
      - name: Install build dependencies
        run: |
          # Install base dependencies
          sudo apt-get install -y build-essential "linux-headers-$(uname -r)"
      - name: Build kernel module
        working-directory: module
        run: |
          # Try to compile and capture the warning message
          make 2>&1 | tee compile_output.txt || true
          # Extract gcc version from the warning message
          KERNEL_GCC_VERSION=$(grep "The kernel was built by:" compile_output.txt | grep -oP 'gcc-\K\d+' || echo "")
          echo "Detected kernel compiler version: ${KERNEL_GCC_VERSION}"
          # Install specific gcc version if detected
          if [ -n "$KERNEL_GCC_VERSION" ]; then
            echo "Installing gcc-${KERNEL_GCC_VERSION}"
            sudo apt-get install -y "gcc-${KERNEL_GCC_VERSION}"
            # Configure as default gcc
            sudo update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}" 100
            sudo update-alternatives --set gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}"
          else
            echo "Warning: Could not detect kernel compiler version"
          fi
          # Verify gcc version
          gcc --version
          # Now try the actual build
          make
          ls -l build/collector.ko
      # continue-on-error lets the next step print kernel messages before the
      # job is failed explicitly.
      - name: Run RMID allocator tests
        id: rmid-allocator-test
        continue-on-error: true
        working-directory: module
        run: |
          echo "Running RMID allocator unit tests..."
          chmod +x test_rmid_allocator.sh
          ./test_rmid_allocator.sh
          # Check test results from dmesg. sudo is used for every dmesg call
          # so an unreadable kernel log cannot hide a failed test.
          echo "Test output from dmesg:"
          sudo dmesg | grep "rmid_allocator_test" || true
          sudo dmesg | grep "test_result:" || true
          # Fail if any test failed
          if sudo dmesg | grep -q "test_result:.*:fail"; then
            echo "RMID allocator tests failed"
            exit 1
          fi
      - name: Check dmesg on RMID test failure
        if: steps.rmid-allocator-test.outcome == 'failure'
        run: |
          echo "RMID allocator tests failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1
      # Informational only: every command tolerates failure.
      - name: Check RDT Capabilities
        run: |
          echo "Mounting resctrl filesystem"
          sudo mkdir -p /sys/fs/resctrl || true
          sudo mount -t resctrl resctrl /sys/fs/resctrl || true
          mount | grep resctrl || true
          echo "Checking RDT capabilities"
          ls /sys/fs/resctrl/info || true
          echo "Monitoring features:"
          cat /sys/fs/resctrl/info/L3_MON/mon_features || true
          echo "Number of available RMIDs:"
          cat /sys/fs/resctrl/info/L3_MON/num_rmids || true
          echo "Number of CAT classes:"
          cat /sys/fs/resctrl/info/L3/num_closids || true
          echo "head -n 35 /proc/cpuinfo:"
          head -n 35 /proc/cpuinfo || true
          echo "CPU RDT features (head):"
          grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo | head || true
          # we do not unmount, maybe mounting affects the intel_cqm checks below
          #sudo umount /sys/fs/resctrl || true
      # continue-on-error lets the next step print kernel messages before the
      # job is failed explicitly.
      - name: Load and test module
        id: load-and-test-module
        continue-on-error: true
        working-directory: module
        run: |
          # Check undefined symbols
          sudo modinfo -F depends build/collector.ko
          sudo objdump -d build/collector.ko | grep undefined || true
          # Load module
          echo "insmod build/collector.ko:"
          sudo insmod build/collector.ko
          # Verify module is loaded
          echo "lsmod | grep collector:"
          lsmod | grep collector
          # Check kernel logs for module initialization
          # (dmesg -c also clears the ring buffer, so it is run with sudo)
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true
          # Unload module
          echo "rmmod collector:"
          sudo rmmod collector
          # Verify module unloaded successfully; grep prints the offending
          # lsmod line when the module is still present
          echo "lsmod | grep collector:"
          if lsmod | grep collector; then
            echo "Error: Module still loaded"
            exit 1
          fi
          # Check kernel logs for cleanup message
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true
      - name: Check dmesg on failure
        if: steps.load-and-test-module.outcome == 'failure'
        run: |
          echo "load and test module failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1
      - name: Install trace dependencies
        run: |
          sudo apt-get install -y trace-cmd
      - name: Run module test script
        working-directory: module
        run: |
          # run 10 times in quick succession to stress-test insmod/rmmod and collector
          for i in {1..10}; do
            echo "*** Run $i:"
            ./test_module.sh
          done

  # Stops the EC2 runner identified by the start-runner outputs.
  stop-runner:
    name: Stop EC2 runner
    needs: [start-runner, test-module]
    runs-on: ubuntu-latest
    if: always()  # Run even if previous jobs fail
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session
      - name: Stop EC2 runner
        # NOTE(review): action reference restored from a garbled placeholder;
        # keep it on the same release as the start step and confirm the tag.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}