# Merge pull request #52 from yonch:add-resctrl-module #1
# Builds the kernel module under module/ on a temporary EC2 runner, then
# loads, exercises and unloads it.
name: Test Kernel Module

on:
  # Manual trigger for testing; the instance type can be overridden per run.
  workflow_dispatch:
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        required: false
        default: 'm7i.metal-24xl'
        type: string
  # Automatic trigger: only pushes to main that touch the module sources.
  push:
    branches:
      - main
    paths:
      - 'module/**'
# Listing any permission sets every unlisted one to "none", so the read
# access that actions/checkout relies on is granted explicitly.
permissions:
  id-token: write  # Required for requesting the JWT
  contents: read  # Required for actions/checkout
jobs:
  # Launches the EC2 instance that hosts the self-hosted runner and exports
  # its runner label and instance id for the jobs that follow.
  start-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): the exact version originally pinned here could not be
        # recovered; v2 is the major tag. Re-pin to the intended release.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          ec2-image-id: ami-0884d2865dbe9de4b  # Ubuntu 22.04 LTS in us-east-2
          # Falls back to the default type on push events, where no input exists.
          ec2-instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }}
          market-type: spot
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }}
          # JSON list of tags recording which workflow run owns the instance.
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "github-runner"},
              {"Key": "Repository", "Value": "${{ github.repository }}"},
              {"Key": "Workflow", "Value": "${{ github.workflow }}"},
              {"Key": "RunId", "Value": "${{ github.run_id }}"},
              {"Key": "RunNumber", "Value": "${{ github.run_number }}"},
              {"Key": "SHA", "Value": "${{ github.sha }}"},
              {"Key": "Branch", "Value": "${{ github.ref_name }}"},
              {"Key": "Actor", "Value": "${{ github.actor }}"}
            ]

  # Builds the module on the EC2 runner, checks resctrl/RDT support, then
  # loads, tests and unloads the module.
  test-module:
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    # NOTE(review): this budget covers the apt installs, the build and ten
    # test-script runs - confirm that 2 minutes is sufficient.
    timeout-minutes: 2  # Add timeout in case system hangs
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Disable IPv6
        run: |
          # Disable IPv6 via sysctl
          sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1
          # Force apt to use IPv4
          echo 'Acquire::ForceIPv4 "true";' | sudo tee /etc/apt/apt.conf.d/99force-ipv4

      - name: Configure apt to use HTTPS
        run: |
          # Update all archive URLs to use HTTPS
          sudo sed -i 's/http:/https:/g' /etc/apt/sources.list
          # Install apt-transport-https (might fail initially, hence the || true)
          sudo apt-get update || true
          sudo apt-get install -y apt-transport-https ca-certificates
          # Update again with HTTPS now configured
          sudo apt-get update

      - name: Install build dependencies
        run: |
          # Install base dependencies
          sudo apt-get install -y build-essential linux-headers-$(uname -r)

      - name: Build kernel module
        working-directory: module
        run: |
          # Try to compile and capture the warning message; this first pass is
          # allowed to fail because it only serves to probe the compiler.
          make 2>&1 | tee compile_output.txt || true
          # Extract gcc version from the warning message
          KERNEL_GCC_VERSION=$(grep "The kernel was built by:" compile_output.txt | grep -oP 'gcc-\K\d+' || echo "")
          echo "Detected kernel compiler version: ${KERNEL_GCC_VERSION}"
          # Install specific gcc version if detected
          if [ -n "$KERNEL_GCC_VERSION" ]; then
            echo "Installing gcc-${KERNEL_GCC_VERSION}"
            sudo apt-get install -y "gcc-${KERNEL_GCC_VERSION}"
            # Configure as default gcc
            sudo update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}" 100
            sudo update-alternatives --set gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}"
          else
            echo "Warning: Could not detect kernel compiler version"
          fi
          # Verify gcc version
          gcc --version
          # Now try the actual build
          make
          ls -l build/collector.ko

      # Purely diagnostic: every command is best-effort (|| true), so a host
      # without resctrl support does not fail the job here.
      - name: Check RDT Capabilities
        run: |
          sudo mkdir -p /sys/fs/resctrl || true
          sudo mount -t resctrl resctrl /sys/fs/resctrl || true
          echo "Mounting resctrl filesystem"
          mount | grep resctrl || true
          echo "Checking RDT capabilities"
          ls /sys/fs/resctrl/info || true
          echo "Monitoring features:"
          cat /sys/fs/resctrl/info/L3_MON/mon_features || true
          echo "Number of available RMIDs:"
          cat /sys/fs/resctrl/info/L3_MON/num_rmids || true
          echo "Number of CAT classes:"
          cat /sys/fs/resctrl/info/L3/num_closids || true
          echo "head -n 35 /proc/cpuinfo:"
          head -n 35 /proc/cpuinfo || true
          echo "CPU RDT features (head):"
          grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo | head || true
          # we do not unmount, maybe mounting affects the intel_cqm checks below
          #sudo umount /sys/fs/resctrl || true

      # continue-on-error lets the next step dump kernel logs before the job
      # is failed explicitly.
      - name: Load and test module
        id: load-and-test-module
        continue-on-error: true
        working-directory: module
        run: |
          # Check undefined symbols
          sudo modinfo -F depends build/collector.ko
          sudo objdump -d build/collector.ko | grep undefined || true
          # Load module
          echo "insmod build/collector.ko:"
          sudo insmod build/collector.ko
          # Verify module is loaded
          echo "lsmod | grep collector:"
          lsmod | grep collector
          # Check kernel logs for module initialization.
          # Reading and clearing the ring buffer needs root, hence sudo.
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true
          # Unload module
          echo "rmmod collector:"
          sudo rmmod collector
          # Verify module unloaded successfully
          echo "lsmod | grep collector:"
          # Display only: a '!'-negated pipeline never aborts the script, so
          # the explicit check below is what enforces the unload.
          ! lsmod | grep collector
          if lsmod | grep -q collector; then
            echo "Error: Module still loaded"
            exit 1
          fi
          # Check kernel logs for cleanup message
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true

      # Surfaces recent kernel messages, then fails the job that
      # continue-on-error kept alive above.
      - name: Check dmesg on failure
        if: steps.load-and-test-module.outcome == 'failure'
        run: |
          echo "load and test module failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1

      - name: Install trace dependencies
        run: |
          sudo apt-get install -y trace-cmd

      - name: Run module test script
        working-directory: module
        run: |
          # run 10 times in quick succession to stress-test insmod/rmmod and collector
          for i in {1..10}; do
            echo "*** Run $i:"
            ./test_module.sh
          done

  # Runs the runner action in stop mode for the label and instance recorded
  # by start-runner.
  stop-runner:
    name: Stop EC2 runner
    needs: [start-runner, test-module]
    runs-on: ubuntu-latest
    if: always()  # Run even if previous jobs fail
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Stop EC2 runner
        # NOTE(review): keep this reference identical to the one used in
        # start-runner; the originally pinned version could not be recovered.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}