Merge pull request #88 from yonch/main #10

Workflow file for this run

.github/workflows/test-kernel-module.yaml at 1087e6f

	# Workflow identity and triggers: runs on manual dispatch (with an optional
	# EC2 instance type) and on pushes to main that touch the module sources or
	# this workflow file.
	name: Test Kernel Module
	on:
	  workflow_dispatch:  # Manual trigger for testing
	    inputs:
	      instance-type:
	        description: 'EC2 instance type to use'
	        required: false
	        default: 'm7i.metal-24xl'
	        type: string
	  push:
	    branches:
	      - main
	    paths:
	      - 'module/**'
	      - '.github/workflows/test-kernel-module.yaml'

	# Declaring any permission sets every unlisted GITHUB_TOKEN scope to none,
	# so read access to repository contents is granted explicitly for checkout.
	permissions:
	  id-token: write  # Required for requesting the JWT
	  contents: read  # Required for actions/checkout

	jobs:
	  # Starts an EC2 spot instance through machulav/ec2-github-runner and
	  # exports the runner label and instance id for the jobs that follow.
	  start-runner:
	    name: Start EC2 runner
	    runs-on: ubuntu-latest
	    outputs:
	      label: ${{ steps.start-ec2-runner.outputs.label }}
	      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
	    steps:
	      - name: Configure AWS credentials
	        uses: aws-actions/configure-aws-credentials@v4
	        with:
	          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
	          aws-region: ${{ secrets.AWS_REGION }}
	          role-session-name: github-runner-session

	      - name: Start EC2 runner
	        id: start-ec2-runner
	        # NOTE(review): the pinned patch version was lost when this file was
	        # extracted; v2 is the major tag shown in the action's README.
	        # Re-pin to the exact release once confirmed.
	        uses: machulav/ec2-github-runner@v2
	        with:
	          mode: start
	          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
	          ec2-image-id: ami-0884d2865dbe9de4b  # Ubuntu 22.04 LTS in us-east-2
	          # m7i.metal-24xl for RDT, c5.9xlarge for perf support.
	          # Push-triggered runs carry no inputs, so they use the fallback.
	          ec2-instance-type: ${{ inputs.instance-type || 'm7i.xlarge' }}
	          market-type: spot
	          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
	          security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }}
	          aws-resource-tags: >
	            [
	              {"Key": "Name", "Value": "github-runner"},
	              {"Key": "Repository", "Value": "${{ github.repository }}"},
	              {"Key": "Workflow", "Value": "${{ github.workflow }}"},
	              {"Key": "RunId", "Value": "${{ github.run_id }}"},
	              {"Key": "RunNumber", "Value": "${{ github.run_number }}"},
	              {"Key": "SHA", "Value": "${{ github.sha }}"},
	              {"Key": "Branch", "Value": "${{ github.ref_name }}"},
	              {"Key": "Actor", "Value": "${{ github.actor }}"}
	            ]

	  # Builds the kernel module on the EC2 runner, runs the RMID allocator
	  # tests, reports RDT capabilities, and exercises insmod/rmmod.
	  test-module:
	    needs: start-runner
	    runs-on: ${{ needs.start-runner.outputs.label }}
	    # NOTE(review): 2 minutes has to cover the apt installs, the build and
	    # ten test-script runs below; confirm this budget is intended.
	    timeout-minutes: 2  # Add timeout in case system hangs
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v4

	      - name: Disable IPv6
	        run: |
	          # Disable IPv6 via sysctl
	          sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
	          sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
	          sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1

	          # Force apt to use IPv4
	          echo 'Acquire::ForceIPv4 "true";' | sudo tee /etc/apt/apt.conf.d/99force-ipv4

	      - name: Configure apt to use HTTPS
	        run: |
	          # Update all archive URLs to use HTTPS
	          sudo sed -i 's/http:/https:/g' /etc/apt/sources.list

	          # Install apt-transport-https (might fail initially, hence the || true)
	          sudo apt-get update || true
	          sudo apt-get install -y apt-transport-https ca-certificates

	          # Update again with HTTPS now configured
	          sudo apt-get update

	      - name: Install build dependencies
	        run: |
	          # Install base dependencies
	          sudo apt-get install -y build-essential "linux-headers-$(uname -r)"

	      - name: Build kernel module
	        working-directory: module
	        run: |
	          # Try to compile and capture the warning message
	          make 2>&1 | tee compile_output.txt || true

	          # Extract gcc version from the warning message (first match only,
	          # so the value is always a single token)
	          KERNEL_GCC_VERSION=$(grep "The kernel was built by:" compile_output.txt | grep -oP 'gcc-\K\d+' | head -n 1 || echo "")
	          echo "Detected kernel compiler version: ${KERNEL_GCC_VERSION}"

	          # Install specific gcc version if detected
	          if [ -n "$KERNEL_GCC_VERSION" ]; then
	            echo "Installing gcc-${KERNEL_GCC_VERSION}"
	            sudo apt-get install -y "gcc-${KERNEL_GCC_VERSION}"

	            # Configure as default gcc
	            sudo update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}" 100
	            sudo update-alternatives --set gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}"
	          else
	            echo "Warning: Could not detect kernel compiler version"
	          fi

	          # Verify gcc version
	          gcc --version

	          # Now try the actual build
	          make

	          ls -l build/collector.ko

	      # continue-on-error lets the following step dump kernel messages
	      # before the job is failed.
	      - name: Run RMID allocator tests
	        id: rmid-allocator-test
	        continue-on-error: true
	        working-directory: module
	        run: |
	          echo "Running RMID allocator unit tests..."
	          chmod +x test_rmid_allocator.sh
	          ./test_rmid_allocator.sh

	          # Check test results from dmesg (sudo: an unprivileged dmesg read
	          # that is denied would make the failure check below pass silently)
	          echo "Test output from dmesg:"
	          sudo dmesg | grep "rmid_allocator_test" || true
	          sudo dmesg | grep "test_result:" || true

	          # Fail if any test failed
	          if sudo dmesg | grep -q "test_result:.*:fail"; then
	            echo "RMID allocator tests failed"
	            exit 1
	          fi

	      - name: Check dmesg on RMID test failure
	        if: steps.rmid-allocator-test.outcome == 'failure'
	        run: |
	          echo "RMID allocator tests failed, showing last kernel messages:"
	          sudo dmesg | tail -n 100
	          exit 1

	      # Informational only: every command tolerates failure.
	      - name: Check RDT Capabilities
	        run: |
	          sudo mkdir -p /sys/fs/resctrl || true
	          sudo mount -t resctrl resctrl /sys/fs/resctrl || true

	          echo "Mounting resctrl filesystem"
	          mount | grep resctrl || true

	          echo "Checking RDT capabilities"
	          ls /sys/fs/resctrl/info || true

	          echo "Monitoring features:"
	          cat /sys/fs/resctrl/info/L3_MON/mon_features || true

	          echo "Number of available RMIDs:"
	          cat /sys/fs/resctrl/info/L3_MON/num_rmids || true

	          echo "Number of CAT classes:"
	          cat /sys/fs/resctrl/info/L3/num_closids || true

	          echo "head -n 35 /proc/cpuinfo:"
	          head -n 35 /proc/cpuinfo || true

	          echo "CPU RDT features (head):"
	          grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo | head || true

	          # we do not unmount, maybe mounting affects the intel_cqm checks below
	          #sudo umount /sys/fs/resctrl || true

	      # continue-on-error lets the following step dump kernel messages
	      # before the job is failed.
	      - name: Load and test module
	        id: load-and-test-module
	        continue-on-error: true
	        working-directory: module
	        run: |

	          # Check undefined symbols
	          sudo modinfo -F depends build/collector.ko
	          sudo objdump -d build/collector.ko | grep undefined || true

	          # Load module
	          echo "insmod build/collector.ko:"
	          sudo insmod build/collector.ko

	          # Verify module is loaded
	          echo "lsmod | grep collector:"
	          lsmod | grep collector

	          # Check kernel logs for module initialization
	          # (dmesg -c clears the ring buffer, which needs root)
	          echo "dmesg | grep 'Memory Collector':"
	          sudo dmesg -c | grep "Memory Collector" || true

	          # Unload module
	          echo "rmmod collector:"
	          sudo rmmod collector

	          # Verify module unloaded successfully
	          echo "lsmod | grep collector:"
	          # Prints any leftover entry; a '!' pipeline does not trip errexit,
	          # so the explicit check below is what fails the step.
	          ! lsmod | grep collector
	          if lsmod | grep -q collector; then
	            echo "Error: Module still loaded"
	            exit 1
	          fi

	          # Check kernel logs for cleanup message
	          echo "dmesg | grep 'Memory Collector':"
	          sudo dmesg -c | grep "Memory Collector" || true

	      - name: Check dmesg on failure
	        if: steps.load-and-test-module.outcome == 'failure'
	        run: |
	          echo "load and test module failed, showing last kernel messages:"
	          sudo dmesg | tail -n 100
	          exit 1

	      - name: Install trace dependencies
	        run: |
	          sudo apt-get install -y trace-cmd

	      - name: Run module test script
	        working-directory: module
	        run: |
	          # run 10 times in quick succession to stress-test insmod/rmmod and collector
	          for i in {1..10}; do
	            echo "*** Run $i:"
	            ./test_module.sh
	          done

	  # Stops the EC2 runner identified by start-runner's outputs, whatever
	  # the outcome of the earlier jobs.
	  stop-runner:
	    name: Stop EC2 runner
	    needs: [start-runner, test-module]
	    runs-on: ubuntu-latest
	    if: always()  # Run even if previous jobs fail
	    steps:
	      - name: Configure AWS credentials
	        uses: aws-actions/configure-aws-credentials@v4
	        with:
	          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
	          aws-region: ${{ secrets.AWS_REGION }}
	          role-session-name: github-runner-session

	      - name: Stop EC2 runner
	        # NOTE(review): the pinned patch version was lost when this file was
	        # extracted; v2 is the major tag shown in the action's README.
	        # Re-pin to the exact release once confirmed.
	        uses: machulav/ec2-github-runner@v2
	        with:
	          mode: stop
	          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
	          label: ${{ needs.start-runner.outputs.label }}
	          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Merge pull request #88 from yonch/main #10

Workflow file

Merge pull request #88 from yonch/main #10

Jobs

Run details

Workflow file for this run