Skip to content

Commit

Permalink
Merge branch 'unvariance:main' into feature/unvariance#9-resctrl_supp…
Browse files Browse the repository at this point in the history
…ort_checker
  • Loading branch information
darshan-dedhia authored Feb 12, 2025
2 parents 528e4cb + 2bef892 commit fa833eb
Show file tree
Hide file tree
Showing 17 changed files with 1,279 additions and 0 deletions.
18 changes: 18 additions & 0 deletions .devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
  "name": "Kernel Module Development",
  "dockerFile": "Dockerfile.devcontainer",
  "workspaceFolder": "/workspace",
  "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.makefile-tools",
        "ms-azuretools.vscode-docker",
        "ms-vscode.cmake-tools"
      ]
    }
  },
  "remoteUser": "root"
}

4 changes: 4 additions & 0 deletions .github/workflows/Prometheus Integration Test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@ name: Prometheus Integration Test
# Run for pushes to main and for pull requests targeting main, but only when
# the change touches files under cmd/collector/.
on:
  push:
    branches:
      - main
    paths:
      - "cmd/collector/**"
  pull_request:
    branches:
      - main
    paths:
      - "cmd/collector/**"

jobs:

Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/docs-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ on:
# Trigger on pushes to main, restricted by the path filter below to changes
# under docs/ or to mkdocs.yml.
push:
branches:
- main
paths:
- docs/**
- mkdocs.yml
# Grants the workflow token write access to repository contents; scopes not
# listed here are not granted.
permissions:
contents: write
jobs:
Expand Down
123 changes: 123 additions & 0 deletions .github/workflows/get-resctrl-and-perf-info.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
name: Get Resctrl and Perf info
# Diagnostic workflow: starts an EC2 runner, prints perf, resctrl (RDT) and
# intel_cqm information from it, then stops the runner again.
on: workflow_dispatch  # Manual trigger for testing

# Add permissions needed for OIDC authentication
permissions:
  id-token: write  # Required for requesting the JWT

jobs:
  # Launches the EC2 instance and publishes the runner label and instance id
  # that the following jobs consume.
  start-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): this reference looks mangled by e-mail obfuscation in
        # the captured page; restore "machulav/ec2-github-runner@<pinned version>"
        # from the repository before use.
        uses: machulav/[email protected]
        with:
          mode: start
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          ec2-image-id: ami-0cb91c7de36eed2cb  # Ubuntu Server 24.04 LTS (HVM), SSD Volume Type
          ec2-instance-type: m7i.metal-24xl
          market-type: spot
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }}
          # The AMI above is Ubuntu, which ships apt rather than yum, so the
          # packages are installed with apt-get under their Ubuntu names.
          # libicu74 is the ICU package of Ubuntu 24.04; adjust it if the AMI
          # moves to another release.
          pre-runner-script: |
            sudo apt-get update && \
            sudo apt-get install -y docker.io git libicu74
            sudo systemctl enable docker
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "github-runner"},
              {"Key": "Repository", "Value": "${{ github.repository }}"},
              {"Key": "Workflow", "Value": "${{ github.workflow }}"},
              {"Key": "RunId", "Value": "${{ github.run_id }}"},
              {"Key": "RunNumber", "Value": "${{ github.run_number }}"},
              {"Key": "SHA", "Value": "${{ github.sha }}"},
              {"Key": "Branch", "Value": "${{ github.ref_name }}"},
              {"Key": "Actor", "Value": "${{ github.actor }}"}
            ]

  # Runs on the freshly started EC2 runner and prints the capability
  # information. Most probes end in "|| true" so a missing file does not abort
  # the remaining checks.
  do-job:
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    timeout-minutes: 10  # Bound the cost of the metal instance if a probe hangs
    steps:
      - name: List perf counters
        run: |
          perf list
      - name: List perf - detailed
        run: |
          perf list --long-desc --details || true
      - name: Check perf paranoid
        run: |
          ls /proc/sys/kernel/perf_event_paranoid
          cat /proc/sys/kernel/perf_event_paranoid
      - name: Check RDT Capabilities
        run: |
          sudo mkdir -p /sys/fs/resctrl
          sudo mount -t resctrl resctrl /sys/fs/resctrl || true
          echo "Mounting resctrl filesystem"
          mount | grep resctrl || true
          echo "Checking RDT capabilities"
          ls /sys/fs/resctrl/info || true
          echo "Monitoring features:"
          cat /sys/fs/resctrl/info/L3_MON/mon_features || true
          echo "Number of available RMIDs:"
          cat /sys/fs/resctrl/info/L3_MON/num_rmids || true
          echo "Number of CAT classes:"
          cat /sys/fs/resctrl/info/L3/num_closids || true
          echo "CPU RDT features:"
          grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo || true
          # we do not unmount, maybe mounting affects the intel_cqm checks below
          #sudo umount /sys/fs/resctrl || true
      - name: Check intel_cqm
        run: |
          echo "*** Listing /sys/devices/intel_cqm"
          ls -la /sys/devices/intel_cqm || true
          echo "*** Traversing /sys/devices/intel_cqm/events"
          find /sys/devices/intel_cqm/events || true
          echo "checking type"
          cat /sys/devices/intel_cqm/type || true
          echo "reading llc_occupancy"
          cat /sys/devices/intel_cqm/events/llc_occupancy || true
          cat /sys/devices/intel_cqm/events/llc_occupancy.scale || true
      # NOTE(review): this step is skipped when an earlier step fails; the
      # stop-runner job below still runs in that case.
      - name: Power off
        run: |
          shutdown --poweroff now

  # Stops the EC2 runner using the label and instance id from start-runner.
  stop-runner:
    name: Stop EC2 runner
    needs: [start-runner, do-job]
    runs-on: ubuntu-latest
    if: always()  # Run even if previous jobs fail
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Stop EC2 runner
        # NOTE(review): reference mangled in the capture, see start-runner.
        uses: machulav/[email protected]
        with:
          mode: stop
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
229 changes: 229 additions & 0 deletions .github/workflows/test-kernel-module.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
name: Test Kernel Module
# Builds the kernel module on a temporary EC2 runner, loads and unloads it,
# runs the module test script repeatedly, then stops the runner.
on:
  workflow_dispatch:  # Manual trigger for testing
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        required: false
        default: 'm7i.metal-24xl'
        type: string
  push:
    branches:
      - main
    paths:
      - "module/**"

# Listing any scope sets every unlisted scope to "none", so the read access
# that actions/checkout relies on has to be granted explicitly.
permissions:
  id-token: write  # Required for requesting the JWT
  contents: read  # Required for actions/checkout

jobs:
  # Launches the EC2 instance and publishes the runner label and instance id
  # that the following jobs consume.
  start-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): this reference looks mangled by e-mail obfuscation in
        # the captured page; restore "machulav/ec2-github-runner@<pinned version>"
        # from the repository before use.
        uses: machulav/[email protected]
        with:
          mode: start
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          ec2-image-id: ami-0884d2865dbe9de4b  # Ubuntu 22.04 LTS in us-east-2
          # inputs.instance-type is empty for push-triggered runs, hence the fallback.
          ec2-instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }}
          market-type: spot
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }}
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "github-runner"},
              {"Key": "Repository", "Value": "${{ github.repository }}"},
              {"Key": "Workflow", "Value": "${{ github.workflow }}"},
              {"Key": "RunId", "Value": "${{ github.run_id }}"},
              {"Key": "RunNumber", "Value": "${{ github.run_number }}"},
              {"Key": "SHA", "Value": "${{ github.sha }}"},
              {"Key": "Branch", "Value": "${{ github.ref_name }}"},
              {"Key": "Actor", "Value": "${{ github.actor }}"}
            ]

  # Builds, loads, unloads and stress-tests the module on the EC2 runner.
  test-module:
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    # NOTE(review): two minutes has to cover the apt installs, the build and
    # ten test runs; confirm this budget is sufficient.
    timeout-minutes: 2  # Add timeout in case system hangs
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Disable IPv6
        run: |
          # Disable IPv6 via sysctl
          sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1
          # Force apt to use IPv4
          echo 'Acquire::ForceIPv4 "true";' | sudo tee /etc/apt/apt.conf.d/99force-ipv4
      - name: Configure apt to use HTTPS
        run: |
          # Update all archive URLs to use HTTPS
          sudo sed -i 's/http:/https:/g' /etc/apt/sources.list
          # Install apt-transport-https (might fail initially, hence the || true)
          sudo apt-get update || true
          sudo apt-get install -y apt-transport-https ca-certificates
          # Update again with HTTPS now configured
          sudo apt-get update
      - name: Install build dependencies
        run: |
          # Install base dependencies
          sudo apt-get install -y build-essential "linux-headers-$(uname -r)"
      - name: Build kernel module
        working-directory: module
        run: |
          # Try to compile and capture the warning message
          make 2>&1 | tee compile_output.txt || true
          # Extract gcc version from the warning message
          KERNEL_GCC_VERSION=$(grep "The kernel was built by:" compile_output.txt | grep -oP 'gcc-\K\d+' || echo "")
          echo "Detected kernel compiler version: ${KERNEL_GCC_VERSION}"
          # Install specific gcc version if detected
          if [ -n "$KERNEL_GCC_VERSION" ]; then
            echo "Installing gcc-${KERNEL_GCC_VERSION}"
            sudo apt-get install -y "gcc-${KERNEL_GCC_VERSION}"
            # Configure as default gcc
            sudo update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}" 100
            sudo update-alternatives --set gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}"
          else
            echo "Warning: Could not detect kernel compiler version"
          fi
          # Verify gcc version
          gcc --version
          # Now try the actual build
          make
          ls -l build/collector.ko
      - name: Check RDT Capabilities
        run: |
          sudo mkdir -p /sys/fs/resctrl || true
          sudo mount -t resctrl resctrl /sys/fs/resctrl || true
          echo "Mounting resctrl filesystem"
          mount | grep resctrl || true
          echo "Checking RDT capabilities"
          ls /sys/fs/resctrl/info || true
          echo "Monitoring features:"
          cat /sys/fs/resctrl/info/L3_MON/mon_features || true
          echo "Number of available RMIDs:"
          cat /sys/fs/resctrl/info/L3_MON/num_rmids || true
          echo "Number of CAT classes:"
          cat /sys/fs/resctrl/info/L3/num_closids || true
          echo "head -n 35 /proc/cpuinfo:"
          head -n 35 /proc/cpuinfo || true
          echo "CPU RDT features (head):"
          grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo | head || true
          # we do not unmount, maybe mounting affects the intel_cqm checks below
          #sudo umount /sys/fs/resctrl || true
      # continue-on-error keeps the job alive so the next step can print the
      # kernel log before failing the job itself.
      - name: Load and test module
        id: load-and-test-module
        continue-on-error: true
        working-directory: module
        run: |
          # Check undefined symbols
          sudo modinfo -F depends build/collector.ko
          sudo objdump -d build/collector.ko | grep undefined || true
          # Load module
          echo "insmod build/collector.ko:"
          sudo insmod build/collector.ko
          # Verify module is loaded
          echo "lsmod | grep collector:"
          lsmod | grep collector
          # Check kernel logs for module initialization
          # (sudo for consistency with the failure step; -c clears the ring buffer)
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true
          # Unload module
          echo "rmmod collector:"
          sudo rmmod collector
          # Verify module unloaded successfully; a match is printed and fails the step
          echo "lsmod | grep collector:"
          if lsmod | grep collector; then
            echo "Error: Module still loaded"
            exit 1
          fi
          # Check kernel logs for cleanup message
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true
      - name: Check dmesg on failure
        if: steps.load-and-test-module.outcome == 'failure'
        run: |
          echo "load and test module failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1
      - name: Install trace dependencies
        run: |
          sudo apt-get install -y trace-cmd
      - name: Run module test script
        working-directory: module
        run: |
          # run 10 times in quick succession to stress-test insmod/rmmod and collector
          for i in {1..10}; do
            echo "*** Run $i:"
            ./test_module.sh
          done

  # Stops the EC2 runner using the label and instance id from start-runner.
  stop-runner:
    name: Stop EC2 runner
    needs: [start-runner, test-module]
    runs-on: ubuntu-latest
    if: always()  # Run even if previous jobs fail
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Stop EC2 runner
        # NOTE(review): reference mangled in the capture, see start-runner.
        uses: machulav/[email protected]
        with:
          mode: stop
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
Loading

0 comments on commit fa833eb

Please sign in to comment.