-
Notifications
You must be signed in to change notification settings - Fork 8
332 lines (279 loc) · 11.1 KB
/
test-kernel-module.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# Builds the kernel module on an ephemeral EC2 runner and runs its tests.
name: Test Kernel Module

on:
  # Manual trigger for testing
  workflow_dispatch:
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        required: false
        default: 'm7i.metal-24xl'
        type: string
      run-benchmarks:
        description: 'Run sync timer benchmarks'
        required: false
        default: false
        type: boolean

  # Automatic trigger: only pushes to main that touch the module sources
  # or this workflow file itself.
  push:
    branches:
      - main
    paths:
      - 'module/**'
      - '.github/workflows/test-kernel-module.yaml'

permissions:
  # Required for requesting the JWT
  id-token: write
jobs:
  # Starts a self-hosted runner on EC2 and exposes its runner label and
  # instance id to the jobs that follow.
  start-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): the action reference was unreadable in the reviewed
        # copy (name and version replaced by an e-mail placeholder). The
        # floating v2 tag is used here; re-pin to the exact release this
        # workflow was validated with.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          ec2-image-id: ami-0884d2865dbe9de4b  # Ubuntu 22.04 LTS in us-east-2
          # m7i.metal-24xl for RDT, c5.9xlarge for perf support.
          # Falls back to m7i.xlarge when no instance-type input is present.
          ec2-instance-type: ${{ inputs.instance-type || 'm7i.xlarge' }}
          market-type: spot
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }}
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "github-runner"},
              {"Key": "Repository", "Value": "${{ github.repository }}"},
              {"Key": "Workflow", "Value": "${{ github.workflow }}"},
              {"Key": "RunId", "Value": "${{ github.run_id }}"},
              {"Key": "RunNumber", "Value": "${{ github.run_number }}"},
              {"Key": "SHA", "Value": "${{ github.sha }}"},
              {"Key": "Branch", "Value": "${{ github.ref_name }}"},
              {"Key": "Actor", "Value": "${{ github.actor }}"}
            ]

  # Builds the module on the EC2 runner, runs the unit-test scripts,
  # optionally runs benchmarks, then load/unload-tests the module.
  test-module:
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    # Add timeout in case system hangs: 5 minutes when benchmarks are
    # requested, 2 minutes otherwise.
    timeout-minutes: ${{ inputs.run-benchmarks == true && 5 || 2 }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Disable IPv6
        run: |
          # Disable IPv6 via sysctl
          sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
          sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1
          # Force apt to use IPv4
          echo 'Acquire::ForceIPv4 "true";' | sudo tee /etc/apt/apt.conf.d/99force-ipv4

      - name: Configure apt to use HTTPS
        run: |
          # Update all archive URLs to use HTTPS
          sudo sed -i 's/http:/https:/g' /etc/apt/sources.list
          # Install apt-transport-https (might fail initially, hence the || true)
          sudo apt-get update || true
          sudo apt-get install -y apt-transport-https ca-certificates
          # Update again with HTTPS now configured
          sudo apt-get update

      - name: Install build dependencies
        run: |
          # Install base dependencies
          sudo apt-get install -y build-essential "linux-headers-$(uname -r)"

      - name: Build kernel module
        working-directory: module
        run: |
          # Try to compile and capture the warning message
          make 2>&1 | tee compile_output.txt || true
          # Extract gcc version from the warning message
          KERNEL_GCC_VERSION=$(grep "The kernel was built by:" compile_output.txt | grep -oP 'gcc-\K\d+' || echo "")
          echo "Detected kernel compiler version: ${KERNEL_GCC_VERSION}"
          # Install specific gcc version if detected
          if [ -n "$KERNEL_GCC_VERSION" ]; then
            echo "Installing gcc-${KERNEL_GCC_VERSION}"
            sudo apt-get install -y "gcc-${KERNEL_GCC_VERSION}"
            # Configure as default gcc
            sudo update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}" 100
            sudo update-alternatives --set gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}"
          else
            echo "Warning: Could not detect kernel compiler version"
          fi
          # Verify gcc version
          gcc --version
          # Now try the actual build
          make
          ls -l build/collector.ko

      # The three unit-test steps below use continue-on-error so that all
      # suites run; the "Check dmesg on ... failure" steps further down
      # turn a failed outcome into a job failure.
      #
      # NOTE(review): each step greps the whole kernel ring buffer for
      # "test_result:.*:fail", so a failure logged by an earlier suite also
      # matches in later suites unless the test scripts clear dmesg
      # themselves - confirm against the scripts.
      - name: Run RMID allocator tests
        id: rmid-allocator-test
        continue-on-error: true
        working-directory: module
        run: |
          echo "Running RMID allocator unit tests..."
          chmod +x test_rmid_allocator.sh
          ./test_rmid_allocator.sh
          # Check test results from dmesg (sudo: reading the kernel log can
          # be restricted to root, and a failed read would hide failures)
          echo "Test output from dmesg:"
          sudo dmesg | grep "rmid_allocator_test" || true
          sudo dmesg | grep "test_result:" || true
          # Fail if any test failed
          if sudo dmesg | grep -q "test_result:.*:fail"; then
            echo "RMID allocator tests failed"
            exit 1
          fi

      - name: Run procfs tests
        id: procfs-test
        continue-on-error: true
        working-directory: module
        run: |
          echo "Running procfs unit tests..."
          chmod +x test_procfs.sh
          ./test_procfs.sh
          # Check test results from dmesg (sudo: reading the kernel log can
          # be restricted to root, and a failed read would hide failures)
          echo "Test output from dmesg:"
          sudo dmesg | grep "procfs_test" || true
          sudo dmesg | grep "test_result:" || true
          # Fail if any test failed
          if sudo dmesg | grep -q "test_result:.*:fail"; then
            echo "procfs tests failed"
            exit 1
          fi

      - name: Run sync timer tests
        id: sync-timer-test
        continue-on-error: true
        working-directory: module
        run: |
          echo "Running sync timer unit tests..."
          chmod +x test_sync_timer.sh
          ./test_sync_timer.sh
          # Check test results from dmesg (sudo: reading the kernel log can
          # be restricted to root, and a failed read would hide failures)
          echo "Test output from dmesg:"
          sudo dmesg | grep "sync_timer_test" || true
          sudo dmesg | grep "test_result:" || true
          # Fail if any test failed
          if sudo dmesg | grep -q "test_result:.*:fail"; then
            echo "sync timer tests failed"
            exit 1
          fi

      - name: Install stress tools
        if: ${{ inputs.run-benchmarks }}
        run: |
          sudo apt-get install -y stress-ng jq

      - name: Run benchmarks
        if: ${{ inputs.run-benchmarks }}
        working-directory: module
        run: |
          ./benchmark_sync_timer_stress.sh > benchmark_results.csv
          echo "Benchmark results:"
          cat benchmark_results.csv

      - name: Upload benchmark results
        if: ${{ inputs.run-benchmarks }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: module/benchmark_results.csv

      - name: Check dmesg on RMID test failure
        if: steps.rmid-allocator-test.outcome == 'failure'
        run: |
          echo "RMID allocator tests failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1

      - name: Check dmesg on procfs test failure
        if: steps.procfs-test.outcome == 'failure'
        run: |
          echo "procfs tests failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1

      - name: Check dmesg on sync timer test failure
        if: steps.sync-timer-test.outcome == 'failure'
        run: |
          echo "sync timer tests failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1

      # Informational only: every command is guarded with || true, so this
      # step does not fail when resctrl/RDT is unavailable.
      - name: Check RDT Capabilities
        run: |
          sudo mkdir -p /sys/fs/resctrl || true
          sudo mount -t resctrl resctrl /sys/fs/resctrl || true
          echo "Mounting resctrl filesystem"
          mount | grep resctrl || true
          echo "Checking RDT capabilities"
          ls /sys/fs/resctrl/info || true
          echo "Monitoring features:"
          cat /sys/fs/resctrl/info/L3_MON/mon_features || true
          echo "Number of available RMIDs:"
          cat /sys/fs/resctrl/info/L3_MON/num_rmids || true
          echo "Number of CAT classes:"
          cat /sys/fs/resctrl/info/L3/num_closids || true
          echo "head -n 35 /proc/cpuinfo:"
          head -n 35 /proc/cpuinfo || true
          echo "CPU RDT features (head):"
          grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo | head || true

      - name: Load and test module
        id: load-and-test-module
        continue-on-error: true
        working-directory: module
        run: |
          # Check undefined symbols
          sudo modinfo -F depends build/collector.ko
          sudo objdump -d build/collector.ko | grep undefined || true
          # Load module
          echo "insmod build/collector.ko:"
          sudo insmod build/collector.ko
          # Verify module is loaded
          echo "lsmod | grep collector:"
          lsmod | grep collector
          # Check kernel logs for module initialization
          # (dmesg -c also clears the ring buffer, which needs root)
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true
          # Unload module
          echo "rmmod collector:"
          sudo rmmod collector
          # Verify module unloaded successfully
          echo "lsmod | grep collector:"
          # Display only: a pipeline negated with ! never aborts the script,
          # so the real check is the if statement that follows.
          ! lsmod | grep collector
          if lsmod | grep -q collector; then
            echo "Error: Module still loaded"
            exit 1
          fi
          # Check kernel logs for cleanup message
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true

      - name: Check dmesg on failure
        if: steps.load-and-test-module.outcome == 'failure'
        run: |
          echo "load and test module failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1

      - name: Install trace dependencies
        run: |
          sudo apt-get install -y trace-cmd

      - name: Run module test script
        working-directory: module
        run: |
          # run 10 times in quick succession to stress-test insmod/rmmod and collector
          for i in {1..10}; do
            echo "*** Run $i:"
            ./test_module.sh
          done

  # Stops the EC2 runner identified by the outputs of start-runner.
  stop-runner:
    name: Stop EC2 runner
    needs: [start-runner, test-module]
    runs-on: ubuntu-latest
    if: always()  # Run even if previous jobs fail
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ secrets.AWS_REGION }}
          role-session-name: github-runner-session

      - name: Stop EC2 runner
        # NOTE(review): the action reference was unreadable in the reviewed
        # copy; keep this tag identical to the one used in start-runner.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}