# .github/workflows/test-kernel-module.yaml

name: Test Kernel Module

# Two ways to start this workflow: a manual dispatch with optional inputs, or
# a push to main that touches the kernel module or this workflow file.
on:
  # Manual trigger for testing
  workflow_dispatch:
    inputs:
      instance-type:
        description: 'EC2 instance type to use'
        type: string
        required: false
        default: 'm7i.metal-24xl'
      run-benchmarks:
        description: 'Run sync timer benchmarks'
        type: boolean
        required: false
        default: false
  push:
    branches:
      - main
    paths:
      - 'module/**'
      - '.github/workflows/test-kernel-module.yaml'

# Token scopes granted to every job. Any scope not listed here is set to
# "none" as soon as a permissions block is present, so both are spelled out.
permissions:
  id-token: write  # Required for requesting the JWT
  contents: read  # Required for actions/checkout to fetch the repository

jobs:
  # Launches the EC2 instance that hosts the self-hosted runner and publishes
  # its runner label and instance id for the jobs that follow.
  start-runner:
    name: Start EC2 runner
    runs-on: ubuntu-latest
    outputs:
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
      label: ${{ steps.start-ec2-runner.outputs.label }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: ${{ secrets.AWS_REGION }}
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          role-session-name: github-runner-session

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2.3.8
        with:
          mode: start
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          market-type: spot
          # Ubuntu 22.04 LTS in us-east-2
          ec2-image-id: ami-0884d2865dbe9de4b
          # m7i.metal-24xl for RDT, c5.9xlarge for perf support.
          # Push-triggered runs have no inputs and fall back to m7i.xlarge.
          ec2-instance-type: ${{ inputs.instance-type || 'm7i.xlarge' }}
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }}
          # Tags identifying which workflow run created the instance
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "github-runner"},
              {"Key": "Repository", "Value": "${{ github.repository }}"},
              {"Key": "Workflow", "Value": "${{ github.workflow }}"},
              {"Key": "RunId", "Value": "${{ github.run_id }}"},
              {"Key": "RunNumber", "Value": "${{ github.run_number }}"},
              {"Key": "SHA", "Value": "${{ github.sha }}"},
              {"Key": "Branch", "Value": "${{ github.ref_name }}"},
              {"Key": "Actor", "Value": "${{ github.actor }}"}
            ]

  # Builds and exercises the kernel module on the EC2 runner started above.
  test-module:
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    # Add timeout in case system hangs: 5 minutes when benchmarks were
    # requested, 2 minutes otherwise (including push-triggered runs).
    timeout-minutes: ${{ inputs.run-benchmarks == true && 5 || 2 }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Disable IPv6
        run: |
          # Switch IPv6 off through sysctl for the "all", "default" and
          # loopback interface configurations, in that order.
          for scope in all default lo; do
            sudo sysctl -w "net.ipv6.conf.${scope}.disable_ipv6=1"
          done

          # Make apt stick to IPv4 as well
          echo 'Acquire::ForceIPv4 "true";' | sudo tee /etc/apt/apt.conf.d/99force-ipv4

      - name: Configure apt to use HTTPS
        run: |
          # Rewrite every "http:" in the main sources list to "https:"
          sudo sed -i -e 's/http:/https:/g' /etc/apt/sources.list

          # The first index refresh is best-effort (it might fail before the
          # HTTPS transport packages are in place), hence the || true.
          sudo apt-get update || true
          sudo apt-get install -y ca-certificates apt-transport-https

          # Refresh again now that HTTPS is configured; this one must succeed
          sudo apt-get update

      - name: Install build dependencies
        run: |
          # Toolchain plus the headers that match the running kernel
          kernel_release="$(uname -r)"
          sudo apt-get install -y build-essential "linux-headers-${kernel_release}"

      - name: Build kernel module
        working-directory: module
        run: |
          # Probe build: its only purpose is to capture the compiler-mismatch
          # warning, so its outcome is deliberately ignored.
          make 2>&1 | tee compile_output.txt || true

          # Extract the gcc major version from the warning message. The
          # message can show up more than once in the build output, so keep
          # only the first match; otherwise the variable would hold several
          # lines and break the package and path names built from it below.
          KERNEL_GCC_VERSION=$(grep "The kernel was built by:" compile_output.txt | grep -oP 'gcc-\K\d+' | head -n 1 || true)
          echo "Detected kernel compiler version: ${KERNEL_GCC_VERSION}"

          # Install that specific gcc version if one was detected
          if [ -n "$KERNEL_GCC_VERSION" ]; then
            echo "Installing gcc-${KERNEL_GCC_VERSION}"
            sudo apt-get install -y "gcc-${KERNEL_GCC_VERSION}"

            # Register it with update-alternatives and select it as default gcc
            sudo update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}" 100
            sudo update-alternatives --set gcc "/usr/bin/gcc-${KERNEL_GCC_VERSION}"
          else
            echo "Warning: Could not detect kernel compiler version"
          fi

          # Show which gcc is now in effect
          gcc --version

          # The real build; a failure here fails the step
          make

          # Fails the step if the module object was not produced
          ls -l build/collector.ko

      - name: Run RMID allocator tests
        id: rmid-allocator-test
        # Failure is recorded in the step outcome and reported by a later step
        continue-on-error: true
        working-directory: module
        run: |
          # Write a marker into the kernel log so that only messages logged
          # after this point are inspected; anything already in the ring
          # buffer cannot influence this step's verdict.
          marker="ci-log-marker: rmid-allocator-test begin"
          echo "$marker" | sudo tee /dev/kmsg > /dev/null

          echo "Running RMID allocator unit tests..."
          chmod +x test_rmid_allocator.sh
          ./test_rmid_allocator.sh

          # Keep everything after the last occurrence of the marker. If the
          # marker is gone (log cleared or wrapped) the whole log is used.
          test_log=$(sudo dmesg | awk -v m="$marker" 'index($0, m) { buf = ""; next } { buf = buf $0 "\n" } END { printf "%s", buf }')

          # Check test results from dmesg
          echo "Test output from dmesg:"
          printf '%s\n' "$test_log" | grep "rmid_allocator_test" || true
          printf '%s\n' "$test_log" | grep "test_result:" || true

          # Fail if any test failed
          if printf '%s\n' "$test_log" | grep -q "test_result:.*:fail"; then
            echo "RMID allocator tests failed"
            exit 1
          fi

      - name: Run procfs tests
        id: procfs-test
        # Failure is recorded in the step outcome and reported by a later step
        continue-on-error: true
        working-directory: module
        run: |
          # Write a marker into the kernel log so that only messages logged
          # after this point are inspected. Without it, a "test_result: ...
          # :fail" line left behind by an earlier test step would make this
          # step fail as well.
          marker="ci-log-marker: procfs-test begin"
          echo "$marker" | sudo tee /dev/kmsg > /dev/null

          echo "Running procfs unit tests..."
          chmod +x test_procfs.sh
          ./test_procfs.sh

          # Keep everything after the last occurrence of the marker. If the
          # marker is gone (log cleared or wrapped) the whole log is used.
          test_log=$(sudo dmesg | awk -v m="$marker" 'index($0, m) { buf = ""; next } { buf = buf $0 "\n" } END { printf "%s", buf }')

          # Check test results from dmesg
          echo "Test output from dmesg:"
          printf '%s\n' "$test_log" | grep "procfs_test" || true
          printf '%s\n' "$test_log" | grep "test_result:" || true

          # Fail if any test failed
          if printf '%s\n' "$test_log" | grep -q "test_result:.*:fail"; then
            echo "procfs tests failed"
            exit 1
          fi

      - name: Run sync timer tests
        id: sync-timer-test
        # Failure is recorded in the step outcome and reported by a later step
        continue-on-error: true
        working-directory: module
        run: |
          # Write a marker into the kernel log so that only messages logged
          # after this point are inspected. Without it, a "test_result: ...
          # :fail" line left behind by an earlier test step would make this
          # step fail as well.
          marker="ci-log-marker: sync-timer-test begin"
          echo "$marker" | sudo tee /dev/kmsg > /dev/null

          echo "Running sync timer unit tests..."
          chmod +x test_sync_timer.sh
          ./test_sync_timer.sh

          # Keep everything after the last occurrence of the marker. If the
          # marker is gone (log cleared or wrapped) the whole log is used.
          test_log=$(sudo dmesg | awk -v m="$marker" 'index($0, m) { buf = ""; next } { buf = buf $0 "\n" } END { printf "%s", buf }')

          # Check test results from dmesg
          echo "Test output from dmesg:"
          printf '%s\n' "$test_log" | grep "sync_timer_test" || true
          printf '%s\n' "$test_log" | grep "test_result:" || true

          # Fail if any test failed
          if printf '%s\n' "$test_log" | grep -q "test_result:.*:fail"; then
            echo "sync timer tests failed"
            exit 1
          fi

      # The next three steps only run when the run-benchmarks input is true.
      - name: Install stress tools
        if: inputs.run-benchmarks
        run: sudo apt-get install -y stress-ng jq

      - name: Run benchmarks
        if: inputs.run-benchmarks
        working-directory: module
        run: |
          # The script's stdout goes to the CSV file, which is then echoed
          # into the job log.
          ./benchmark_sync_timer_stress.sh > benchmark_results.csv
          echo "Benchmark results:"
          cat benchmark_results.csv

      - name: Upload benchmark results
        if: inputs.run-benchmarks
        uses: actions/upload-artifact@v4
        with:
          path: module/benchmark_results.csv
          name: benchmark-results

      # The unit-test steps use continue-on-error, so each step below turns a
      # recorded failure into a job failure after dumping recent kernel logs.
      - name: Check dmesg on RMID test failure
        if: ${{ steps.rmid-allocator-test.outcome == 'failure' }}
        run: |
          echo "RMID allocator tests failed, showing last kernel messages:"
          sudo dmesg | tail --lines=100
          exit 1

      - name: Check dmesg on procfs test failure
        if: ${{ steps.procfs-test.outcome == 'failure' }}
        run: |
          echo "procfs tests failed, showing last kernel messages:"
          sudo dmesg | tail --lines=100
          exit 1

      - name: Check dmesg on sync timer test failure
        if: ${{ steps.sync-timer-test.outcome == 'failure' }}
        run: |
          echo "sync timer tests failed, showing last kernel messages:"
          sudo dmesg | tail --lines=100
          exit 1

      - name: Check RDT Capabilities
        run: |
          # Purely informational: every command is best-effort (|| true), so
          # missing resctrl support never fails the step.
          sudo mkdir -p /sys/fs/resctrl || true
          sudo mount -t resctrl resctrl /sys/fs/resctrl || true

          # Show whether the resctrl mount is present
          echo "Mounting resctrl filesystem"
          mount | grep resctrl || true

          # Contents of the resctrl info directory
          echo "Checking RDT capabilities"
          ls /sys/fs/resctrl/info || true

          # Values read from the L3_MON and L3 info directories
          echo "Monitoring features:"
          cat /sys/fs/resctrl/info/L3_MON/mon_features || true

          echo "Number of available RMIDs:"
          cat /sys/fs/resctrl/info/L3_MON/num_rmids || true

          echo "Number of CAT classes:"
          cat /sys/fs/resctrl/info/L3/num_closids || true

          # Start of /proc/cpuinfo, then the first lines mentioning RDT flags
          echo "head -n 35 /proc/cpuinfo:"
          head -n 35 /proc/cpuinfo || true

          echo "CPU RDT features (head):"
          grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo | head || true
      
      - name: Load and test module
        id: load-and-test-module
        # Failure is recorded in the step outcome and reported by the next step
        continue-on-error: true
        working-directory: module
        run: |
          # Show module dependencies and the symbols the module expects to be
          # resolved at load time. nm lists undefined symbols directly; a
          # disassembly (objdump -d) never contains them. Informational only.
          sudo modinfo -F depends build/collector.ko
          nm --undefined-only build/collector.ko || true

          # Load module
          echo "insmod build/collector.ko:"
          sudo insmod build/collector.ko

          # Verify module is loaded (grep failing here fails the step)
          echo "lsmod | grep collector:"
          lsmod | grep collector

          # Check kernel logs for module initialization.
          # dmesg -c prints and then clears the ring buffer; sudo because
          # reading and clearing the kernel log needs root on restricted
          # systems.
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true

          # Unload module
          echo "rmmod collector:"
          sudo rmmod collector

          # Verify module unloaded successfully.
          # The negated pipeline only prints a leftover entry; a "!" pipeline
          # never aborts the script, so the explicit check below decides.
          echo "lsmod | grep collector:"
          ! lsmod | grep collector
          if lsmod | grep -q collector; then
            echo "Error: Module still loaded"
            exit 1
          fi

          # Check kernel logs for cleanup message
          echo "dmesg | grep 'Memory Collector':"
          sudo dmesg -c | grep "Memory Collector" || true

      # Turns a recorded load/unload failure into a job failure after dumping
      # the most recent kernel messages.
      - name: Check dmesg on failure
        if: steps.load-and-test-module.outcome == 'failure'
        run: |
          echo "load and test module failed, showing last kernel messages:"
          sudo dmesg | tail -n 100
          exit 1

      - name: Install trace dependencies
        run: sudo apt-get install -y trace-cmd

      - name: Run module test script
        working-directory: module
        run: |
          # Ten back-to-back runs to stress-test insmod/rmmod and the
          # collector; the first failing run fails the step.
          for attempt in $(seq 1 10); do
            echo "*** Run $attempt:"
            ./test_module.sh
          done


  # Terminates the EC2 runner created by start-runner, whatever the outcome
  # of the jobs before it.
  stop-runner:
    name: Stop EC2 runner
    runs-on: ubuntu-latest
    needs: [start-runner, test-module]
    # Run even if previous jobs fail
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: ${{ secrets.AWS_REGION }}
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          role-session-name: github-runner-session

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2.3.8
        with:
          mode: stop
          github-token: ${{ secrets.REPO_ADMIN_TOKEN }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
          label: ${{ needs.start-runner.outputs.label }}