Improve Windows builder retry diagnosability

microsoft · Jan 22, 2024 · 6770686 · 6770686
1 parent 34bc640
commit 6770686
Show file tree

Hide file tree

Showing 7 changed files with 99 additions and 73 deletions.
diff --git a/eng/_core/README.md b/eng/_core/README.md
@@ -18,10 +18,13 @@ The high-level execution flow looks roughly like this when running in CI:
 
 * `eng/pipeline/jobs/run-stage.yml`  
   runs:
-* `eng/run.ps1 run-builder -builder linux-amd64-test -junitfile [...]`  
+* `eng/run.ps1 run-builder -test -builder linux-amd64-test -junitfile [...]`  
   which runs the Go function:
 * `gotestsum.Run(... eng/run.ps1 build -test -json ...)`  
   which runs and captures the output of:
 * `eng/run.ps1 build -test -json`  
   which runs [`cmd/build/build.go`](cmd/build/build.go) in this module.
 
+This is not currently used in our CI because this process seems to cut off
+some test output:
+[microsoft/go#1114](https://github.com/microsoft/go/issues/1114).
diff --git a/eng/_core/buildutil/buildutil.go b/eng/_core/buildutil/buildutil.go
@@ -39,12 +39,6 @@ func MaxMakeRetryAttemptsOrExit() int {
 	return maxAttemptsOrExit("GO_MAKE_MAX_RETRY_ATTEMPTS")
 }
 
-// MaxTestRetryAttemptsOrExit returns the max test retry attempts according to an env var. Shared
-// between the build command and run-builder command.
-func MaxTestRetryAttemptsOrExit() int {
-	return maxAttemptsOrExit("GO_TEST_MAX_RETRY_ATTEMPTS")
-}
-
 func maxAttemptsOrExit(varName string) int {
 	attempts, err := getEnvIntOrDefault(varName, 1)
 	if err != nil {

diff --git a/eng/_core/cmd/build/build.go b/eng/_core/cmd/build/build.go
@@ -54,7 +54,6 @@ func main() {
 	flag.StringVar(&o.Experiment, "experiment", "", "Include this string in GOEXPERIMENT.")
 
 	o.MaxMakeAttempts = buildutil.MaxMakeRetryAttemptsOrExit()
-	o.MaxTestAttempts = buildutil.MaxTestRetryAttemptsOrExit()
 
 	flag.Usage = func() {
 		fmt.Fprintf(flag.CommandLine.Output(), "Usage:\n")
@@ -87,7 +86,6 @@ type options struct {
 	Experiment string
 
 	MaxMakeAttempts int
-	MaxTestAttempts int
 }
 
 func build(o *options) error {
@@ -216,30 +214,25 @@ func build(o *options) error {
 			testCommandLine = append(testCommandLine, "-json")
 		}
 
-		test := func() error {
-			testCmd := exec.Command(testCommandLine[0], testCommandLine[1:]...)
-			testCmd.Stdout = os.Stdout
-			// Redirect stderr to stdout. We expect some lines of stderr to always show up during the
-			// test run, but "build"'s caller might not understand that.
-			//
-			// For example, if we're running in CI, gotestsum may be capturing our output to report in a
-			// JUnit file. If gotestsum detects output in stderr, it prints it in an error message. This
-			// error message stands out, and could mislead someone trying to diagnose a failed test run.
-			// Redirecting all stderr output avoids this scenario. (See /eng/_core/README.md for more
-			// info on why we may be wrapped by gotestsum.)
-			//
-			// An example of benign stderr output is when the tests check for machine capabilities. A
-			// Cgo static linking test emits "/usr/bin/ld: cannot find -lc" when it checks the
-			// capabilities of "ld" on the current system.
-			//
-			// The stderr output isn't used to determine whether the tests succeeded or not. (The
-			// redirect doesn't cause an issue where tests succeed that should have failed.)
-			testCmd.Stderr = os.Stdout
-
-			return runCmd(testCmd)
-		}
-
-		if err := buildutil.Retry(o.MaxTestAttempts, test); err != nil {
+		testCmd := exec.Command(testCommandLine[0], testCommandLine[1:]...)
+		testCmd.Stdout = os.Stdout
+		// Redirect stderr to stdout. We expect some lines of stderr to always show up during the
+		// test run, but "build"'s caller might not understand that.
+		//
+		// For example, if we're running in CI, gotestsum may be capturing our output to report in a
+		// JUnit file. If gotestsum detects output in stderr, it prints it in an error message. This
+		// error message stands out, and could mislead someone trying to diagnose a failed test run.
+		// Redirecting all stderr output avoids this scenario. (See /eng/_core/README.md for more
+		// info on why we may be wrapped by gotestsum.)
+		//
+		// An example of benign stderr output is when the tests check for machine capabilities. A
+		// Cgo static linking test emits "/usr/bin/ld: cannot find -lc" when it checks the
+		// capabilities of "ld" on the current system.
+		//
+		// The stderr output isn't used to determine whether the tests succeeded or not. (The
+		// redirect doesn't cause an issue where tests succeed that should have failed.)
+		testCmd.Stderr = os.Stdout
+		if err := runCmd(testCmd); err != nil {
 			return err
 		}
 	}

diff --git a/eng/_core/cmd/cmdscan/cmdscan.go b/eng/_core/cmd/cmdscan/cmdscan.go
@@ -21,7 +21,8 @@ import (
 const description = `
 Cmdscan runs the command given to it as args and monitors the output for text patterns that should
 be elevated to AzDO Pipeline warnings using AzDO logging commands. Timeline events can be discovered
-more easily in the UI and by automated tools like runfo.
+more easily in the UI and by automated tools like runfo. This tool is also able to set a specified
+AzDO variable to 'true' if the command succeeds, to help with retry implementation.
 
 Uses the "log issue" pipeline logging command to create timeline warnings:
 https://docs.microsoft.com/en-us/azure/devops/pipelines/scripts/logging-commands?view=azure-devops&tabs=bash#logissue-log-an-error-or-warning
@@ -67,6 +68,7 @@ var filters []filter
 func main() {
 	help := flag.Bool("h", false, "Print this help message.")
 	prefix := flag.String("envprefix", "", "The env var prefix to use to find scan rules.")
+	successVar := flag.String("successvar", "", "The AzDO pipeline variable name to set to 'true' upon success.")
 
 	flag.Usage = func() {
 		fmt.Fprintf(flag.CommandLine.Output(), "Usage:\n")
@@ -101,6 +103,10 @@ func main() {
 	if err := run(); err != nil {
 		log.Fatalln(err)
 	}
+
+	if *successVar != "" {
+		fmt.Printf("##vso[task.setvariable variable=%v]true\n", *successVar)
+	}
 }
 
 func parseFilter(name, envValue string) (*filter, error) {

diff --git a/eng/_util/cmd/run-builder/run-builder.go b/eng/_util/cmd/run-builder/run-builder.go
@@ -22,7 +22,7 @@ This command is used in CI to run a build/test/pack configuration.
 
 Example: Build and run tests using the dev scripts:
 
-  eng/run.ps1 run-builder -builder linux-amd64-devscript
+  eng/run.ps1 run-builder -build -test -builder linux-amd64-devscript
 
 For a list of builders that are run in CI, see 'azure-pipelines.yml'. This
 doesn't include every builder that upstream uses. It also adds some builders
@@ -41,6 +41,9 @@ func main() {
 	var experiment = flag.String("experiment", "", "Include this string in GOEXPERIMENT.")
 	var fipsMode = flag.Bool("fipsmode", false, "Run the Go tests in FIPS mode.")
 	var jUnitFile = flag.String("junitfile", "", "Write a JUnit XML file to this path if this builder runs tests.")
+	var build = flag.Bool("build", false, "Run the build.")
+	var test = flag.Bool("test", false, "Run the tests.")
+
 	var help = flag.Bool("h", false, "Print this help message.")
 
 	flag.Usage = func() {
@@ -69,7 +72,6 @@ func main() {
 	goos, goarch, config := builderParts[0], builderParts[1], strings.Join(builderParts[2:], "-")
 	fmt.Printf("Found os '%s', arch '%s', config '%s'\n", goos, goarch, config)
 
-	maxTestRetries := buildutil.MaxTestRetryAttemptsOrExit()
 	// Scale this variable to increase timeout time based on scenario or builder speed.
 	timeoutScale := 1
 
@@ -111,8 +113,16 @@ func main() {
 		buildCmdline = append(buildCmdline, "-experiment", *experiment)
 	}
 
-	runOrPanic(buildCmdline...)
+	if *build {
+		runOrPanic(buildCmdline...)
+	} else {
+		fmt.Println("Skipping build: '-build' not passed.")
+	}
 
+	if !*test {
+		fmt.Println("Skipping tests: '-test' not passed.")
+		return
+	}
 	// After the build completes, run builder-specific commands.
 	switch config {
 	case "devscript":
@@ -180,9 +190,7 @@ func main() {
 			)
 		}
 
-		err := buildutil.Retry(maxTestRetries, func() error {
-			return runTest(cmdline, *jUnitFile)
-		})
+		err := runTest(cmdline, *jUnitFile)
 		// If we got an ExitError, the error message was already printed by the command. We just
 		// need to exit with the same exit code.
 		if exitErr, ok := err.(*exec.ExitError); ok {

diff --git a/eng/pipeline/stages/builders-to-stages.yml b/eng/pipeline/stages/builders-to-stages.yml
@@ -24,6 +24,10 @@ stages:
             builder: ${{ builder }}
             createSourceArchive: ${{ parameters.createSourceArchive }}
             releaseVersion: ${{ parameters.releaseVersion }}
+            # Attempt to retry the build on Windows to mitigate flakiness:
+            # "Access Denied" during EXE copying and general flakiness during tests.
+            ${{ if eq(builder.os, 'windows') }}:
+              retryAttempts: [1, 2, 3, 4, "FINAL"]
 
   - ${{ if eq(parameters.sign, true) }}:
     - template: pool.yml

diff --git a/eng/pipeline/stages/run-stage.yml b/eng/pipeline/stages/run-stage.yml
@@ -9,6 +9,10 @@ parameters:
   builder: {}
   createSourceArchive: false
   releaseVersion: ''
+  # List of retry attempt numbers. Pass a list of each index (values are only for display purposes),
+  # where the final element is "FINAL" instead. AzDO has limited looping capabilities, and this is a
+  # way around that.
+  retryAttempts: ["FINAL"]
 
 stages:
   - stage: ${{ parameters.builder.id }}
@@ -104,8 +108,6 @@ stages:
             - pwsh: |
                 Write-Host "Increasing max build retries to mitigate 'Access denied' flakiness during EXE copying on Windows."
                 Write-Host "##vso[task.setvariable variable=GO_MAKE_MAX_RETRY_ATTEMPTS]5"
-                Write-Host "Increasing max test retries to mitigate access denied issues as well as general test flakiness on Windows."
-                Write-Host "##vso[task.setvariable variable=GO_TEST_MAX_RETRY_ATTEMPTS]5"
               displayName: Increase 'make' retry attempts
 
           # Initialize stage 0 toolset ahead of time so we can track timing data separately from the
@@ -192,21 +194,6 @@ stages:
           - ${{ else }}:
             - ${{ if eq(parameters.builder.os, 'windows') }}:
               - pwsh: |
-                  # Use the version of getmingw specified in the util module.
-                  cd eng\_util
-                  go install github.com/microsoft/go-infra/cmd/getmingw
-                  & "$(go env GOPATH)/bin/getmingw" diagnose
-                  & "$(go env GOPATH)/bin/getmingw" run -ci azdo -source nixman -version 13.2.0-rt_v11-rev0 -arch x86_64 -threading posix -exception seh -runtime msvcrt
-                displayName: Install MinGW
-
-            - pwsh: |
-                if ($IsWindows) {
-                  Write-Host "Removing Git usr\bin from PATH to avoid running a Linux test that would fail, 'TestScript/script_wait'..."
-                  Write-Host $env:PATH
-                  $env:PATH = (
-                    $env:PATH -split ';' | Where-Object { $_ -ne 'C:\Program Files\Git\usr\bin' }
-                  ) -join ';'
-
                   function RemovePathBinary($name) {
                     $src = (Get-Command $name -ErrorAction SilentlyContinue).Source
                     if ($src) {
@@ -224,26 +211,57 @@ stages:
                   # The Chocolatey shims are located in a single folder in PATH, so we can't change PATH to exclude it.
                   # Upstream Windows builders don't have SWIG installed, so this makes coverage even.
                   RemovePathBinary 'swig'
-                }
+                displayName: Remove unexpected tools
 
-                eng/run.ps1 cmdscan -envprefix GO_CMDSCAN_RULE_ -- `
-                  pwsh eng/run.ps1 run-builder `
+              - pwsh: |
+                  # Use the version of getmingw specified in the util module.
+                  cd eng\_util
+                  go install github.com/microsoft/go-infra/cmd/getmingw
+                  & "$(go env GOPATH)/bin/getmingw" diagnose
+                  & "$(go env GOPATH)/bin/getmingw" run -ci azdo -source nixman -version 13.2.0-rt_v11-rev0 -arch x86_64 -threading posix -exception seh -runtime msvcrt
+                displayName: Install MinGW
+
+            # Build. This includes retry logic internally if necessary for this builder.
+            - pwsh: |
+                eng/run.ps1 cmdscan -envprefix GO_CMDSCAN_RULE_ `
+                  pwsh eng/run.ps1 run-builder -build `
                     -builder '${{ parameters.builder.os }}-${{ parameters.builder.arch }}-${{ parameters.builder.config }}' `
                     $(if ('${{ parameters.builder.experiment }}') { '-experiment'; '${{ parameters.builder.experiment }}' }) `
-                    $(if ('${{ parameters.builder.fips }}') { '-fipsmode' }) `
-                    -junitfile '$(Build.SourcesDirectory)/eng/artifacts/TestResults.xml'
-              displayName: Run ${{ parameters.builder.config }}
+                    $(if ('${{ parameters.builder.fips }}') { '-fipsmode' })
+              displayName: Build
 
-            - task: PublishTestResults@2
-              displayName: Publish test results
-              condition: succeededOrFailed()
-              inputs:
-                testResultsFormat: JUnit
-                testResultsFiles: $(Build.SourcesDirectory)/eng/artifacts/TestResults.xml
-                testRunTitle: $(System.JobDisplayName)
-                buildPlatform: ${{ parameters.builder.arch }}
-                buildConfiguration: ${{ parameters.builder.config }}
-                publishRunAttachments: true
+            # Run each test retry attempt in its own step. Benefits over a single step:
+            #
+            # - The dev can immediately see whether any retries happened
+            # - Easier to navigate to a specific retry attempt
+            # - More stable viewing experience if job is ongoing
+            # - Automated tools can more easily check number of retries by relying on structure
+            #
+            # Test retries get their own steps but builds don't because builds are relatively quick,
+            # don't produce much output, and errors are easier to reproduce.
+            - ${{ each attempt in parameters.retryAttempts }}:
+              - pwsh: |
+                  if ($IsWindows) {
+                    Write-Host "Removing Git usr\bin from PATH to avoid running a Linux test that would fail, 'TestScript/script_wait'..."
+                    Write-Host $env:PATH
+                    $env:PATH = (
+                      $env:PATH -split ';' | Where-Object { $_ -ne 'C:\Program Files\Git\usr\bin' }
+                    ) -join ';'
+                  }
+
+                  eng/run.ps1 cmdscan -envprefix GO_CMDSCAN_RULE_ -successvar TEST_BUILDER_SUCCESSFUL -- `
+                    pwsh eng/run.ps1 run-builder -test `
+                      -builder '${{ parameters.builder.os }}-${{ parameters.builder.arch }}-${{ parameters.builder.config }}' `
+                      $(if ('${{ parameters.builder.experiment }}') { '-experiment'; '${{ parameters.builder.experiment }}' }) `
+                      $(if ('${{ parameters.builder.fips }}') { '-fipsmode' })
+                ${{ if eq(length(parameters.retryAttempts), 1) }}:
+                  displayName: Test
+                ${{ else }}:
+                  displayName: Test (🔁 ${{ attempt }})
+                name: test_${{ attempt }}
+                condition: and(ne(variables['TEST_BUILDER_SUCCESSFUL'], 'true'), succeeded())
+                ${{ if ne(attempt, 'FINAL') }}:
+                  ignoreLASTEXITCODE: true
 
           - ${{ if eq(parameters.builder.config, 'buildandpack' ) }}:
             - template: ../../common/templates/steps/generate-sbom.yml