Skip to content

Commit

Permalink
Improve Windows builder retry diagnosability
Browse files Browse the repository at this point in the history
  • Loading branch information
dagood committed Jan 22, 2024
1 parent 34bc640 commit 6770686
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 73 deletions.
5 changes: 4 additions & 1 deletion eng/_core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,13 @@ The high-level execution flow looks roughly like this when running in CI:

* `eng/pipeline/jobs/run-stage.yml`
runs:
* `eng/run.ps1 run-builder -builder linux-amd64-test -junitfile [...]`
* `eng/run.ps1 run-builder -test -builder linux-amd64-test -junitfile [...]`
which runs the Go function:
* `gotestsum.Run(... eng/run.ps1 build -test -json ...)`
which runs and captures the output of:
* `eng/run.ps1 build -test -json`
which runs [`cmd/build/build.go`](cmd/build/build.go) in this module.

This is not currently used in our CI because this process seems to cut off
some test output:
[microsoft/go#1114](https://github.com/microsoft/go/issues/1114).
6 changes: 0 additions & 6 deletions eng/_core/buildutil/buildutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,6 @@ func MaxMakeRetryAttemptsOrExit() int {
return maxAttemptsOrExit("GO_MAKE_MAX_RETRY_ATTEMPTS")
}

// MaxTestRetryAttemptsOrExit returns the max test retry attempts according to an env var. Shared
// between the build command and run-builder command.
func MaxTestRetryAttemptsOrExit() int {
return maxAttemptsOrExit("GO_TEST_MAX_RETRY_ATTEMPTS")
}

func maxAttemptsOrExit(varName string) int {
attempts, err := getEnvIntOrDefault(varName, 1)
if err != nil {
Expand Down
45 changes: 19 additions & 26 deletions eng/_core/cmd/build/build.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ func main() {
flag.StringVar(&o.Experiment, "experiment", "", "Include this string in GOEXPERIMENT.")

o.MaxMakeAttempts = buildutil.MaxMakeRetryAttemptsOrExit()
o.MaxTestAttempts = buildutil.MaxTestRetryAttemptsOrExit()

flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), "Usage:\n")
Expand Down Expand Up @@ -87,7 +86,6 @@ type options struct {
Experiment string

MaxMakeAttempts int
MaxTestAttempts int
}

func build(o *options) error {
Expand Down Expand Up @@ -216,30 +214,25 @@ func build(o *options) error {
testCommandLine = append(testCommandLine, "-json")
}

test := func() error {
testCmd := exec.Command(testCommandLine[0], testCommandLine[1:]...)
testCmd.Stdout = os.Stdout
// Redirect stderr to stdout. We expect some lines of stderr to always show up during the
// test run, but "build"'s caller might not understand that.
//
// For example, if we're running in CI, gotestsum may be capturing our output to report in a
// JUnit file. If gotestsum detects output in stderr, it prints it in an error message. This
// error message stands out, and could mislead someone trying to diagnose a failed test run.
// Redirecting all stderr output avoids this scenario. (See /eng/_core/README.md for more
// info on why we may be wrapped by gotestsum.)
//
// An example of benign stderr output is when the tests check for machine capabilities. A
// Cgo static linking test emits "/usr/bin/ld: cannot find -lc" when it checks the
// capabilities of "ld" on the current system.
//
// The stderr output isn't used to determine whether the tests succeeded or not. (The
// redirect doesn't cause an issue where tests succeed that should have failed.)
testCmd.Stderr = os.Stdout

return runCmd(testCmd)
}

if err := buildutil.Retry(o.MaxTestAttempts, test); err != nil {
testCmd := exec.Command(testCommandLine[0], testCommandLine[1:]...)
testCmd.Stdout = os.Stdout
// Redirect stderr to stdout. We expect some lines of stderr to always show up during the
// test run, but "build"'s caller might not understand that.
//
// For example, if we're running in CI, gotestsum may be capturing our output to report in a
// JUnit file. If gotestsum detects output in stderr, it prints it in an error message. This
// error message stands out, and could mislead someone trying to diagnose a failed test run.
// Redirecting all stderr output avoids this scenario. (See /eng/_core/README.md for more
// info on why we may be wrapped by gotestsum.)
//
// An example of benign stderr output is when the tests check for machine capabilities. A
// Cgo static linking test emits "/usr/bin/ld: cannot find -lc" when it checks the
// capabilities of "ld" on the current system.
//
// The stderr output isn't used to determine whether the tests succeeded or not. (The
// redirect doesn't cause an issue where tests succeed that should have failed.)
testCmd.Stderr = os.Stdout
if err := runCmd(testCmd); err != nil {
return err
}
}
Expand Down
8 changes: 7 additions & 1 deletion eng/_core/cmd/cmdscan/cmdscan.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ import (
const description = `
Cmdscan runs the command given to it as args and monitors the output for text patterns that should
be elevated to AzDO Pipeline warnings using AzDO logging commands. Timeline events can be discovered
more easily in the UI and by automated tools like runfo.
more easily in the UI and by automated tools like runfo. This tool is also able to set a specified
AzDO variable to 'true' if the command succeeds, to help with retry implementation.
Uses the "log issue" pipeline logging command to create timeline warnings:
https://docs.microsoft.com/en-us/azure/devops/pipelines/scripts/logging-commands?view=azure-devops&tabs=bash#logissue-log-an-error-or-warning
Expand Down Expand Up @@ -67,6 +68,7 @@ var filters []filter
func main() {
help := flag.Bool("h", false, "Print this help message.")
prefix := flag.String("envprefix", "", "The env var prefix to use to find scan rules.")
successVar := flag.String("successvar", "", "The AzDO pipeline variable name to set to 'true' upon success.")

flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), "Usage:\n")
Expand Down Expand Up @@ -101,6 +103,10 @@ func main() {
if err := run(); err != nil {
log.Fatalln(err)
}

if *successVar != "" {
fmt.Printf("##vso[task.setvariable variable=%v]true\n", *successVar)
}
}

func parseFilter(name, envValue string) (*filter, error) {
Expand Down
20 changes: 14 additions & 6 deletions eng/_util/cmd/run-builder/run-builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ This command is used in CI to run a build/test/pack configuration.
Example: Build and run tests using the dev scripts:
eng/run.ps1 run-builder -builder linux-amd64-devscript
eng/run.ps1 run-builder -build -test -builder linux-amd64-devscript
For a list of builders that are run in CI, see 'azure-pipelines.yml'. This
doesn't include every builder that upstream uses. It also adds some builders
Expand All @@ -41,6 +41,9 @@ func main() {
var experiment = flag.String("experiment", "", "Include this string in GOEXPERIMENT.")
var fipsMode = flag.Bool("fipsmode", false, "Run the Go tests in FIPS mode.")
var jUnitFile = flag.String("junitfile", "", "Write a JUnit XML file to this path if this builder runs tests.")
var build = flag.Bool("build", false, "Run the build.")
var test = flag.Bool("test", false, "Run the tests.")

var help = flag.Bool("h", false, "Print this help message.")

flag.Usage = func() {
Expand Down Expand Up @@ -69,7 +72,6 @@ func main() {
goos, goarch, config := builderParts[0], builderParts[1], strings.Join(builderParts[2:], "-")
fmt.Printf("Found os '%s', arch '%s', config '%s'\n", goos, goarch, config)

maxTestRetries := buildutil.MaxTestRetryAttemptsOrExit()
// Scale this variable to increase timeout time based on scenario or builder speed.
timeoutScale := 1

Expand Down Expand Up @@ -111,8 +113,16 @@ func main() {
buildCmdline = append(buildCmdline, "-experiment", *experiment)
}

runOrPanic(buildCmdline...)
if *build {
runOrPanic(buildCmdline...)
} else {
fmt.Println("Skipping build: '-build' not passed.")
}

if !*test {
fmt.Println("Skipping tests: '-test' not passed.")
return
}
// After the build completes, run builder-specific commands.
switch config {
case "devscript":
Expand Down Expand Up @@ -180,9 +190,7 @@ func main() {
)
}

err := buildutil.Retry(maxTestRetries, func() error {
return runTest(cmdline, *jUnitFile)
})
err := runTest(cmdline, *jUnitFile)
// If we got an ExitError, the error message was already printed by the command. We just
// need to exit with the same exit code.
if exitErr, ok := err.(*exec.ExitError); ok {
Expand Down
4 changes: 4 additions & 0 deletions eng/pipeline/stages/builders-to-stages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ stages:
builder: ${{ builder }}
createSourceArchive: ${{ parameters.createSourceArchive }}
releaseVersion: ${{ parameters.releaseVersion }}
# Attempt to retry the build on Windows to mitigate flakiness:
# "Access Denied" during EXE copying and general flakiness during tests.
${{ if eq(builder.os, 'windows') }}:
retryAttempts: [1, 2, 3, 4, "FINAL"]

- ${{ if eq(parameters.sign, true) }}:
- template: pool.yml
Expand Down
84 changes: 51 additions & 33 deletions eng/pipeline/stages/run-stage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ parameters:
builder: {}
createSourceArchive: false
releaseVersion: ''
# List of retry attempt numbers. Pass a list of each index (values are only for display purposes),
# where the final element is "FINAL" instead. AzDO has limited looping capabilities, and this is a
# way around that.
retryAttempts: ["FINAL"]

stages:
- stage: ${{ parameters.builder.id }}
Expand Down Expand Up @@ -104,8 +108,6 @@ stages:
- pwsh: |
Write-Host "Increasing max build retries to mitigate 'Access denied' flakiness during EXE copying on Windows."
Write-Host "##vso[task.setvariable variable=GO_MAKE_MAX_RETRY_ATTEMPTS]5"
Write-Host "Increasing max test retries to mitigate access denied issues as well as general test flakiness on Windows."
Write-Host "##vso[task.setvariable variable=GO_TEST_MAX_RETRY_ATTEMPTS]5"
displayName: Increase 'make' retry attempts
# Initialize stage 0 toolset ahead of time so we can track timing data separately from the
Expand Down Expand Up @@ -192,21 +194,6 @@ stages:
- ${{ else }}:
- ${{ if eq(parameters.builder.os, 'windows') }}:
- pwsh: |
# Use the version of getmingw specified in the util module.
cd eng\_util
go install github.com/microsoft/go-infra/cmd/getmingw
& "$(go env GOPATH)/bin/getmingw" diagnose
& "$(go env GOPATH)/bin/getmingw" run -ci azdo -source nixman -version 13.2.0-rt_v11-rev0 -arch x86_64 -threading posix -exception seh -runtime msvcrt
displayName: Install MinGW
- pwsh: |
if ($IsWindows) {
Write-Host "Removing Git usr\bin from PATH to avoid running a Linux test that would fail, 'TestScript/script_wait'..."
Write-Host $env:PATH
$env:PATH = (
$env:PATH -split ';' | Where-Object { $_ -ne 'C:\Program Files\Git\usr\bin' }
) -join ';'
function RemovePathBinary($name) {
$src = (Get-Command $name -ErrorAction SilentlyContinue).Source
if ($src) {
Expand All @@ -224,26 +211,57 @@ stages:
# The Chocolatey shims are located in a single folder in PATH, so we can't change PATH to exclude it.
# Upstream Windows builders don't have SWIG installed, so this makes coverage even.
RemovePathBinary 'swig'
}
displayName: Remove unexpected tools
eng/run.ps1 cmdscan -envprefix GO_CMDSCAN_RULE_ -- `
pwsh eng/run.ps1 run-builder `
- pwsh: |
# Use the version of getmingw specified in the util module.
cd eng\_util
go install github.com/microsoft/go-infra/cmd/getmingw
& "$(go env GOPATH)/bin/getmingw" diagnose
& "$(go env GOPATH)/bin/getmingw" run -ci azdo -source nixman -version 13.2.0-rt_v11-rev0 -arch x86_64 -threading posix -exception seh -runtime msvcrt
displayName: Install MinGW
# Build. This includes retry logic internally if necessary for this builder.
- pwsh: |
eng/run.ps1 cmdscan -envprefix GO_CMDSCAN_RULE_ `
pwsh eng/run.ps1 run-builder -build `
-builder '${{ parameters.builder.os }}-${{ parameters.builder.arch }}-${{ parameters.builder.config }}' `
$(if ('${{ parameters.builder.experiment }}') { '-experiment'; '${{ parameters.builder.experiment }}' }) `
$(if ('${{ parameters.builder.fips }}') { '-fipsmode' }) `
-junitfile '$(Build.SourcesDirectory)/eng/artifacts/TestResults.xml'
displayName: Run ${{ parameters.builder.config }}
$(if ('${{ parameters.builder.fips }}') { '-fipsmode' })
displayName: Build
- task: PublishTestResults@2
displayName: Publish test results
condition: succeededOrFailed()
inputs:
testResultsFormat: JUnit
testResultsFiles: $(Build.SourcesDirectory)/eng/artifacts/TestResults.xml
testRunTitle: $(System.JobDisplayName)
buildPlatform: ${{ parameters.builder.arch }}
buildConfiguration: ${{ parameters.builder.config }}
publishRunAttachments: true
# Run each test retry attempt in its own step. Benefits over a single step:
#
# - The dev can immediately see whether any retries happened
# - Easier to navigate to a specific retry attempt
# - More stable viewing experience if job is ongoing
# - Automated tools can more easily check number of retries by relying on structure
#
# Test retries get their own steps but builds don't because builds are relatively quick,
# don't produce much output, and errors are easier to reproduce.
- ${{ each attempt in parameters.retryAttempts }}:
- pwsh: |
if ($IsWindows) {
Write-Host "Removing Git usr\bin from PATH to avoid running a Linux test that would fail, 'TestScript/script_wait'..."
Write-Host $env:PATH
$env:PATH = (
$env:PATH -split ';' | Where-Object { $_ -ne 'C:\Program Files\Git\usr\bin' }
) -join ';'
}
eng/run.ps1 cmdscan -envprefix GO_CMDSCAN_RULE_ -successvar TEST_BUILDER_SUCCESSFUL -- `
pwsh eng/run.ps1 run-builder -test `
-builder '${{ parameters.builder.os }}-${{ parameters.builder.arch }}-${{ parameters.builder.config }}' `
$(if ('${{ parameters.builder.experiment }}') { '-experiment'; '${{ parameters.builder.experiment }}' }) `
$(if ('${{ parameters.builder.fips }}') { '-fipsmode' })
${{ if eq(length(parameters.retryAttempts), 1) }}:
displayName: Test
${{ else }}:
displayName: Test (🔁 ${{ attempt }})
name: test_${{ attempt }}
condition: and(ne(variables['TEST_BUILDER_SUCCESSFUL'], 'true'), succeeded())
${{ if ne(attempt, 'FINAL') }}:
ignoreLASTEXITCODE: true
- ${{ if eq(parameters.builder.config, 'buildandpack' ) }}:
- template: ../../common/templates/steps/generate-sbom.yml
Expand Down

0 comments on commit 6770686

Please sign in to comment.