From 03e61230afa7c9467c4ab10ff9d6b0cd2db1c7d9 Mon Sep 17 00:00:00 2001 From: ___ Date: Mon, 21 Aug 2023 20:10:26 +0000 Subject: [PATCH 1/2] Auto-create tracing spans for log groups This will make it easy for CI users to trace and analyze their CI jobs without requiring additional work or scripting on their part. They can just look at tracing information on their tracing provider. --- agent/agent_configuration.go | 1 + clicommand/agent_start.go | 13 ++++++++++++- clicommand/bootstrap.go | 7 +++++++ internal/job/config.go | 3 +++ internal/job/executor.go | 1 + internal/job/shell/shell.go | 37 +++++++++++++++++++++++++++++++++--- 6 files changed, 58 insertions(+), 4 deletions(-) diff --git a/agent/agent_configuration.go b/agent/agent_configuration.go index 5381d24c4e..171a231d32 100644 --- a/agent/agent_configuration.go +++ b/agent/agent_configuration.go @@ -57,4 +57,5 @@ type AgentConfiguration struct { AcquireJob string TracingBackend string TracingServiceName string + TraceLogGroups bool } diff --git a/clicommand/agent_start.go b/clicommand/agent_start.go index 90e90bf34d..b0c14c0f4a 100644 --- a/clicommand/agent_start.go +++ b/clicommand/agent_start.go @@ -142,6 +142,7 @@ type AgentStartConfig struct { MetricsDatadogDistributions bool `cli:"metrics-datadog-distributions"` TracingBackend string `cli:"tracing-backend"` TracingServiceName string `cli:"tracing-service-name"` + TraceLogGroups bool `cli:"trace-log-groups"` // Global flags Debug bool `cli:"debug"` @@ -582,6 +583,11 @@ var AgentStartCommand = cli.Command{ EnvVar: "BUILDKITE_TRACING_SERVICE_NAME", Value: "buildkite-agent", }, + cli.BoolFlag{ + Name: "trace-log-groups", + Usage: "Automatically creates tracing spans for log groups when tracing is enabled.", + EnvVar: "BUILDKITE_TRACE_LOG_GROUPS", + }, cli.StringFlag{ Name: "job-verification-key-path", Usage: "Path to a file containing a verification key. Passing this flag enables job verification. For hmac-sha256, the raw file content is used as the shared key", @@ -800,9 +806,13 @@ var AgentStartCommand = cli.Command{ }) // Sense check supported tracing backends, we don't want bootstrapped jobs to silently have no tracing - if _, has := tracetools.ValidTracingBackends[cfg.TracingBackend]; !has { + _, tracingEnabled := tracetools.ValidTracingBackends[cfg.TracingBackend] + if !tracingEnabled { l.Fatal("The given tracing backend %q is not supported. Valid backends are: %q", cfg.TracingBackend, maps.Keys(tracetools.ValidTracingBackends)) } + if !tracingEnabled && cfg.TraceLogGroups { + l.Fatal("Log group tracing cannot be enabled without a tracing backend.") + } if experiments.IsEnabled(experiments.AgentAPI) { shutdown := runAgentAPI(ctx, l, cfg.SocketsPath) @@ -864,6 +874,7 @@ var AgentStartCommand = cli.Command{ AcquireJob: cfg.AcquireJob, TracingBackend: cfg.TracingBackend, TracingServiceName: cfg.TracingServiceName, + TraceLogGroups: cfg.TraceLogGroups, JobVerificationNoSignatureBehavior: cfg.JobVerificationNoSignatureBehavior, JobVerificationInvalidSignatureBehavior: cfg.JobVerificationInvalidSignatureBehavior, diff --git a/clicommand/bootstrap.go b/clicommand/bootstrap.go index 4f82758862..dde8a0195e 100644 --- a/clicommand/bootstrap.go +++ b/clicommand/bootstrap.go @@ -94,6 +94,7 @@ type BootstrapConfig struct { RedactedVars []string `cli:"redacted-vars" normalize:"list"` TracingBackend string `cli:"tracing-backend"` TracingServiceName string `cli:"tracing-service-name"` + TraceLogGroups bool `cli:"trace-log-groups"` } var BootstrapCommand = cli.Command{ @@ -355,6 +356,11 @@ var BootstrapCommand = cli.Command{ EnvVar: "BUILDKITE_TRACING_SERVICE_NAME", Value: "buildkite-agent", }, + cli.BoolFlag{ + Name: "trace-log-groups", + Usage: "Automatically creates tracing spans for log groups when tracing is enabled.", + EnvVar: "BUILDKITE_TRACE_LOG_GROUPS", + }, DebugFlag, LogLevelFlag, ExperimentsFlag, @@ -439,6 +445,7 @@ var BootstrapCommand = cli.Command{ Tag: cfg.Tag, TracingBackend: cfg.TracingBackend, TracingServiceName: cfg.TracingServiceName, + TraceLogGroups: cfg.TraceLogGroups, }) cctx, cancel := context.WithCancel(ctx) diff --git a/internal/job/config.go b/internal/job/config.go index ac52f9aaf6..2925479df5 100644 --- a/internal/job/config.go +++ b/internal/job/config.go @@ -164,6 +164,9 @@ type ExecutorConfig struct { // Service name to use when reporting traces. TracingServiceName string + + // Whether to auto-create tracing spans for log groups. + TraceLogGroups bool } // ReadFromEnvironment reads configuration from the Environment, returns a map diff --git a/internal/job/executor.go b/internal/job/executor.go index 2598fa0a6d..74213fd06e 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -82,6 +82,7 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) { e.shell.Debug = e.ExecutorConfig.Debug e.shell.InterruptSignal = e.ExecutorConfig.CancelSignal e.shell.SignalGracePeriod = e.ExecutorConfig.SignalGracePeriod + e.shell.TraceLogGroups = e.ExecutorConfig.TraceLogGroups } if experiments.IsEnabled(experiments.KubernetesExec) { kubernetesClient := &kubernetes.Client{} diff --git a/internal/job/shell/shell.go b/internal/job/shell/shell.go index 4654c772e7..5bccac0fa9 100644 --- a/internal/job/shell/shell.go +++ b/internal/job/shell/shell.go @@ -72,6 +72,9 @@ type Shell struct { // Amount of time to wait between sending the InterruptSignal and SIGKILL SignalGracePeriod time.Duration + + // Whether to auto create tracing spans for log groups + TraceLogGroups bool } // New returns a new Shell @@ -532,6 +535,32 @@ func round(d time.Duration) time.Duration { } } +// spanMakerWriter is an io.Writer shim that captures logs and automatically creates trace spans for the log group. +type spanMakerWriter struct { + w io.Writer + ctx context.Context + span opentracing.Span +} + +func (s *spanMakerWriter) Write(p []byte) (n int, err error) { + if bytes.HasPrefix(p, []byte("~~~ ")) || bytes.HasPrefix(p, []byte("--- ")) || bytes.HasPrefix(p, []byte("+++ ")) { + s.FinishIfActive() + operationName, _, _ := strings.Cut(string(p[4:]), "\r\n") + // We don't store the context bc we don't need to create child spans (yet). If we stored it, every log group would + // look like a child of the previous log group, where they're all more like siblings under the same parent span, + // since Buildkite itself has no concept of log group hierarchy. + s.span, _ = opentracing.StartSpanFromContext(s.ctx, operationName) + } + return s.w.Write(p) +} + +func (s *spanMakerWriter) FinishIfActive() { + if s.span != nil { + s.span.Finish() + s.span = nil + } +} + func (s *Shell) executeCommand( ctx context.Context, cmd *command, @@ -542,6 +571,8 @@ func (s *Shell) executeCommand( tracedEnv := env.FromSlice(cmd.Env) s.injectTraceCtx(ctx, tracedEnv) cmd.Env = tracedEnv.ToSlice() + logToSpanWriter := &spanMakerWriter{w: w, ctx: ctx, span: nil} + defer func() { logToSpanWriter.FinishIfActive() }() s.cmdLock.Lock() s.cmd = cmd @@ -561,11 +592,11 @@ func (s *Shell) executeCommand( // Modify process config based on execution flags if flags.PTY { cfg.PTY = true - cfg.Stdout = w + cfg.Stdout = logToSpanWriter } else { // Show stdout if requested or via debug if flags.Stdout { - cfg.Stdout = w + cfg.Stdout = logToSpanWriter } else if s.Debug { stdOutStreamer := NewLoggerStreamer(s.Logger) defer stdOutStreamer.Close() @@ -574,7 +605,7 @@ func (s *Shell) executeCommand( // Show stderr if requested or via debug if flags.Stderr { - cfg.Stderr = w + cfg.Stderr = logToSpanWriter } else if s.Debug { stdErrStreamer := NewLoggerStreamer(s.Logger) defer stdErrStreamer.Close() From 37d3ee993e44b8cc7e98f58d6de54cd7652afab7 Mon Sep 17 00:00:00 2001 From: ___ Date: Wed, 23 Aug 2023 16:14:10 +0000 Subject: [PATCH 2/2] Fix --- agent/job_runner.go | 1 + internal/job/shell/shell.go | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/agent/job_runner.go b/agent/job_runner.go index 2731afcf5e..eb851edc99 100644 --- a/agent/job_runner.go +++ b/agent/job_runner.go @@ -523,6 +523,7 @@ func (r *JobRunner) createEnvironment() ([]string, error) { if r.conf.AgentConfiguration.TracingBackend != "" { env["BUILDKITE_TRACING_BACKEND"] = r.conf.AgentConfiguration.TracingBackend env["BUILDKITE_TRACING_SERVICE_NAME"] = r.conf.AgentConfiguration.TracingServiceName + env["BUILDKITE_TRACE_LOG_GROUPS"] = fmt.Sprintf("%t", r.conf.AgentConfiguration.TraceLogGroups) } // see documentation for BuildkiteMessageMax diff --git a/internal/job/shell/shell.go b/internal/job/shell/shell.go index 5bccac0fa9..ec73e306da 100644 --- a/internal/job/shell/shell.go +++ b/internal/job/shell/shell.go @@ -571,8 +571,14 @@ func (s *Shell) executeCommand( tracedEnv := env.FromSlice(cmd.Env) s.injectTraceCtx(ctx, tracedEnv) cmd.Env = tracedEnv.ToSlice() - logToSpanWriter := &spanMakerWriter{w: w, ctx: ctx, span: nil} - defer func() { logToSpanWriter.FinishIfActive() }() + writer := w + writerCloser := func() {} + if s.TraceLogGroups { + logToSpanWriter := &spanMakerWriter{w: w, ctx: ctx, span: nil} + writer = logToSpanWriter + writerCloser = func() { logToSpanWriter.FinishIfActive() } + } + defer writerCloser() s.cmdLock.Lock() s.cmd = cmd @@ -592,11 +598,11 @@ func (s *Shell) executeCommand( // Modify process config based on execution flags if flags.PTY { cfg.PTY = true - cfg.Stdout = logToSpanWriter + cfg.Stdout = writer } else { // Show stdout if requested or via debug if flags.Stdout { - cfg.Stdout = logToSpanWriter + cfg.Stdout = writer } else if s.Debug { stdOutStreamer := NewLoggerStreamer(s.Logger) defer stdOutStreamer.Close() @@ -605,7 +611,7 @@ func (s *Shell) executeCommand( // Show stderr if requested or via debug if flags.Stderr { - cfg.Stderr = logToSpanWriter + cfg.Stderr = writer } else if s.Debug { stdErrStreamer := NewLoggerStreamer(s.Logger) defer stdErrStreamer.Close()