Skip to content

Commit

Permalink
Add experimental_datadog_agent_sampling
Browse files Browse the repository at this point in the history
This mode will change the behaviour of the router for tracing in the following ways:

* Spans are never dropped, instead they are converted to RecordOnly.
* Spans that are sent to otlp and datadog exporters will always look like they have been sampled.
* The `sampling.priority` attribute is populated on spans.
* `psr` is populated in trace state.
* `m` is populated in trace state.
  • Loading branch information
bryn committed Oct 3, 2024
1 parent e91a612 commit a5f6f0a
Show file tree
Hide file tree
Showing 38 changed files with 2,488 additions and 331 deletions.
48 changes: 48 additions & 0 deletions .changesets/fix_bryn_datadog_upstream_sampling_decision_test.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
### Respect x-datadog-sampling-priority ([PR #6017](https://github.com/apollographql/router/pull/6017))

This PR consists of two fixes:
#### Datadog priority sampling resolution is not lost.

Previously a `x-datadog-sampling-priority` of `-1` would be converted to `0` for downstream requests and `2` would be converted to `1`.

#### The sampler option in the `telemetry.exporters.tracing.common.sampler` is not datadog aware.

To get accurate APM metrics all spans must be sent to the datadog agent with a `psr` or `sampling.priority` attribute set appropriately to record the sampling decision.

`preview_datadog_agent_sampling` option in the router.yaml enables this behavior and should be used when exporting to the datadog agent via OTLP or datadog native.

```yaml
telemetry:
exporters:
tracing:
common:
# Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you!
sampler: 0.1
# Send all spans to the Datadog agent.
preview_datadog_agent_sampling: true

# Example OTLP exporter configuration
otlp:
enabled: true
# Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario.
batch_processor:
max_concurrent_exports: 100

# Example Datadog native exporter configuration
datadog:
enabled: true

# Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario.
batch_processor:
max_concurrent_exports: 100
```
By using these options, you can decrease your Datadog bill as you will only be sending a percentage of spans from the Datadog agent to datadog.
> [!IMPORTANT]
> Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics.

> [!IMPORTANT]
> Sending all spans to the datadog agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and the Datadog native exporter.

By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/6017
Original file line number Diff line number Diff line change
Expand Up @@ -7306,6 +7306,12 @@ expression: "&schema"
"description": "Whether to use parent based sampling",
"type": "boolean"
},
"preview_datadog_agent_sampling": {
"default": null,
"description": "Use datadog agent sampling. This means that all spans will be sent to the Datadog agent and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog",
"nullable": true,
"type": "boolean"
},
"resource": {
"additionalProperties": {
"$ref": "#/definitions/AttributeValue",
Expand Down
30 changes: 29 additions & 1 deletion apollo-router/src/plugins/telemetry/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use super::*;
use crate::plugin::serde::deserialize_option_header_name;
use crate::plugins::telemetry::metrics;
use crate::plugins::telemetry::resource::ConfigResource;
use crate::plugins::telemetry::tracing::datadog::DatadogAgentSampling;
use crate::Configuration;

#[derive(thiserror::Error, Debug)]
Expand Down Expand Up @@ -347,6 +348,9 @@ pub(crate) struct TracingCommon {
pub(crate) service_namespace: Option<String>,
/// The sampler, always_on, always_off or a decimal between 0.0 and 1.0
pub(crate) sampler: SamplerOption,
/// Use datadog agent sampling. This means that all spans will be sent to the Datadog agent
/// and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog
pub(crate) preview_datadog_agent_sampling: Option<bool>,
/// Whether to use parent based sampling
pub(crate) parent_based_sampler: bool,
/// The maximum events per span before discarding
Expand Down Expand Up @@ -401,6 +405,7 @@ impl Default for TracingCommon {
service_name: Default::default(),
service_namespace: Default::default(),
sampler: default_sampler(),
preview_datadog_agent_sampling: None,
parent_based_sampler: default_parent_based_sampler(),
max_events_per_span: default_max_events_per_span(),
max_attributes_per_span: default_max_attributes_per_span(),
Expand Down Expand Up @@ -668,8 +673,15 @@ impl From<&TracingCommon> for opentelemetry::sdk::trace::Config {
if config.parent_based_sampler {
sampler = parent_based(sampler);
}
if config.preview_datadog_agent_sampling.unwrap_or_default() {
common = common.with_sampler(DatadogAgentSampling::new(
sampler,
config.parent_based_sampler,
));
} else {
common = common.with_sampler(sampler);
}

common = common.with_sampler(sampler);
common = common.with_max_events_per_span(config.max_events_per_span);
common = common.with_max_attributes_per_span(config.max_attributes_per_span);
common = common.with_max_links_per_span(config.max_links_per_span);
Expand All @@ -688,6 +700,22 @@ fn parent_based(sampler: opentelemetry::sdk::trace::Sampler) -> opentelemetry::s

impl Conf {
pub(crate) fn calculate_field_level_instrumentation_ratio(&self) -> Result<f64, Error> {
// Because when datadog is enabled the global sampling is overriden to always_on
if self
.exporters
.tracing
.common
.preview_datadog_agent_sampling
.unwrap_or_default()
{
let field_ratio = match &self.apollo.field_level_instrumentation_sampler {
SamplerOption::TraceIdRatioBased(ratio) => *ratio,
SamplerOption::Always(Sampler::AlwaysOn) => 1.0,
SamplerOption::Always(Sampler::AlwaysOff) => 0.0,
};

return Ok(field_ratio);
}
Ok(
match (
&self.exporters.tracing.common.sampler,
Expand Down
110 changes: 102 additions & 8 deletions apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,20 @@ impl Plugin for Telemetry {
.expect("otel error handler lock poisoned, fatal");

let mut config = init.config;
// This code would have enabled datadog agent sampling by default, but for now we will leave it as opt-in.
// If the datadog exporter is enabled then enable the agent sampler.
// If users are using otlp export then they will need to set this explicitly in their config.
//
// if config.exporters.tracing.datadog.enabled()
// && config
// .exporters
// .tracing
// .common
// .preview_datadog_agent_sampling
// .is_none()
// {
// config.exporters.tracing.common.preview_datadog_agent_sampling = Some(true);
// }
config.instrumentation.spans.update_defaults();
config.instrumentation.instruments.update_defaults();
config.exporters.logging.validate()?;
Expand Down Expand Up @@ -866,7 +880,21 @@ impl Telemetry {
// Only apply things if we were executing in the context of a vanilla the Apollo executable.
// Users that are rolling their own routers will need to set up telemetry themselves.
if let Some(hot_tracer) = OPENTELEMETRY_TRACER_HANDLE.get() {
otel::layer::configure(&self.sampling_filter_ratio);
// If the datadog agent sampling is enabled, then we cannot presample the spans
// Therefore we set presampling to always on and let the regular sampler do the work.
// Effectively, we are disabling the presampling.
if self
.config
.exporters
.tracing
.common
.preview_datadog_agent_sampling
.unwrap_or_default()
{
otel::layer::configure(&SamplerOption::Always(Sampler::AlwaysOn));
} else {
otel::layer::configure(&self.sampling_filter_ratio);
}

// The reason that this has to happen here is that we are interacting with global state.
// If we do this logic during plugin init then if a subsequent plugin fails to init then we
Expand All @@ -889,7 +917,8 @@ impl Telemetry {

Self::checked_global_tracer_shutdown(last_provider);

opentelemetry::global::set_text_map_propagator(Self::create_propagator(&self.config));
let propagator = Self::create_propagator(&self.config);
opentelemetry::global::set_text_map_propagator(propagator);
}

activation.reload_metrics();
Expand Down Expand Up @@ -934,9 +963,6 @@ impl Telemetry {
if propagation.zipkin || tracing.zipkin.enabled {
propagators.push(Box::<opentelemetry_zipkin::Propagator>::default());
}
if propagation.datadog || tracing.datadog.enabled() {
propagators.push(Box::<tracing::datadog_exporter::DatadogPropagator>::default());
}
if propagation.aws_xray {
propagators.push(Box::<opentelemetry_aws::XrayPropagator>::default());
}
Expand All @@ -946,6 +972,9 @@ impl Telemetry {
propagation.request.format.clone(),
)));
}
if propagation.datadog || tracing.datadog.enabled() {
propagators.push(Box::<tracing::datadog_exporter::DatadogPropagator>::default());
}

TextMapCompositePropagator::new(propagators)
}
Expand All @@ -957,9 +986,14 @@ impl Telemetry {
let spans_config = &config.instrumentation.spans;
let mut common = tracing_config.common.clone();
let mut sampler = common.sampler.clone();
// set it to AlwaysOn: it is now done in the SamplingFilter, so whatever is sent to an exporter
// should be accepted
common.sampler = SamplerOption::Always(Sampler::AlwaysOn);

// To enable pre-sampling to work we need to disable regular sampling.
// This is because the pre-sampler will sample the spans before they sent to the regular sampler
// If the datadog agent sampling is enabled, then we cannot pre-sample the spans because even if the sampling decision is made to drop
// DatadogAgentSampler will modify the decision to RecordAndSample and instead use the sampling.priority attribute to decide if the span should be sampled or not.
if !common.preview_datadog_agent_sampling.unwrap_or_default() {
common.sampler = SamplerOption::Always(Sampler::AlwaysOn);
}

let mut builder =
opentelemetry::sdk::trace::TracerProvider::builder().with_config((&common).into());
Expand Down Expand Up @@ -2130,6 +2164,8 @@ mod tests {
use std::collections::HashMap;
use std::fmt::Debug;
use std::ops::DerefMut;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::time::Duration;
Expand Down Expand Up @@ -2187,6 +2223,7 @@ mod tests {
use crate::plugins::demand_control::COST_STRATEGY_KEY;
use crate::plugins::telemetry::config::TraceIdFormat;
use crate::plugins::telemetry::handle_error_internal;
use crate::plugins::telemetry::EnableSubgraphFtv1;
use crate::services::router::body::get_body_bytes;
use crate::services::RouterRequest;
use crate::services::RouterResponse;
Expand Down Expand Up @@ -2832,6 +2869,63 @@ mod tests {
.await;
}

#[tokio::test]
async fn test_field_instrumentation_sampler_with_preview_datadog_agent_sampling() {
let plugin = create_plugin_with_config(include_str!(
"testdata/config.field_instrumentation_sampler.router.yaml"
))
.await;

let ftv1_counter = Arc::new(AtomicUsize::new(0));
let ftv1_counter_cloned = ftv1_counter.clone();

let mut mock_request_service = MockSupergraphService::new();
mock_request_service
.expect_call()
.times(10)
.returning(move |req: SupergraphRequest| {
if req
.context
.extensions()
.with_lock(|lock| lock.contains_key::<EnableSubgraphFtv1>())
{
ftv1_counter_cloned.fetch_add(1, Ordering::Relaxed);
}
Ok(SupergraphResponse::fake_builder()
.context(req.context)
.status_code(StatusCode::OK)
.header("content-type", "application/json")
.data(json!({"errors": [{"message": "nope"}]}))
.build()
.unwrap())
});
let mut request_supergraph_service =
plugin.supergraph_service(BoxService::new(mock_request_service));

for _ in 0..10 {
let supergraph_req = SupergraphRequest::fake_builder()
.header("x-custom", "TEST")
.header("conditional-custom", "X")
.header("custom-length", "55")
.header("content-length", "55")
.header("content-type", "application/graphql")
.query("Query test { me {name} }")
.operation_name("test".to_string());
let _router_response = request_supergraph_service
.ready()
.await
.unwrap()
.call(supergraph_req.build().unwrap())
.await
.unwrap()
.next_response()
.await
.unwrap();
}
// It should be 100% because when we set preview_datadog_agent_sampling, we only take the value of field_level_instrumentation_sampler
assert_eq!(ftv1_counter.load(Ordering::Relaxed), 10);
}

#[tokio::test]
async fn test_subgraph_metrics_ok() {
async {
Expand Down
4 changes: 2 additions & 2 deletions apollo-router/src/plugins/telemetry/otel/layer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -677,13 +677,13 @@ pub(crate) fn configure(sampler: &SamplerOption) {
},
};

SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::Relaxed);
SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::SeqCst);
}

impl<S, T> OpenTelemetryLayer<S, T> {
fn sample(&self) -> bool {
let s: f64 = thread_rng().gen_range(0.0..=1.0);
s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::Relaxed))
s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::SeqCst))
}
}

Expand Down
9 changes: 2 additions & 7 deletions apollo-router/src/plugins/telemetry/otel/tracer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ use opentelemetry_sdk::trace::Tracer as SdkTracer;
use opentelemetry_sdk::trace::TracerProvider as SdkTracerProvider;

use super::OtelData;
use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState;

/// An interface for authors of OpenTelemetry SDKs to build pre-sampled tracers.
///
Expand Down Expand Up @@ -81,6 +80,7 @@ impl PreSampledTracer for SdkTracer {
let parent_cx = &data.parent_cx;
let builder = &mut data.builder;

// If we have a parent span that means we have a parent span coming from a propagator
// Gather trace state
let (trace_id, parent_trace_flags) = current_trace_state(builder, parent_cx, &provider);

Expand Down Expand Up @@ -159,12 +159,7 @@ fn process_sampling_result(
decision: SamplingDecision::RecordAndSample,
trace_state,
..
} => Some((
trace_flags | TraceFlags::SAMPLED,
trace_state
.with_priority_sampling(true)
.with_measuring(true),
)),
} => Some((trace_flags | TraceFlags::SAMPLED, trace_state.clone())),
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
telemetry:
instrumentation:
spans:
mode: spec_compliant
apollo:
field_level_instrumentation_sampler: 1.0
exporters:
tracing:
common:
preview_datadog_agent_sampling: true
sampler: 0.5
Loading

0 comments on commit a5f6f0a

Please sign in to comment.