Skip to content

Commit

Permalink
Add preview_datadog_agent_sampling (#6112)
Browse files Browse the repository at this point in the history
  • Loading branch information
BrynCooke authored and lrlna committed Dec 18, 2024
1 parent 3bdcacd commit 9b62604
Show file tree
Hide file tree
Showing 63 changed files with 3,880 additions and 1,260 deletions.
34 changes: 34 additions & 0 deletions .changesets/fix_bryn_datadog_agent_sampling.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
### Enable accurate Datadog APM metrics ([PR #6017](https://github.com/apollographql/router/pull/6017))

The router supports a new preview feature, the `preview_datadog_agent_sampling` option, to enable sending all spans to the Datadog Agent so APM metrics and views are accurate.

Previously, the sampler option in `telemetry.exporters.tracing.common.sampler` wasn't Datadog-aware. To get accurate Datadog APM metrics, all spans must be sent to the Datadog Agent with a `psr` or `sampling.priority` attribute set appropriately to record the sampling decision.

The `preview_datadog_agent_sampling` option enables accurate Datadog APM metrics. It should be used when exporting to the Datadog Agent, via OTLP or Datadog-native.

```yaml
telemetry:
  exporters:
    tracing:
      common:
        # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you!
        sampler: 0.1
        # Send all spans to the Datadog agent.
        preview_datadog_agent_sampling: true


```

Using these options can decrease your Datadog bill, because you will be sending only a percentage of spans from the Datadog Agent to Datadog.

> [!IMPORTANT]
> Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics. Users that have been using recent versions of the router will have to modify their configuration to retain full APM metrics.

> [!IMPORTANT]
> The router doesn't support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent).
> Configuring `traces_per_second` in the Datadog Agent won't dynamically adjust the router's sampling rate to meet the target rate.

> [!IMPORTANT]
> Sending all spans to the Datadog Agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and Datadog native exporters.

Learn more by reading the [updated Datadog tracing documentation](https://apollographql.com/docs/router/configuration/telemetry/exporters/tracing/datadog), which covers the configuration options and their implications.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
### Fix transmitted header value for Datadog priority sampling resolution ([PR #6017](https://github.com/apollographql/router/pull/6017))

The router now transmits correct values of `x-datadog-sampling-priority` to downstream services.

Previously, an `x-datadog-sampling-priority` of `-1` was incorrectly converted to `0` for downstream requests, and `2` was incorrectly converted to `1`. When propagating to downstream services, this resulted in values of `USER_REJECT` being incorrectly transmitted as `AUTO_REJECT`, and values of `USER_KEEP` being incorrectly transmitted as `AUTO_KEEP`.

Original file line number Diff line number Diff line change
Expand Up @@ -7405,6 +7405,12 @@ expression: "&schema"
"description": "Whether to use parent based sampling",
"type": "boolean"
},
"preview_datadog_agent_sampling": {
"default": null,
"description": "Use datadog agent sampling. This means that all spans will be sent to the Datadog agent and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog",
"nullable": true,
"type": "boolean"
},
"resource": {
"additionalProperties": {
"$ref": "#/definitions/AttributeValue",
Expand Down
30 changes: 29 additions & 1 deletion apollo-router/src/plugins/telemetry/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use super::*;
use crate::plugin::serde::deserialize_option_header_name;
use crate::plugins::telemetry::metrics;
use crate::plugins::telemetry::resource::ConfigResource;
use crate::plugins::telemetry::tracing::datadog::DatadogAgentSampling;
use crate::Configuration;

#[derive(thiserror::Error, Debug)]
Expand Down Expand Up @@ -347,6 +348,9 @@ pub(crate) struct TracingCommon {
pub(crate) service_namespace: Option<String>,
/// The sampler, always_on, always_off or a decimal between 0.0 and 1.0
pub(crate) sampler: SamplerOption,
/// Use datadog agent sampling. This means that all spans will be sent to the Datadog agent
/// and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog
pub(crate) preview_datadog_agent_sampling: Option<bool>,
/// Whether to use parent based sampling
pub(crate) parent_based_sampler: bool,
/// The maximum events per span before discarding
Expand Down Expand Up @@ -401,6 +405,7 @@ impl Default for TracingCommon {
service_name: Default::default(),
service_namespace: Default::default(),
sampler: default_sampler(),
preview_datadog_agent_sampling: None,
parent_based_sampler: default_parent_based_sampler(),
max_events_per_span: default_max_events_per_span(),
max_attributes_per_span: default_max_attributes_per_span(),
Expand Down Expand Up @@ -668,8 +673,15 @@ impl From<&TracingCommon> for opentelemetry::sdk::trace::Config {
if config.parent_based_sampler {
sampler = parent_based(sampler);
}
if config.preview_datadog_agent_sampling.unwrap_or_default() {
common = common.with_sampler(DatadogAgentSampling::new(
sampler,
config.parent_based_sampler,
));
} else {
common = common.with_sampler(sampler);
}

common = common.with_sampler(sampler);
common = common.with_max_events_per_span(config.max_events_per_span);
common = common.with_max_attributes_per_span(config.max_attributes_per_span);
common = common.with_max_links_per_span(config.max_links_per_span);
Expand All @@ -688,6 +700,22 @@ fn parent_based(sampler: opentelemetry::sdk::trace::Sampler) -> opentelemetry::s

impl Conf {
pub(crate) fn calculate_field_level_instrumentation_ratio(&self) -> Result<f64, Error> {
// Because when datadog is enabled the global sampling is overridden to always_on
if self
.exporters
.tracing
.common
.preview_datadog_agent_sampling
.unwrap_or_default()
{
let field_ratio = match &self.apollo.field_level_instrumentation_sampler {
SamplerOption::TraceIdRatioBased(ratio) => *ratio,
SamplerOption::Always(Sampler::AlwaysOn) => 1.0,
SamplerOption::Always(Sampler::AlwaysOff) => 0.0,
};

return Ok(field_ratio);
}
Ok(
match (
&self.exporters.tracing.common.sampler,
Expand Down
93 changes: 88 additions & 5 deletions apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -862,7 +862,21 @@ impl PluginPrivate for Telemetry {
// Only apply things if we were executing in the context of a vanilla Apollo executable.
// Users that are rolling their own routers will need to set up telemetry themselves.
if let Some(hot_tracer) = OPENTELEMETRY_TRACER_HANDLE.get() {
otel::layer::configure(&self.sampling_filter_ratio);
// If the datadog agent sampling is enabled, then we cannot presample the spans
// Therefore we set presampling to always on and let the regular sampler do the work.
// Effectively, we are disabling the presampling.
if self
.config
.exporters
.tracing
.common
.preview_datadog_agent_sampling
.unwrap_or_default()
{
otel::layer::configure(&SamplerOption::Always(Sampler::AlwaysOn));
} else {
otel::layer::configure(&self.sampling_filter_ratio);
}

// The reason that this has to happen here is that we are interacting with global state.
// If we do this logic during plugin init then if a subsequent plugin fails to init then we
Expand All @@ -885,7 +899,8 @@ impl PluginPrivate for Telemetry {

Self::checked_global_tracer_shutdown(last_provider);

opentelemetry::global::set_text_map_propagator(Self::create_propagator(&self.config));
let propagator = Self::create_propagator(&self.config);
opentelemetry::global::set_text_map_propagator(propagator);
}

activation.reload_metrics();
Expand Down Expand Up @@ -938,6 +953,9 @@ impl Telemetry {
if propagation.aws_xray {
propagators.push(Box::<opentelemetry_aws::XrayPropagator>::default());
}

// This propagator MUST come last because the user is trying to override the default behavior of the
// other propagators.
if let Some(from_request_header) = &propagation.request.header_name {
propagators.push(Box::new(CustomTraceIdPropagator::new(
from_request_header.to_string(),
Expand All @@ -955,9 +973,14 @@ impl Telemetry {
let spans_config = &config.instrumentation.spans;
let mut common = tracing_config.common.clone();
let mut sampler = common.sampler.clone();
// set it to AlwaysOn: it is now done in the SamplingFilter, so whatever is sent to an exporter
// should be accepted
common.sampler = SamplerOption::Always(Sampler::AlwaysOn);

// To enable pre-sampling to work we need to disable regular sampling.
// This is because the pre-sampler will sample the spans before they are sent to the regular sampler.
// If the datadog agent sampling is enabled, then we cannot pre-sample the spans, because even if the sampling decision is to drop,
// DatadogAgentSampling will modify the decision to RecordAndSample and instead use the sampling.priority attribute to decide if the span should be sampled or not.
if !common.preview_datadog_agent_sampling.unwrap_or_default() {
common.sampler = SamplerOption::Always(Sampler::AlwaysOn);
}

let mut builder =
opentelemetry::sdk::trace::TracerProvider::builder().with_config((&common).into());
Expand Down Expand Up @@ -2132,6 +2155,8 @@ mod tests {
use std::collections::HashMap;
use std::fmt::Debug;
use std::ops::DerefMut;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::time::Duration;
Expand Down Expand Up @@ -2189,6 +2214,7 @@ mod tests {
use crate::plugins::demand_control::COST_STRATEGY_KEY;
use crate::plugins::telemetry::config::TraceIdFormat;
use crate::plugins::telemetry::handle_error_internal;
use crate::plugins::telemetry::EnableSubgraphFtv1;
use crate::services::router::body::get_body_bytes;
use crate::services::RouterRequest;
use crate::services::RouterResponse;
Expand Down Expand Up @@ -2834,6 +2860,63 @@ mod tests {
.await;
}

/// Checks that every request is flagged for subgraph FTV1 when the plugin is built from
/// `testdata/config.field_instrumentation_sampler.router.yaml`.
///
/// A mocked supergraph service counts the requests whose context extensions contain
/// `EnableSubgraphFtv1`; after sending 10 requests the counter must equal 10.
#[tokio::test]
async fn test_field_instrumentation_sampler_with_preview_datadog_agent_sampling() {
// NOTE(review): the config file is presumably the one that enables
// `preview_datadog_agent_sampling` with a `field_level_instrumentation_sampler` of 1.0 —
// confirm against the testdata directory.
let plugin = create_plugin_with_config(include_str!(
"testdata/config.field_instrumentation_sampler.router.yaml"
))
.await;

// Shared counter: the clone is moved into the mock closure, the original is read by the
// final assertion.
let ftv1_counter = Arc::new(AtomicUsize::new(0));
let ftv1_counter_cloned = ftv1_counter.clone();

// The mock expects exactly 10 calls, matching the 10 requests sent by the loop below.
let mut mock_request_service = MockSupergraphService::new();
mock_request_service
.expect_call()
.times(10)
.returning(move |req: SupergraphRequest| {
// Count only the requests whose context extensions carry the `EnableSubgraphFtv1` marker.
if req
.context
.extensions()
.with_lock(|lock| lock.contains_key::<EnableSubgraphFtv1>())
{
ftv1_counter_cloned.fetch_add(1, Ordering::Relaxed);
}
// Canned 200 response; the test never inspects its payload.
Ok(SupergraphResponse::fake_builder()
.context(req.context)
.status_code(StatusCode::OK)
.header("content-type", "application/json")
.data(json!({"errors": [{"message": "nope"}]}))
.build()
.unwrap())
});
// Wrap the mock with the telemetry plugin's supergraph layer so the plugin sees each request
// before the mock does.
let mut request_supergraph_service =
plugin.supergraph_service(BoxService::new(mock_request_service));

// Send 10 identical requests through the wrapped service, awaiting the first response of each.
for _ in 0..10 {
let supergraph_req = SupergraphRequest::fake_builder()
.header("x-custom", "TEST")
.header("conditional-custom", "X")
.header("custom-length", "55")
.header("content-length", "55")
.header("content-type", "application/graphql")
.query("Query test { me {name} }")
.operation_name("test".to_string());
let _router_response = request_supergraph_service
.ready()
.await
.unwrap()
.call(supergraph_req.build().unwrap())
.await
.unwrap()
.next_response()
.await
.unwrap();
}
// It should be 100% because when we set preview_datadog_agent_sampling, we only take the value of field_level_instrumentation_sampler
assert_eq!(ftv1_counter.load(Ordering::Relaxed), 10);
}

#[tokio::test]
async fn test_subgraph_metrics_ok() {
async {
Expand Down
4 changes: 2 additions & 2 deletions apollo-router/src/plugins/telemetry/otel/layer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -678,13 +678,13 @@ pub(crate) fn configure(sampler: &SamplerOption) {
},
};

SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::Relaxed);
SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::SeqCst);
}

impl<S, T> OpenTelemetryLayer<S, T> {
fn sample(&self) -> bool {
let s: f64 = thread_rng().gen_range(0.0..=1.0);
s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::Relaxed))
s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::SeqCst))
}
}

Expand Down
9 changes: 2 additions & 7 deletions apollo-router/src/plugins/telemetry/otel/tracer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ use opentelemetry_sdk::trace::Tracer as SdkTracer;
use opentelemetry_sdk::trace::TracerProvider as SdkTracerProvider;

use super::OtelData;
use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState;

/// An interface for authors of OpenTelemetry SDKs to build pre-sampled tracers.
///
Expand Down Expand Up @@ -81,6 +80,7 @@ impl PreSampledTracer for SdkTracer {
let parent_cx = &data.parent_cx;
let builder = &mut data.builder;

// If we have a parent span that means we have a parent span coming from a propagator
// Gather trace state
let (trace_id, parent_trace_flags) = current_trace_state(builder, parent_cx, &provider);

Expand Down Expand Up @@ -159,12 +159,7 @@ fn process_sampling_result(
decision: SamplingDecision::RecordAndSample,
trace_state,
..
} => Some((
trace_flags | TraceFlags::SAMPLED,
trace_state
.with_priority_sampling(true)
.with_measuring(true),
)),
} => Some((trace_flags | TraceFlags::SAMPLED, trace_state.clone())),
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
telemetry:
  instrumentation:
    spans:
      mode: spec_compliant
  apollo:
    field_level_instrumentation_sampler: 1.0
  exporters:
    tracing:
      common:
        preview_datadog_agent_sampling: true
        sampler: 0.5
Loading

0 comments on commit 9b62604

Please sign in to comment.