diff --git a/.changesets/fix_bryn_datadog_agent_sampling.md b/.changesets/fix_bryn_datadog_agent_sampling.md new file mode 100644 index 0000000000..50c2d6997d --- /dev/null +++ b/.changesets/fix_bryn_datadog_agent_sampling.md @@ -0,0 +1,34 @@ +### Enable accurate Datadog APM metrics ([PR #6017](https://github.com/apollographql/router/pull/6017)) + +The router supports a new preview feature, the `preview_datadog_agent_sampling` option, to enable sending all spans to the Datadog Agent so APM metrics and views are accurate. + +Previously, the sampler option in `telemetry.exporters.tracing.common.sampler` wasn't Datadog-aware. To get accurate Datadog APM metrics, all spans must be sent to the Datadog Agent with a `psr` or `sampling.priority` attribute set appropriately to record the sampling decision. + +The `preview_datadog_agent_sampling` option enables accurate Datadog APM metrics. It should be used when exporting to the Datadog Agent, via OTLP or Datadog-native. + +```yaml +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + sampler: 0.1 + # Send all spans to the Datadog agent. + preview_datadog_agent_sampling: true + + +``` + +Using these options can decrease your Datadog bill, because you will be sending only a percentage of spans from the Datadog Agent to Datadog. + +> [!IMPORTANT] +> Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics. Users that have been using recent versions of the router will have to modify their configuration to retain full APM metrics. + +> [!IMPORTANT] +> The router doesn't support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). +> Configuring `traces_per_second` in the Datadog Agent won't dynamically adjust the router's sampling rate to meet the target rate. 
+ +> [!IMPORTANT] +> Sending all spans to the Datadog Agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and Datadog native exporters. + +Learn more by reading the [updated Datadog tracing documentation](https://apollographql.com/docs/router/configuration/telemetry/exporters/tracing/datadog) for more information on configuration options and their implications. \ No newline at end of file diff --git a/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md b/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md new file mode 100644 index 0000000000..d05f173528 --- /dev/null +++ b/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md @@ -0,0 +1,6 @@ +### Fix transmitted header value for Datadog priority sampling resolution ([PR #6017](https://github.com/apollographql/router/pull/6017)) + +The router now transmits correct values of `x-datadog-sampling-priority` to downstream services. + +Previously, an `x-datadog-sampling-priority` of `-1` was incorrectly converted to `0` for downstream requests, and `2` was incorrectly converted to `1`. When propagating to downstream services, this resulted in values of `USER_REJECT` being incorrectly transmitted as `AUTO_REJECT`. + diff --git a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap index 2713e63dc5..36284c3c3c 100644 --- a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap +++ b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap @@ -7336,6 +7336,12 @@ expression: "&schema" "description": "Whether to use parent based sampling", "type": "boolean" }, + "preview_datadog_agent_sampling": { + "default": null, + "description": "Use datadog agent sampling. 
This means that all spans will be sent to the Datadog agent and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog", + "nullable": true, + "type": "boolean" + }, "resource": { "additionalProperties": { "$ref": "#/definitions/AttributeValue", diff --git a/apollo-router/src/plugins/telemetry/config.rs b/apollo-router/src/plugins/telemetry/config.rs index 4c9be01135..8dc84e85c0 100644 --- a/apollo-router/src/plugins/telemetry/config.rs +++ b/apollo-router/src/plugins/telemetry/config.rs @@ -24,6 +24,7 @@ use super::*; use crate::plugin::serde::deserialize_option_header_name; use crate::plugins::telemetry::metrics; use crate::plugins::telemetry::resource::ConfigResource; +use crate::plugins::telemetry::tracing::datadog::DatadogAgentSampling; use crate::Configuration; #[derive(thiserror::Error, Debug)] @@ -347,6 +348,9 @@ pub(crate) struct TracingCommon { pub(crate) service_namespace: Option, /// The sampler, always_on, always_off or a decimal between 0.0 and 1.0 pub(crate) sampler: SamplerOption, + /// Use datadog agent sampling. 
This means that all spans will be sent to the Datadog agent + /// and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog + pub(crate) preview_datadog_agent_sampling: Option, /// Whether to use parent based sampling pub(crate) parent_based_sampler: bool, /// The maximum events per span before discarding @@ -401,6 +405,7 @@ impl Default for TracingCommon { service_name: Default::default(), service_namespace: Default::default(), sampler: default_sampler(), + preview_datadog_agent_sampling: None, parent_based_sampler: default_parent_based_sampler(), max_events_per_span: default_max_events_per_span(), max_attributes_per_span: default_max_attributes_per_span(), @@ -668,8 +673,15 @@ impl From<&TracingCommon> for opentelemetry::sdk::trace::Config { if config.parent_based_sampler { sampler = parent_based(sampler); } + if config.preview_datadog_agent_sampling.unwrap_or_default() { + common = common.with_sampler(DatadogAgentSampling::new( + sampler, + config.parent_based_sampler, + )); + } else { + common = common.with_sampler(sampler); + } - common = common.with_sampler(sampler); common = common.with_max_events_per_span(config.max_events_per_span); common = common.with_max_attributes_per_span(config.max_attributes_per_span); common = common.with_max_links_per_span(config.max_links_per_span); @@ -688,6 +700,22 @@ fn parent_based(sampler: opentelemetry::sdk::trace::Sampler) -> opentelemetry::s impl Conf { pub(crate) fn calculate_field_level_instrumentation_ratio(&self) -> Result { + // Because when datadog is enabled the global sampling is overriden to always_on + if self + .exporters + .tracing + .common + .preview_datadog_agent_sampling + .unwrap_or_default() + { + let field_ratio = match &self.apollo.field_level_instrumentation_sampler { + SamplerOption::TraceIdRatioBased(ratio) => *ratio, + SamplerOption::Always(Sampler::AlwaysOn) => 1.0, + SamplerOption::Always(Sampler::AlwaysOff) => 0.0, + }; + + return Ok(field_ratio); + } 
Ok( match ( &self.exporters.tracing.common.sampler, diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 21f8993a93..e70b2cf55c 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -862,7 +862,21 @@ impl PluginPrivate for Telemetry { // Only apply things if we were executing in the context of a vanilla the Apollo executable. // Users that are rolling their own routers will need to set up telemetry themselves. if let Some(hot_tracer) = OPENTELEMETRY_TRACER_HANDLE.get() { - otel::layer::configure(&self.sampling_filter_ratio); + // If the datadog agent sampling is enabled, then we cannot presample the spans + // Therefore we set presampling to always on and let the regular sampler do the work. + // Effectively, we are disabling the presampling. + if self + .config + .exporters + .tracing + .common + .preview_datadog_agent_sampling + .unwrap_or_default() + { + otel::layer::configure(&SamplerOption::Always(Sampler::AlwaysOn)); + } else { + otel::layer::configure(&self.sampling_filter_ratio); + } // The reason that this has to happen here is that we are interacting with global state. // If we do this logic during plugin init then if a subsequent plugin fails to init then we @@ -885,7 +899,8 @@ impl PluginPrivate for Telemetry { Self::checked_global_tracer_shutdown(last_provider); - opentelemetry::global::set_text_map_propagator(Self::create_propagator(&self.config)); + let propagator = Self::create_propagator(&self.config); + opentelemetry::global::set_text_map_propagator(propagator); } activation.reload_metrics(); @@ -938,6 +953,9 @@ impl Telemetry { if propagation.aws_xray { propagators.push(Box::::default()); } + + // This propagator MUST come last because the user is trying to override the default behavior of the + // other propagators. 
if let Some(from_request_header) = &propagation.request.header_name { propagators.push(Box::new(CustomTraceIdPropagator::new( from_request_header.to_string(), @@ -955,9 +973,14 @@ impl Telemetry { let spans_config = &config.instrumentation.spans; let mut common = tracing_config.common.clone(); let mut sampler = common.sampler.clone(); - // set it to AlwaysOn: it is now done in the SamplingFilter, so whatever is sent to an exporter - // should be accepted - common.sampler = SamplerOption::Always(Sampler::AlwaysOn); + + // To enable pre-sampling to work we need to disable regular sampling. + // This is because the pre-sampler will sample the spans before they sent to the regular sampler + // If the datadog agent sampling is enabled, then we cannot pre-sample the spans because even if the sampling decision is made to drop + // DatadogAgentSampler will modify the decision to RecordAndSample and instead use the sampling.priority attribute to decide if the span should be sampled or not. + if !common.preview_datadog_agent_sampling.unwrap_or_default() { + common.sampler = SamplerOption::Always(Sampler::AlwaysOn); + } let mut builder = opentelemetry::sdk::trace::TracerProvider::builder().with_config((&common).into()); @@ -2132,6 +2155,8 @@ mod tests { use std::collections::HashMap; use std::fmt::Debug; use std::ops::DerefMut; + use std::sync::atomic::AtomicUsize; + use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; @@ -2189,6 +2214,7 @@ mod tests { use crate::plugins::demand_control::COST_STRATEGY_KEY; use crate::plugins::telemetry::config::TraceIdFormat; use crate::plugins::telemetry::handle_error_internal; + use crate::plugins::telemetry::EnableSubgraphFtv1; use crate::services::router::body::get_body_bytes; use crate::services::RouterRequest; use crate::services::RouterResponse; @@ -2834,6 +2860,63 @@ mod tests { .await; } + #[tokio::test] + async fn test_field_instrumentation_sampler_with_preview_datadog_agent_sampling() 
{ + let plugin = create_plugin_with_config(include_str!( + "testdata/config.field_instrumentation_sampler.router.yaml" + )) + .await; + + let ftv1_counter = Arc::new(AtomicUsize::new(0)); + let ftv1_counter_cloned = ftv1_counter.clone(); + + let mut mock_request_service = MockSupergraphService::new(); + mock_request_service + .expect_call() + .times(10) + .returning(move |req: SupergraphRequest| { + if req + .context + .extensions() + .with_lock(|lock| lock.contains_key::()) + { + ftv1_counter_cloned.fetch_add(1, Ordering::Relaxed); + } + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .status_code(StatusCode::OK) + .header("content-type", "application/json") + .data(json!({"errors": [{"message": "nope"}]})) + .build() + .unwrap()) + }); + let mut request_supergraph_service = + plugin.supergraph_service(BoxService::new(mock_request_service)); + + for _ in 0..10 { + let supergraph_req = SupergraphRequest::fake_builder() + .header("x-custom", "TEST") + .header("conditional-custom", "X") + .header("custom-length", "55") + .header("content-length", "55") + .header("content-type", "application/graphql") + .query("Query test { me {name} }") + .operation_name("test".to_string()); + let _router_response = request_supergraph_service + .ready() + .await + .unwrap() + .call(supergraph_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + } + // It should be 100% because when we set preview_datadog_agent_sampling, we only take the value of field_level_instrumentation_sampler + assert_eq!(ftv1_counter.load(Ordering::Relaxed), 10); + } + #[tokio::test] async fn test_subgraph_metrics_ok() { async { diff --git a/apollo-router/src/plugins/telemetry/otel/layer.rs b/apollo-router/src/plugins/telemetry/otel/layer.rs index 6beb2cc59c..241b253df4 100644 --- a/apollo-router/src/plugins/telemetry/otel/layer.rs +++ b/apollo-router/src/plugins/telemetry/otel/layer.rs @@ -678,13 +678,13 @@ pub(crate) fn configure(sampler: &SamplerOption) { }, 
}; - SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::Relaxed); + SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::SeqCst); } impl OpenTelemetryLayer { fn sample(&self) -> bool { let s: f64 = thread_rng().gen_range(0.0..=1.0); - s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::Relaxed)) + s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::SeqCst)) } } diff --git a/apollo-router/src/plugins/telemetry/otel/tracer.rs b/apollo-router/src/plugins/telemetry/otel/tracer.rs index 463fd8cb2c..6b11bab9ad 100644 --- a/apollo-router/src/plugins/telemetry/otel/tracer.rs +++ b/apollo-router/src/plugins/telemetry/otel/tracer.rs @@ -16,7 +16,6 @@ use opentelemetry_sdk::trace::Tracer as SdkTracer; use opentelemetry_sdk::trace::TracerProvider as SdkTracerProvider; use super::OtelData; -use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; /// An interface for authors of OpenTelemetry SDKs to build pre-sampled tracers. /// @@ -81,6 +80,7 @@ impl PreSampledTracer for SdkTracer { let parent_cx = &data.parent_cx; let builder = &mut data.builder; + // If we have a parent span that means we have a parent span coming from a propagator // Gather trace state let (trace_id, parent_trace_flags) = current_trace_state(builder, parent_cx, &provider); @@ -159,12 +159,7 @@ fn process_sampling_result( decision: SamplingDecision::RecordAndSample, trace_state, .. 
- } => Some(( - trace_flags | TraceFlags::SAMPLED, - trace_state - .with_priority_sampling(true) - .with_measuring(true), - )), + } => Some((trace_flags | TraceFlags::SAMPLED, trace_state.clone())), } } diff --git a/apollo-router/src/plugins/telemetry/testdata/config.field_instrumentation_sampler.router.yaml b/apollo-router/src/plugins/telemetry/testdata/config.field_instrumentation_sampler.router.yaml new file mode 100644 index 0000000000..54f4167b22 --- /dev/null +++ b/apollo-router/src/plugins/telemetry/testdata/config.field_instrumentation_sampler.router.yaml @@ -0,0 +1,11 @@ +telemetry: + instrumentation: + spans: + mode: spec_compliant + apollo: + field_level_instrumentation_sampler: 1.0 + exporters: + tracing: + common: + preview_datadog_agent_sampling: true + sampler: 0.5 \ No newline at end of file diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog/agent_sampling.rs b/apollo-router/src/plugins/telemetry/tracing/datadog/agent_sampling.rs new file mode 100644 index 0000000000..2fc04e94bd --- /dev/null +++ b/apollo-router/src/plugins/telemetry/tracing/datadog/agent_sampling.rs @@ -0,0 +1,376 @@ +use opentelemetry_api::trace::Link; +use opentelemetry_api::trace::SamplingDecision; +use opentelemetry_api::trace::SamplingResult; +use opentelemetry_api::trace::SpanKind; +use opentelemetry_api::trace::TraceId; +use opentelemetry_api::Key; +use opentelemetry_api::KeyValue; +use opentelemetry_api::OrderMap; +use opentelemetry_api::Value; +use opentelemetry_sdk::trace::ShouldSample; + +use crate::plugins::telemetry::tracing::datadog_exporter::propagator::SamplingPriority; +use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; + +/// The Datadog Agent Sampler +/// +/// This sampler overrides the sampling decision to ensure that spans are recorded even if they were originally dropped. +/// It performs the following tasks: +/// 1. Ensures the appropriate trace state is set +/// 2. 
Adds the sampling.priority attribute to the span +/// +/// The sampler can be configured to use parent-based sampling for consistent trace sampling. +/// +#[derive(Debug, Clone)] +pub(crate) struct DatadogAgentSampling { + /// The underlying sampler used for initial sampling decisions + pub(crate) sampler: opentelemetry::sdk::trace::Sampler, + /// Flag to enable parent-based sampling for consistent trace sampling + pub(crate) parent_based_sampler: bool, +} +impl DatadogAgentSampling { + /// Creates a new DatadogAgentSampling instance + /// + /// # Arguments + /// * `sampler` - The underlying sampler to use for initial sampling decisions + /// * `parent_based_sampler` - Whether to use parent-based sampling for consistent trace sampling + pub(crate) fn new( + sampler: opentelemetry::sdk::trace::Sampler, + parent_based_sampler: bool, + ) -> Self { + Self { + sampler, + parent_based_sampler, + } + } +} + +impl ShouldSample for DatadogAgentSampling { + fn should_sample( + &self, + parent_context: Option<&opentelemetry_api::Context>, + trace_id: TraceId, + name: &str, + span_kind: &SpanKind, + attributes: &OrderMap, + links: &[Link], + ) -> SamplingResult { + let mut result = self.sampler.should_sample( + parent_context, + trace_id, + name, + span_kind, + attributes, + links, + ); + // Override the sampling decision to record and make sure that the trace state is set correctly + // if either parent sampling is disabled or it has not been populated by a propagator. + // The propagator gets first dibs on setting the trace state, so if it sets it, we don't override it unless we are not parent based. 
+ match result.decision { + SamplingDecision::Drop | SamplingDecision::RecordOnly => { + result.decision = SamplingDecision::RecordOnly; + if !self.parent_based_sampler || result.trace_state.sampling_priority().is_none() { + result.trace_state = result + .trace_state + .with_priority_sampling(SamplingPriority::AutoReject) + } + } + SamplingDecision::RecordAndSample => { + if !self.parent_based_sampler || result.trace_state.sampling_priority().is_none() { + result.trace_state = result + .trace_state + .with_priority_sampling(SamplingPriority::AutoKeep) + } + } + } + + // We always want to measure + result.trace_state = result.trace_state.with_measuring(true); + // We always want to set the sampling.priority attribute in case we are communicating with the agent via otlp. + // Reverse engineered from https://github.com/DataDog/datadog-agent/blob/c692f62423f93988b008b669008f9199a5ad196b/pkg/trace/api/otlp.go#L502 + result.attributes.push(KeyValue::new( + "sampling.priority", + Value::I64( + result + .trace_state + .sampling_priority() + .expect("sampling priority") + .as_i64(), + ), + )); + result + } +} +#[cfg(test)] +mod tests { + use buildstructor::Builder; + use opentelemetry::sdk::trace::Sampler; + use opentelemetry::trace::TraceState; + use opentelemetry_api::trace::Link; + use opentelemetry_api::trace::SamplingDecision; + use opentelemetry_api::trace::SamplingResult; + use opentelemetry_api::trace::SpanContext; + use opentelemetry_api::trace::SpanId; + use opentelemetry_api::trace::SpanKind; + use opentelemetry_api::trace::TraceContextExt; + use opentelemetry_api::trace::TraceFlags; + use opentelemetry_api::trace::TraceId; + use opentelemetry_api::Context; + use opentelemetry_api::Key; + use opentelemetry_api::OrderMap; + use opentelemetry_api::Value; + use opentelemetry_sdk::trace::ShouldSample; + + use crate::plugins::telemetry::tracing::datadog::DatadogAgentSampling; + use crate::plugins::telemetry::tracing::datadog_exporter::propagator::SamplingPriority; + 
use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; + + #[derive(Debug, Clone, Builder)] + struct StubSampler { + decision: SamplingDecision, + } + + impl ShouldSample for StubSampler { + fn should_sample( + &self, + _parent_context: Option<&Context>, + _trace_id: TraceId, + _name: &str, + _span_kind: &SpanKind, + _attributes: &OrderMap, + _links: &[Link], + ) -> SamplingResult { + SamplingResult { + decision: self.decision.clone(), + attributes: Vec::new(), + trace_state: Default::default(), + } + } + } + + #[test] + fn test_should_sample_drop() { + // Test case where the sampling decision is Drop + let sampler = StubSampler::builder() + .decision(SamplingDecision::Drop) + .build(); + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), false); + + let result = datadog_sampler.should_sample( + None, + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Verify that the decision is RecordOnly (converted from Drop) + assert_eq!(result.decision, SamplingDecision::RecordOnly); + // Verify that the sampling priority is set to AutoReject + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoReject) + ); + // Verify that the sampling.priority attribute is set correctly + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_should_sample_record_only() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordOnly) + .build(); + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), false); + + let result = datadog_sampler.should_sample( + None, + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record only 
should remain as record only + assert_eq!(result.decision, SamplingDecision::RecordOnly); + + // Verify that the sampling priority is set to AutoReject so the trace won't be transmitted to Datadog + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoReject) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_should_sample_record_and_sample() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordAndSample) + .build(); + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), false); + + let result = datadog_sampler.should_sample( + None, + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record and sample should remain as record and sample + assert_eq!(result.decision, SamplingDecision::RecordAndSample); + + // Verify that the sampling priority is set to AutoKeep so the trace will be transmitted to Datadog + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoKeep) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoKeep.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_should_sample_with_parent_based_sampler() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordAndSample) + .build(); + + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), true); + + let result = datadog_sampler.should_sample( + Some(&Context::new()), + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record and sample 
should remain as record and sample + assert_eq!(result.decision, SamplingDecision::RecordAndSample); + + // Verify that the sampling priority is set to AutoKeep so the trace will be transmitted to Datadog + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoKeep) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoKeep.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_trace_state_already_populated_record_and_sample() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordAndSample) + .build(); + + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), true); + + let result = datadog_sampler.should_sample( + Some(&Context::new().with_remote_span_context(SpanContext::new( + TraceId::from_u128(1), + SpanId::from_u64(1), + TraceFlags::SAMPLED, + true, + TraceState::default().with_priority_sampling(SamplingPriority::UserReject), + ))), + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record and sample should remain as record and sample + assert_eq!(result.decision, SamplingDecision::RecordAndSample); + + // Verify that the sampling priority is not overridden + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::UserReject) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::UserReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_trace_state_already_populated_record_drop() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::Drop) + .build(); + + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), true); 
+ + let result = datadog_sampler.should_sample( + Some(&Context::new().with_remote_span_context(SpanContext::new( + TraceId::from_u128(1), + SpanId::from_u64(1), + TraceFlags::default(), + true, + TraceState::default().with_priority_sampling(SamplingPriority::UserReject), + ))), + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Drop is converted to RecordOnly + assert_eq!(result.decision, SamplingDecision::RecordOnly); + + // Verify that the sampling priority is not overridden + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::UserReject) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::UserReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } +} diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog.rs b/apollo-router/src/plugins/telemetry/tracing/datadog/mod.rs similarity index 93% rename from apollo-router/src/plugins/telemetry/tracing/datadog.rs rename to apollo-router/src/plugins/telemetry/tracing/datadog/mod.rs index d0994fbf13..fd1b4447ae 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog/mod.rs @@ -1,15 +1,18 @@ //! Configuration for datadog tracing. 
+mod agent_sampling; +mod span_processor; + use std::fmt::Debug; use std::fmt::Formatter; use std::time::Duration; +pub(crate) use agent_sampling::DatadogAgentSampling; use ahash::HashMap; use ahash::HashMapExt; use futures::future::BoxFuture; use http::Uri; use opentelemetry::sdk; -use opentelemetry::sdk::trace::BatchSpanProcessor; use opentelemetry::sdk::trace::Builder; use opentelemetry::Value; use opentelemetry_api::trace::SpanContext; @@ -23,6 +26,7 @@ use opentelemetry_semantic_conventions::resource::SERVICE_NAME; use opentelemetry_semantic_conventions::resource::SERVICE_VERSION; use schemars::JsonSchema; use serde::Deserialize; +pub(crate) use span_processor::DatadogSpanProcessor; use tower::BoxError; use crate::plugins::telemetry::config::GenericWith; @@ -210,18 +214,24 @@ impl TracingConfigurator for Config { let mut span_metrics = default_span_metrics(); span_metrics.extend(self.span_metrics.clone()); - Ok(builder.with_span_processor( - BatchSpanProcessor::builder( - ExporterWrapper { - delegate: exporter, - span_metrics, - }, - opentelemetry::runtime::Tokio, - ) - .with_batch_config(self.batch_processor.clone().into()) - .build() - .filtered(), - )) + let batch_processor = opentelemetry::sdk::trace::BatchSpanProcessor::builder( + ExporterWrapper { + delegate: exporter, + span_metrics, + }, + opentelemetry::runtime::Tokio, + ) + .with_batch_config(self.batch_processor.clone().into()) + .build() + .filtered(); + + Ok( + if trace.preview_datadog_agent_sampling.unwrap_or_default() { + builder.with_span_processor(batch_processor.always_sampled()) + } else { + builder.with_span_processor(batch_processor) + }, + ) } } diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog/span_processor.rs b/apollo-router/src/plugins/telemetry/tracing/datadog/span_processor.rs new file mode 100644 index 0000000000..7c879c310a --- /dev/null +++ b/apollo-router/src/plugins/telemetry/tracing/datadog/span_processor.rs @@ -0,0 +1,133 @@ +use 
opentelemetry_api::trace::SpanContext; +use opentelemetry_api::trace::TraceResult; +use opentelemetry_api::Context; +use opentelemetry_sdk::export::trace::SpanData; +use opentelemetry_sdk::trace::Span; +use opentelemetry_sdk::trace::SpanProcessor; + +/// When using the Datadog agent we need spans to always be exported. However, the batch span processor will only export spans that are sampled. +/// This wrapper will override the trace flags to always sample. +/// THe datadog exporter itself will look at the `sampling.priority` trace context attribute to determine if the span should be sampled. +#[derive(Debug)] +pub(crate) struct DatadogSpanProcessor { + delegate: T, +} + +impl DatadogSpanProcessor { + pub(crate) fn new(delegate: T) -> Self { + Self { delegate } + } +} + +impl SpanProcessor for DatadogSpanProcessor { + fn on_start(&self, span: &mut Span, cx: &Context) { + self.delegate.on_start(span, cx) + } + + fn on_end(&self, mut span: SpanData) { + // Note that the trace state for measuring and sampling priority is handled in the AgentSampler + // The only purpose of this span processor is to ensure that a span can pass through a batch processor. 
+ let new_trace_flags = span.span_context.trace_flags().with_sampled(true); + span.span_context = SpanContext::new( + span.span_context.trace_id(), + span.span_context.span_id(), + new_trace_flags, + span.span_context.is_remote(), + span.span_context.trace_state().clone(), + ); + self.delegate.on_end(span) + } + + fn force_flush(&self) -> TraceResult<()> { + self.delegate.force_flush() + } + + fn shutdown(&mut self) -> TraceResult<()> { + self.delegate.shutdown() + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::Mutex; + use std::time::SystemTime; + + use opentelemetry_api::trace::SpanId; + use opentelemetry_api::trace::SpanKind; + use opentelemetry_api::trace::TraceFlags; + use opentelemetry_api::trace::TraceId; + use opentelemetry_api::Context; + use opentelemetry_sdk::trace::EvictedHashMap; + use opentelemetry_sdk::trace::EvictedQueue; + use opentelemetry_sdk::trace::SpanProcessor; + + use super::*; + + #[derive(Debug, Clone)] + struct MockSpanProcessor { + spans: Arc>>, + } + + impl MockSpanProcessor { + fn new() -> Self { + Self { + spans: Default::default(), + } + } + } + + impl SpanProcessor for MockSpanProcessor { + fn on_start(&self, _span: &mut Span, _cx: &Context) {} + + fn on_end(&self, span: SpanData) { + self.spans.lock().unwrap().push(span); + } + + fn force_flush(&self) -> TraceResult<()> { + Ok(()) + } + + fn shutdown(&mut self) -> TraceResult<()> { + Ok(()) + } + } + + #[test] + fn test_on_end_updates_trace_flags() { + let mock_processor = MockSpanProcessor::new(); + let processor = DatadogSpanProcessor::new(mock_processor.clone()); + let span_context = SpanContext::new( + TraceId::from_u128(1), + SpanId::from_u64(1), + TraceFlags::default(), + false, + Default::default(), + ); + let span_data = SpanData { + span_context, + parent_span_id: SpanId::from_u64(1), + span_kind: SpanKind::Client, + name: Default::default(), + start_time: SystemTime::now(), + end_time: SystemTime::now(), + attributes: EvictedHashMap::new(32, 32), 
+ events: EvictedQueue::new(32), + links: EvictedQueue::new(32), + status: Default::default(), + resource: Default::default(), + instrumentation_lib: Default::default(), + }; + + processor.on_end(span_data.clone()); + + // Verify that the trace flags are updated to sampled + let updated_trace_flags = span_data.span_context.trace_flags().with_sampled(true); + let stored_spans = mock_processor.spans.lock().unwrap(); + assert_eq!(stored_spans.len(), 1); + assert_eq!( + stored_spans[0].span_context.trace_flags(), + updated_trace_flags + ); + } +} diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs index fd1590966e..e11bc9ed78 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs @@ -8,6 +8,7 @@ use super::unified_tags::UnifiedTags; use crate::plugins::telemetry::tracing::datadog_exporter::exporter::intern::StringInterner; use crate::plugins::telemetry::tracing::datadog_exporter::exporter::model::DD_MEASURED_KEY; use crate::plugins::telemetry::tracing::datadog_exporter::exporter::model::SAMPLING_PRIORITY_KEY; +use crate::plugins::telemetry::tracing::datadog_exporter::propagator::SamplingPriority; use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; use crate::plugins::telemetry::tracing::datadog_exporter::Error; use crate::plugins::telemetry::tracing::datadog_exporter::ModelConfig; @@ -129,10 +130,22 @@ fn write_unified_tag<'a>( } fn get_sampling_priority(span: &SpanData) -> f64 { - if span.span_context.trace_state().priority_sampling_enabled() { - 1.0 - } else { - 0.0 + match span + .span_context + .trace_state() + .sampling_priority() + .unwrap_or_else(|| { + // Datadog sampling has not been set, revert to traceflags + if span.span_context.trace_flags().is_sampled() { + 
SamplingPriority::AutoKeep + } else { + SamplingPriority::AutoReject + } + }) { + SamplingPriority::UserReject => -1.0, + SamplingPriority::AutoReject => 0.0, + SamplingPriority::AutoKeep => 1.0, + SamplingPriority::UserKeep => 2.0, } } diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs index 1c586d48c8..c8ee8c4425 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs @@ -158,6 +158,8 @@ pub use propagator::DatadogTraceState; pub use propagator::DatadogTraceStateBuilder; pub(crate) mod propagator { + use std::fmt::Display; + use once_cell::sync::Lazy; use opentelemetry::propagation::text_map_propagator::FieldIter; use opentelemetry::propagation::Extractor; @@ -177,9 +179,9 @@ pub(crate) mod propagator { const TRACE_FLAG_DEFERRED: TraceFlags = TraceFlags::new(0x02); const TRACE_STATE_PRIORITY_SAMPLING: &str = "psr"; - pub(crate) const TRACE_STATE_MEASURE: &str = "m"; - pub(crate) const TRACE_STATE_TRUE_VALUE: &str = "1"; - pub(crate) const TRACE_STATE_FALSE_VALUE: &str = "0"; + const TRACE_STATE_MEASURE: &str = "m"; + const TRACE_STATE_TRUE_VALUE: &str = "1"; + const TRACE_STATE_FALSE_VALUE: &str = "0"; static DATADOG_HEADER_FIELDS: Lazy<[String; 3]> = Lazy::new(|| { [ @@ -191,8 +193,8 @@ pub(crate) mod propagator { #[derive(Default)] pub struct DatadogTraceStateBuilder { - priority_sampling: bool, - measuring: bool, + sampling_priority: SamplingPriority, + measuring: Option, } fn boolean_to_trace_state_flag(value: bool) -> &'static str { @@ -209,33 +211,39 @@ pub(crate) mod propagator { #[allow(clippy::needless_update)] impl DatadogTraceStateBuilder { - pub fn with_priority_sampling(self, enabled: bool) -> Self { + pub fn with_priority_sampling(self, sampling_priority: SamplingPriority) -> Self { Self { - priority_sampling: enabled, + sampling_priority, ..self } } pub 
fn with_measuring(self, enabled: bool) -> Self { Self { - measuring: enabled, + measuring: Some(enabled), ..self } } pub fn build(self) -> TraceState { - let values = [ - ( - TRACE_STATE_MEASURE, - boolean_to_trace_state_flag(self.measuring), - ), - ( + if let Some(measuring) = self.measuring { + let values = [ + (TRACE_STATE_MEASURE, boolean_to_trace_state_flag(measuring)), + ( + TRACE_STATE_PRIORITY_SAMPLING, + &self.sampling_priority.to_string(), + ), + ]; + + TraceState::from_key_value(values).unwrap_or_default() + } else { + let values = [( TRACE_STATE_PRIORITY_SAMPLING, - boolean_to_trace_state_flag(self.priority_sampling), - ), - ]; + &self.sampling_priority.to_string(), + )]; - TraceState::from_key_value(values).unwrap_or_default() + TraceState::from_key_value(values).unwrap_or_default() + } } } @@ -244,9 +252,9 @@ pub(crate) mod propagator { fn measuring_enabled(&self) -> bool; - fn with_priority_sampling(&self, enabled: bool) -> TraceState; + fn with_priority_sampling(&self, sampling_priority: SamplingPriority) -> TraceState; - fn priority_sampling_enabled(&self) -> bool; + fn sampling_priority(&self) -> Option<SamplingPriority>; } impl DatadogTraceState for TraceState { @@ -261,30 +269,77 @@ pub(crate) mod propagator { .unwrap_or_default() } - fn with_priority_sampling(&self, enabled: bool) -> TraceState { - self.insert( - TRACE_STATE_PRIORITY_SAMPLING, - boolean_to_trace_state_flag(enabled), - ) - .unwrap_or_else(|_err| self.clone()) + fn with_priority_sampling(&self, sampling_priority: SamplingPriority) -> TraceState { + self.insert(TRACE_STATE_PRIORITY_SAMPLING, sampling_priority.to_string()) + .unwrap_or_else(|_err| self.clone()) } - fn priority_sampling_enabled(&self) -> bool { - self.get(TRACE_STATE_PRIORITY_SAMPLING) - .map(trace_flag_to_boolean) - .unwrap_or_default() + fn sampling_priority(&self) -> Option<SamplingPriority> { + self.get(TRACE_STATE_PRIORITY_SAMPLING).map(|value| { + SamplingPriority::try_from(value).unwrap_or(SamplingPriority::AutoReject) + }) } } - enum 
SamplingPriority { + #[derive(Default, Debug, Eq, PartialEq)] + pub(crate) enum SamplingPriority { UserReject = -1, + #[default] AutoReject = 0, AutoKeep = 1, UserKeep = 2, } + impl SamplingPriority { + pub(crate) fn as_i64(&self) -> i64 { + match self { + SamplingPriority::UserReject => -1, + SamplingPriority::AutoReject => 0, + SamplingPriority::AutoKeep => 1, + SamplingPriority::UserKeep => 2, + } + } + } + + impl Display for SamplingPriority { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let value = match self { + SamplingPriority::UserReject => -1, + SamplingPriority::AutoReject => 0, + SamplingPriority::AutoKeep => 1, + SamplingPriority::UserKeep => 2, + }; + write!(f, "{}", value) + } + } + + impl SamplingPriority { + pub fn as_str(&self) -> &'static str { + match self { + SamplingPriority::UserReject => "-1", + SamplingPriority::AutoReject => "0", + SamplingPriority::AutoKeep => "1", + SamplingPriority::UserKeep => "2", + } + } + } + + impl TryFrom<&str> for SamplingPriority { + type Error = ExtractError; + + fn try_from(value: &str) -> Result<Self, Self::Error> { + match value { + "-1" => Ok(SamplingPriority::UserReject), + "0" => Ok(SamplingPriority::AutoReject), + "1" => Ok(SamplingPriority::AutoKeep), + "2" => Ok(SamplingPriority::UserKeep), + _ => Err(ExtractError::SamplingPriority), + } + } + } + #[derive(Debug)] - enum ExtractError { + pub(crate) enum ExtractError { TraceId, SpanId, SamplingPriority, @@ -311,16 +366,7 @@ pub(crate) mod propagator { } fn create_trace_state_and_flags(trace_flags: TraceFlags) -> (TraceState, TraceFlags) { - if trace_flags & TRACE_FLAG_DEFERRED == TRACE_FLAG_DEFERRED { - (TraceState::default(), trace_flags) - } else { - ( - DatadogTraceStateBuilder::default() - .with_priority_sampling(trace_flags.is_sampled()) - .build(), - TraceFlags::SAMPLED, - ) - } + (TraceState::default(), trace_flags) } impl DatadogPropagator { @@ -343,23 +389,6 @@ pub(crate) mod propagator { .map_err(|_| ExtractError::SpanId) } - fn 
extract_sampling_priority( - &self, - sampling_priority: &str, - ) -> Result { - let i = sampling_priority - .parse::() - .map_err(|_| ExtractError::SamplingPriority)?; - - match i { - -1 => Ok(SamplingPriority::UserReject), - 0 => Ok(SamplingPriority::AutoReject), - 1 => Ok(SamplingPriority::AutoKeep), - 2 => Ok(SamplingPriority::UserKeep), - _ => Err(ExtractError::SamplingPriority), - } - } - fn extract_span_context( &self, extractor: &dyn Extractor, @@ -371,11 +400,11 @@ pub(crate) mod propagator { let span_id = self .extract_span_id(extractor.get(DATADOG_PARENT_ID_HEADER).unwrap_or("")) .unwrap_or(SpanId::INVALID); - let sampling_priority = self.extract_sampling_priority( - extractor - .get(DATADOG_SAMPLING_PRIORITY_HEADER) - .unwrap_or(""), - ); + let sampling_priority = extractor + .get(DATADOG_SAMPLING_PRIORITY_HEADER) + .unwrap_or("") + .try_into(); + let sampled = match sampling_priority { Ok(SamplingPriority::UserReject) | Ok(SamplingPriority::AutoReject) => { TraceFlags::default() @@ -387,7 +416,10 @@ pub(crate) mod propagator { Err(_) => TRACE_FLAG_DEFERRED, }; - let (trace_state, trace_flags) = create_trace_state_and_flags(sampled); + let (mut trace_state, trace_flags) = create_trace_state_and_flags(sampled); + if let Ok(sampling_priority) = sampling_priority { + trace_state = trace_state.with_priority_sampling(sampling_priority); + } Ok(SpanContext::new( trace_id, @@ -399,14 +431,6 @@ pub(crate) mod propagator { } } - fn get_sampling_priority(span_context: &SpanContext) -> SamplingPriority { - if span_context.trace_state().priority_sampling_enabled() { - SamplingPriority::AutoKeep - } else { - SamplingPriority::AutoReject - } - } - impl TextMapPropagator for DatadogPropagator { fn inject_context(&self, cx: &Context, injector: &mut dyn Injector) { let span = cx.span(); @@ -422,8 +446,17 @@ pub(crate) mod propagator { ); if span_context.trace_flags() & TRACE_FLAG_DEFERRED != TRACE_FLAG_DEFERRED { - let sampling_priority = 
get_sampling_priority(span_context); - + // The sampling priority + let sampling_priority = span_context + .trace_state() + .sampling_priority() + .unwrap_or_else(|| { + if span_context.is_sampled() { + SamplingPriority::AutoKeep + } else { + SamplingPriority::AutoReject + } + }); injector.set( DATADOG_SAMPLING_PRIORITY_HEADER, (sampling_priority as i32).to_string(), @@ -460,8 +493,10 @@ pub(crate) mod propagator { (vec![(DATADOG_TRACE_ID_HEADER, "garbage")], SpanContext::empty_context()), (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "garbage")], SpanContext::new(TraceId::from_u128(1234), SpanId::INVALID, TRACE_FLAG_DEFERRED, true, TraceState::default())), (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TRACE_FLAG_DEFERRED, true, TraceState::default())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(false).build())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(true).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "-1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::default(), true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserReject).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::default(), true, 
DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoReject).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoKeep).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "2")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserKeep).build())), ] } @@ -473,8 +508,10 @@ pub(crate) mod propagator { (vec![], SpanContext::new(TraceId::from_hex("1234").unwrap(), SpanId::INVALID, TRACE_FLAG_DEFERRED, true, TraceState::default())), (vec![], SpanContext::new(TraceId::from_hex("1234").unwrap(), SpanId::INVALID, TraceFlags::SAMPLED, true, TraceState::default())), (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TRACE_FLAG_DEFERRED, true, TraceState::default())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(false).build())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(true).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "-1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), 
TraceFlags::default(), true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserReject).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::default(), true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoReject).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoKeep).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "2")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserKeep).build())), ] } diff --git a/apollo-router/src/plugins/telemetry/tracing/mod.rs b/apollo-router/src/plugins/telemetry/tracing/mod.rs index 0172f3e094..d2dc62b138 100644 --- a/apollo-router/src/plugins/telemetry/tracing/mod.rs +++ b/apollo-router/src/plugins/telemetry/tracing/mod.rs @@ -18,6 +18,7 @@ use tower::BoxError; use super::config_new::spans::Spans; use super::formatters::APOLLO_PRIVATE_PREFIX; use crate::plugins::telemetry::config::TracingCommon; +use crate::plugins::telemetry::tracing::datadog::DatadogSpanProcessor; pub(crate) mod apollo; pub(crate) mod apollo_telemetry; @@ -91,6 +92,7 @@ where Self: Sized + SpanProcessor, { fn filtered(self) -> ApolloFilterSpanProcessor; + fn always_sampled(self) -> DatadogSpanProcessor; } impl SpanProcessorExt for T @@ -100,6 +102,12 @@ where fn filtered(self) -> ApolloFilterSpanProcessor { ApolloFilterSpanProcessor { delegate: self } } + + /// This span processor will always send spans to the 
exporter even if they are not sampled. This is useful for the datadog agent which + /// uses spans for metrics. + fn always_sampled(self) -> DatadogSpanProcessor { + DatadogSpanProcessor::new(self) + } } /// Batch processor configuration diff --git a/apollo-router/src/plugins/telemetry/tracing/otlp.rs b/apollo-router/src/plugins/telemetry/tracing/otlp.rs index be294427f2..9a61075e5f 100644 --- a/apollo-router/src/plugins/telemetry/tracing/otlp.rs +++ b/apollo-router/src/plugins/telemetry/tracing/otlp.rs @@ -20,20 +20,23 @@ impl TracingConfigurator for super::super::otlp::Config { fn apply( &self, builder: Builder, - _common: &TracingCommon, + common: &TracingCommon, _spans_config: &Spans, ) -> Result { - tracing::info!("Configuring Otlp tracing: {}", self.batch_processor); let exporter: SpanExporterBuilder = self.exporter(TelemetryDataKind::Traces)?; - - Ok(builder.with_span_processor( - BatchSpanProcessor::builder( - exporter.build_span_exporter()?, - opentelemetry::runtime::Tokio, - ) - .with_batch_config(self.batch_processor.clone().into()) - .build() - .filtered(), - )) + let batch_span_processor = BatchSpanProcessor::builder( + exporter.build_span_exporter()?, + opentelemetry::runtime::Tokio, + ) + .with_batch_config(self.batch_processor.clone().into()) + .build() + .filtered(); + Ok( + if common.preview_datadog_agent_sampling.unwrap_or_default() { + builder.with_span_processor(batch_span_processor.always_sampled()) + } else { + builder.with_span_processor(batch_span_processor) + }, + ) } } diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index 7208a85c10..e70cf5f0de 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -18,7 +18,6 @@ use fred::types::Scanner; use futures::StreamExt; use http::header::ACCEPT; use http::header::CONTENT_TYPE; -use http::HeaderValue; use mediatype::names::BOUNDARY; use mediatype::names::FORM_DATA; use mediatype::names::MULTIPART; @@ -33,6 +32,7 @@ use 
opentelemetry::sdk::trace::TracerProvider; use opentelemetry::sdk::Resource; use opentelemetry::testing::trace::NoopSpanExporter; use opentelemetry::trace::TraceContextExt; +use opentelemetry_api::trace::SpanContext; use opentelemetry_api::trace::TraceId; use opentelemetry_api::trace::TracerProvider as OtherTracerProvider; use opentelemetry_api::Context; @@ -69,6 +69,75 @@ use wiremock::Mock; use wiremock::Respond; use wiremock::ResponseTemplate; +pub struct Query { + traced: bool, + psr: Option<&'static str>, + headers: HashMap<String, String>, + content_type: String, + body: Value, +} + +impl Default for Query { + fn default() -> Self { + Query::builder().build() + } +} + +#[buildstructor::buildstructor] +impl Query { + #[builder] + pub fn new( + traced: Option<bool>, + psr: Option<&'static str>, + body: Option<Value>, + content_type: Option<String>, + headers: HashMap<String, String>, + ) -> Self { + Self { + traced: traced.unwrap_or(true), + psr, + body: body.unwrap_or( + json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}), + ), + content_type: content_type + .unwrap_or_else(|| APPLICATION_JSON.essence_str().to_string()), + headers, + } + } +} +impl Query { + #[allow(dead_code)] + pub fn with_bad_content_type(mut self) -> Self { + self.content_type = "garbage".to_string(); + self + } + + #[allow(dead_code)] + pub fn with_bad_query(mut self) -> Self { + self.body = json!({"garbage":{}}); + self + } + + #[allow(dead_code)] + pub fn with_anonymous(mut self) -> Self { + self.body = json!({"query":"query {topProducts{name}}","variables":{}}); + self + } + + #[allow(dead_code)] + pub fn with_huge_query(mut self) -> Self { + self.body = json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name}}","variables":{}}); + self + } + + #[allow(dead_code)] + pub fn introspection() -> Query { + Query::builder() + .body(json!({"query":"{__schema 
{types {name}}}","variables":{}})) + .build() + } +} + pub struct IntegrationTest { router: Option, test_config_location: PathBuf, @@ -79,6 +148,7 @@ pub struct IntegrationTest { collect_stdio: Option<(tokio::sync::oneshot::Sender, regex::Regex)>, _subgraphs: wiremock::MockServer, telemetry: Telemetry, + extra_propagator: Telemetry, pub _tracer_provider_client: TracerProvider, pub _tracer_provider_subgraph: TracerProvider, @@ -88,6 +158,7 @@ pub struct IntegrationTest { bind_address: Arc>>, redis_namespace: String, log: String, + subgraph_context: Arc>>, } impl IntegrationTest { @@ -102,13 +173,19 @@ impl IntegrationTest { struct TracedResponder { response_template: ResponseTemplate, telemetry: Telemetry, + extra_propagator: Telemetry, subscriber_subgraph: Dispatch, subgraph_callback: Option>, + subgraph_context: Arc>>, } impl Respond for TracedResponder { fn respond(&self, request: &wiremock::Request) -> ResponseTemplate { - let context = self.telemetry.extract_context(request); + let context = self.telemetry.extract_context(request, &Context::new()); + let context = self.extra_propagator.extract_context(request, &context); + + *self.subgraph_context.lock().expect("lock poisoned") = + Some(context.span().span_context().clone()); tracing_core::dispatcher::with_default(&self.subscriber_subgraph, || { let _context_guard = context.attach(); let span = info_span!("subgraph server"); @@ -126,7 +203,7 @@ impl Respond for TracedResponder { pub enum Telemetry { Jaeger, Otlp { - endpoint: String, + endpoint: Option, }, Datadog, Zipkin, @@ -156,7 +233,9 @@ impl Telemetry { .build(), ) .build(), - Telemetry::Otlp { endpoint } => TracerProvider::builder() + Telemetry::Otlp { + endpoint: Some(endpoint), + } => TracerProvider::builder() .with_config(config) .with_span_processor( BatchSpanProcessor::builder( @@ -201,7 +280,7 @@ impl Telemetry { .build(), ) .build(), - Telemetry::None => TracerProvider::builder() + Telemetry::None | Telemetry::Otlp { endpoint: None } => 
TracerProvider::builder() .with_config(config) .with_simple_exporter(NoopSpanExporter::default()) .build(), @@ -220,11 +299,23 @@ impl Telemetry { ) } Telemetry::Datadog => { + // Get the existing PSR header if it exists. This is because the existing telemetry propagator doesn't support PSR properly yet. + // In testing we are manually setting the PSR header, and we don't want to override it. + let psr = request + .headers() + .get("x-datadog-sampling-priority") + .cloned(); let propagator = opentelemetry_datadog::DatadogPropagator::new(); propagator.inject_context( &ctx, &mut opentelemetry_http::HeaderInjector(request.headers_mut()), - ) + ); + + if let Some(psr) = psr { + request + .headers_mut() + .insert("x-datadog-sampling-priority", psr); + } } Telemetry::Otlp { .. } => { let propagator = opentelemetry::sdk::propagation::TraceContextPropagator::default(); @@ -244,7 +335,11 @@ impl Telemetry { } } - pub(crate) fn extract_context(&self, request: &wiremock::Request) -> Context { + pub(crate) fn extract_context( + &self, + request: &wiremock::Request, + context: &Context, + ) -> Context { let headers: HashMap = request .headers .iter() @@ -254,21 +349,46 @@ impl Telemetry { match self { Telemetry::Jaeger => { let propagator = opentelemetry_jaeger::Propagator::new(); - propagator.extract(&headers) + propagator.extract_with_context(context, &headers) } Telemetry::Datadog => { + let span_ref = context.span(); + let original_span_context = span_ref.span_context(); let propagator = opentelemetry_datadog::DatadogPropagator::new(); - propagator.extract(&headers) + let mut context = propagator.extract_with_context(context, &headers); + // We're going to override the sampled so that we can test sampling priority + if let Some(psr) = headers.get("x-datadog-sampling-priority") { + let state = context + .span() + .span_context() + .trace_state() + .insert("psr", psr.to_string()) + .expect("psr"); + let new_trace_id = if original_span_context.is_valid() { + 
original_span_context.trace_id() + } else { + context.span().span_context().trace_id() + }; + context = context.with_remote_span_context(SpanContext::new( + new_trace_id, + context.span().span_context().span_id(), + context.span().span_context().trace_flags(), + true, + state, + )); + } + + context } Telemetry::Otlp { .. } => { let propagator = opentelemetry::sdk::propagation::TraceContextPropagator::default(); - propagator.extract(&headers) + propagator.extract_with_context(context, &headers) } Telemetry::Zipkin => { let propagator = opentelemetry_zipkin::Propagator::new(); - propagator.extract(&headers) + propagator.extract_with_context(context, &headers) } - _ => Context::current(), + _ => context.clone(), } } } @@ -280,6 +400,7 @@ impl IntegrationTest { pub async fn new( config: String, telemetry: Option, + extra_propagator: Option, responder: Option, collect_stdio: Option>, supergraph: Option, @@ -289,6 +410,7 @@ impl IntegrationTest { ) -> Self { let redis_namespace = Uuid::new_v4().to_string(); let telemetry = telemetry.unwrap_or_default(); + let extra_propagator = extra_propagator.unwrap_or_default(); let tracer_provider_client = telemetry.tracer_provider("client"); let subscriber_client = Self::dispatch(&tracer_provider_client); let tracer_provider_subgraph = telemetry.tracer_provider("subgraph"); @@ -314,12 +436,15 @@ impl IntegrationTest { .start() .await; + let subgraph_context = Arc::new(Mutex::new(None)); Mock::given(method("POST")) .respond_with(TracedResponder{response_template:responder.unwrap_or_else(|| ResponseTemplate::new(200).set_body_json(json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}))), telemetry: telemetry.clone(), + extra_propagator: extra_propagator.clone(), subscriber_subgraph: Self::dispatch(&tracer_provider_subgraph), - subgraph_callback + subgraph_callback, + subgraph_context: subgraph_context.clone() }) .mount(&subgraphs) .await; @@ -354,8 +479,10 @@ impl IntegrationTest { subscriber_client, 
_tracer_provider_subgraph: tracer_provider_subgraph, telemetry, + extra_propagator, redis_namespace, log: log.unwrap_or_else(|| "error,apollo_router=info".to_owned()), + subgraph_context, } } @@ -373,6 +500,16 @@ impl IntegrationTest { Dispatch::new(subscriber) } + #[allow(dead_code)] + pub fn subgraph_context(&self) -> SpanContext { + self.subgraph_context + .lock() + .expect("lock poisoned") + .as_ref() + .unwrap() + .clone() + } + pub fn router_location() -> PathBuf { PathBuf::from(env!("CARGO_BIN_EXE_router")) } @@ -503,93 +640,61 @@ impl IntegrationTest { pub fn execute_default_query( &self, ) -> impl std::future::Future { - self.execute_query_internal( - &json!({"query":"query {topProducts{name}}","variables":{}}), - None, - None, - ) + self.execute_query(Query::builder().build()) } #[allow(dead_code)] pub fn execute_query( &self, - query: &Value, - ) -> impl std::future::Future { - self.execute_query_internal(query, None, None) - } - - #[allow(dead_code)] - pub fn execute_bad_query( - &self, - ) -> impl std::future::Future { - self.execute_query_internal(&json!({"garbage":{}}), None, None) - } - - #[allow(dead_code)] - pub fn execute_huge_query( - &self, - ) -> impl std::future::Future { - self.execute_query_internal(&json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name}}","variables":{}}), None, None) - } - - #[allow(dead_code)] - pub fn execute_bad_content_type( - &self, - ) -> impl std::future::Future { - self.execute_query_internal(&json!({"garbage":{}}), Some("garbage"), None) - } - - #[allow(dead_code)] - pub fn execute_query_with_headers( - &self, - query: &Value, - headers: HashMap, - ) -> impl std::future::Future { - self.execute_query_internal(query, None, Some(headers)) - } - - fn execute_query_internal( - &self, - query: &Value, - content_type: Option<&'static str>, - headers: 
Option>, + query: Query, ) -> impl std::future::Future { assert!( self.router.is_some(), "router was not started, call `router.start().await; router.assert_started().await`" ); let telemetry = self.telemetry.clone(); + let extra_propagator = self.extra_propagator.clone(); - let query = query.clone(); let url = format!("http://{}", self.bind_address()); - + let subgraph_context = self.subgraph_context.clone(); async move { let span = info_span!("client_request"); - let span_id = span.context().span().span_context().trace_id(); + let trace_id = span.context().span().span_context().trace_id(); async move { let client = reqwest::Client::new(); - let mut builder = client - .post(url) - .header( - CONTENT_TYPE, - content_type.unwrap_or(APPLICATION_JSON.essence_str()), - ) - .header("apollographql-client-name", "custom_name") - .header("apollographql-client-version", "1.0") - .header("x-my-header", "test") - .header("head", "test"); + let mut builder = client.post(url).header(CONTENT_TYPE, query.content_type); - if let Some(headers) = headers { - for (name, value) in headers { - builder = builder.header(name, value); - } + for (name, value) in query.headers { + builder = builder.header(name, value); + } + + if let Some(psr) = query.psr { + builder = builder.header("x-datadog-sampling-priority", psr); + } + + let mut request = builder.json(&query.body).build().unwrap(); + if query.traced { + telemetry.inject_context(&mut request); + extra_propagator.inject_context(&mut request); } - let mut request = builder.json(&query).build().unwrap(); - telemetry.inject_context(&mut request); match client.execute(request).await { - Ok(response) => (span_id, response), + Ok(response) => { + if query.traced { + (trace_id, response) + } else { + ( + subgraph_context + .lock() + .expect("poisoned") + .as_ref() + .expect("subgraph context") + .trace_id(), + response, + ) + } + } Err(err) => { panic!("unable to send successful request to router, {err}") } @@ -601,53 +706,6 @@ impl 
IntegrationTest { .with_subscriber(self.subscriber_client.clone()) } - #[allow(dead_code)] - pub fn execute_untraced_query( - &self, - query: &Value, - ) -> impl std::future::Future { - assert!( - self.router.is_some(), - "router was not started, call `router.start().await; router.assert_started().await`" - ); - let query = query.clone(); - let url = format!("http://{}", self.bind_address()); - - async move { - let client = reqwest::Client::new(); - - let mut request = client - .post(url) - .header(CONTENT_TYPE, APPLICATION_JSON.essence_str()) - .header("apollographql-client-name", "custom_name") - .header("apollographql-client-version", "1.0") - .json(&query) - .build() - .unwrap(); - - request.headers_mut().remove(ACCEPT); - match client.execute(request).await { - Ok(response) => ( - TraceId::from_hex( - response - .headers() - .get("apollo-custom-trace-id") - .cloned() - .unwrap_or(HeaderValue::from_static("no-trace-id")) - .to_str() - .unwrap_or_default(), - ) - .unwrap_or(TraceId::INVALID), - response, - ), - Err(err) => { - panic!("unable to send successful request to router, {err}") - } - } - } - .with_subscriber(self.subscriber_client.clone()) - } - /// Make a raw multipart request to the router. 
#[allow(dead_code)] pub fn execute_multipart_request( diff --git a/apollo-router/tests/integration/batching.rs b/apollo-router/tests/integration/batching.rs index 15dfd38de2..521e615b30 100644 --- a/apollo-router/tests/integration/batching.rs +++ b/apollo-router/tests/integration/batching.rs @@ -857,6 +857,7 @@ mod helper { use super::test_is_enabled; use crate::integration::common::IntegrationTest; + use crate::integration::common::Query; /// Helper type for specifying a valid handler pub type Handler = fn(&wiremock::Request) -> ResponseTemplate; @@ -916,7 +917,9 @@ mod helper { // Execute the request let request = serde_json::to_value(requests)?; - let (_span, response) = router.execute_query(&request).await; + let (_span, response) = router + .execute_query(Query::builder().body(request).build()) + .await; serde_json::from_slice::>(&response.bytes().await?).map_err(BoxError::from) } diff --git a/apollo-router/tests/integration/coprocessor.rs b/apollo-router/tests/integration/coprocessor.rs index d9ce741892..d82d15ca7c 100644 --- a/apollo-router/tests/integration/coprocessor.rs +++ b/apollo-router/tests/integration/coprocessor.rs @@ -8,6 +8,7 @@ use wiremock::Mock; use wiremock::ResponseTemplate; use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -43,7 +44,7 @@ async fn test_coprocessor_limit_payload() -> Result<(), BoxError> { // Expect a small query Mock::given(method("POST")) .and(path("/")) - .and(body_partial_json(json!({"version":1,"stage":"RouterRequest","control":"continue","body":"{\"query\":\"query {topProducts{name}}\",\"variables\":{}}","method":"POST"}))) + .and(body_partial_json(json!({"version":1,"stage":"RouterRequest","control":"continue","body":"{\"query\":\"query ExampleQuery {topProducts{name}}\",\"variables\":{}}","method":"POST"}))) .respond_with( 
ResponseTemplate::new(200).set_body_json(json!({"version":1,"stage":"RouterRequest","control":"continue","body":"{\"query\":\"query {topProducts{name}}\",\"variables\":{}}","method":"POST"})), ) @@ -75,7 +76,9 @@ async fn test_coprocessor_limit_payload() -> Result<(), BoxError> { assert_eq!(response.status(), 200); // This query is huge and will be rejected because it is too large before hitting the coprocessor - let (_trace_id, response) = router.execute_huge_query().await; + let (_trace_id, response) = router + .execute_query(Query::default().with_huge_query()) + .await; assert_eq!(response.status(), 413); assert_yaml_snapshot!(response.text().await?); diff --git a/apollo-router/tests/integration/introspection.rs b/apollo-router/tests/integration/introspection.rs index 95c8ad9c8c..8ad142a9cb 100644 --- a/apollo-router/tests/integration/introspection.rs +++ b/apollo-router/tests/integration/introspection.rs @@ -3,6 +3,7 @@ use apollo_router::services::supergraph::Request; use serde_json::json; use tower::ServiceExt; +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[tokio::test] @@ -226,7 +227,9 @@ async fn integration() { let query = json!({ "query": include_str!("../fixtures/introspect_full_schema.graphql"), }); - let (_trace_id, response) = router.execute_query(&query).await; + let (_trace_id, response) = router + .execute_query(Query::builder().body(query).build()) + .await; insta::assert_json_snapshot!(response.json::().await.unwrap()); router.graceful_shutdown().await; } diff --git a/apollo-router/tests/integration/mod.rs b/apollo-router/tests/integration/mod.rs index 7e775a21a9..d287b894fc 100644 --- a/apollo-router/tests/integration/mod.rs +++ b/apollo-router/tests/integration/mod.rs @@ -40,3 +40,12 @@ impl ValueExt for Value { self.as_str().map(|s| s.to_string()) } } + +impl ValueExt for &Value { + fn select_path<'a>(&'a self, path: &str) -> Result, BoxError> { + Ok(Selector::new().str_path(path)?.value(self).select()?) 
+ } + fn as_string(&self) -> Option { + self.as_str().map(|s| s.to_string()) + } +} diff --git a/apollo-router/tests/integration/operation_limits.rs b/apollo-router/tests/integration/operation_limits.rs index 79ad7d9f89..1b6b186e41 100644 --- a/apollo-router/tests/integration/operation_limits.rs +++ b/apollo-router/tests/integration/operation_limits.rs @@ -10,6 +10,7 @@ use serde_json::json; use tower::BoxError; use tower::ServiceExt; +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -310,7 +311,9 @@ async fn test_request_bytes_limit_with_coprocessor() -> Result<(), BoxError> { .await; router.start().await; router.assert_started().await; - let (_, resp) = router.execute_huge_query().await; + let (_, resp) = router + .execute_query(Query::default().with_huge_query()) + .await; assert_eq!(resp.status(), 413); router.graceful_shutdown().await; Ok(()) @@ -324,7 +327,9 @@ async fn test_request_bytes_limit() -> Result<(), BoxError> { .await; router.start().await; router.assert_started().await; - let (_, resp) = router.execute_huge_query().await; + let (_, resp) = router + .execute_query(Query::default().with_huge_query()) + .await; assert_eq!(resp.status(), 413); router.graceful_shutdown().await; Ok(()) diff --git a/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs b/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs index 4e55f37757..6326d600ee 100644 --- a/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs +++ b/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs @@ -1,5 +1,6 @@ use serde_json::json; +use crate::integration::common::Query; use crate::integration::IntegrationTest; fn assert_evaluated_plans(prom: &str, expected: u64) { @@ -31,10 +32,14 @@ async fn reports_evaluated_plans() { router.start().await; router.assert_started().await; router - .execute_query(&json!({ - "query": r#"{ t { v1 v2 v3 v4 } }"#, - 
"variables": {}, - })) + .execute_query( + Query::builder() + .body(json!({ + "query": r#"{ t { v1 v2 v3 v4 } }"#, + "variables": {}, + })) + .build(), + ) .await; let metrics = router @@ -70,10 +75,14 @@ async fn does_not_exceed_max_evaluated_plans() { router.start().await; router.assert_started().await; router - .execute_query(&json!({ - "query": r#"{ t { v1 v2 v3 v4 } }"#, - "variables": {}, - })) + .execute_query( + Query::builder() + .body(json!({ + "query": r#"{ t { v1 v2 v3 v4 } }"#, + "variables": {}, + })) + .build(), + ) .await; let metrics = router diff --git a/apollo-router/tests/integration/redis.rs b/apollo-router/tests/integration/redis.rs index b7fb1cbf58..07d16d7b92 100644 --- a/apollo-router/tests/integration/redis.rs +++ b/apollo-router/tests/integration/redis.rs @@ -41,6 +41,7 @@ use tower::BoxError; use tower::ServiceExt; use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -1072,11 +1073,15 @@ async fn test_redis_query_plan_config_update(updated_config: &str, new_cache_key ); assert_ne!(starting_key, new_cache_key, "starting_key (cache key for the initial config) and new_cache_key (cache key with the updated config) should not be equal. 
This either means that the cache key is not being generated correctly, or that the test is not actually checking the updated key."); - router.execute_default_query().await; + router + .execute_query(Query::default().with_anonymous()) + .await; router.assert_redis_cache_contains(starting_key, None).await; router.update_config(updated_config).await; router.assert_reloaded().await; - router.execute_default_query().await; + router + .execute_query(Query::default().with_anonymous()) + .await; router .assert_redis_cache_contains(new_cache_key, Some(starting_key)) .await; diff --git a/apollo-router/tests/integration/subgraph_response.rs b/apollo-router/tests/integration/subgraph_response.rs index 5272b74ace..3f0f194d92 100644 --- a/apollo-router/tests/integration/subgraph_response.rs +++ b/apollo-router/tests/integration/subgraph_response.rs @@ -2,6 +2,7 @@ use serde_json::json; use tower::BoxError; use wiremock::ResponseTemplate; +use crate::integration::common::Query; use crate::integration::IntegrationTest; const CONFIG: &str = r#" @@ -21,7 +22,9 @@ async fn test_subgraph_returning_data_null() -> Result<(), BoxError> { router.assert_started().await; let query = "{ __typename topProducts { name } }"; - let (_trace_id, response) = router.execute_query(&json!({ "query": query })).await; + let (_trace_id, response) = router + .execute_query(Query::builder().body(json!({ "query": query })).build()) + .await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -64,7 +67,9 @@ async fn test_subgraph_returning_different_typename_on_query_root() -> Result<() inside_fragment: __typename } "#; - let (_trace_id, response) = router.execute_query(&json!({ "query": query })).await; + let (_trace_id, response) = router + .execute_query(Query::builder().body(json!({ "query": query })).build()) + .await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -99,7 +104,11 @@ async fn test_valid_extensions_service_for_subgraph_error() -> 
Result<(), BoxErr router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -141,7 +150,11 @@ async fn test_valid_extensions_service_is_preserved_for_subgraph_error() -> Resu router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -174,7 +187,11 @@ async fn test_valid_extensions_service_for_invalid_subgraph_response() -> Result router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -222,7 +239,11 @@ async fn test_valid_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -264,7 +285,11 @@ async fn test_empty_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -302,7 +327,11 @@ async fn test_invalid_error_locations() -> Result<(), BoxError> { router.assert_started().await; let 
(_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -345,7 +374,11 @@ async fn test_invalid_error_locations_with_single_negative_one_location() -> Res router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -387,7 +420,11 @@ async fn test_invalid_error_locations_contains_negative_one_location() -> Result router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -427,9 +464,7 @@ async fn test_valid_error_path() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) - .await; + let (_trace_id, response) = router.execute_query(Query::default()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -464,9 +499,7 @@ async fn test_invalid_error_path() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) - .await; + let (_trace_id, response) = router.execute_query(Query::default()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -502,9 +535,7 @@ async fn test_partially_valid_error_path() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let 
(_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) - .await; + let (_trace_id, response) = router.execute_query(Query::default()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, diff --git a/apollo-router/tests/integration/subscription.rs b/apollo-router/tests/integration/subscription.rs index 911503593f..faad126f8e 100644 --- a/apollo-router/tests/integration/subscription.rs +++ b/apollo-router/tests/integration/subscription.rs @@ -5,6 +5,7 @@ use serde_json::json; use tower::BoxError; use super::common::IntegrationTest; +use super::common::Query; use super::common::Telemetry; const SUBSCRIPTION_CONFIG: &str = include_str!("../fixtures/subscription.router.yaml"); @@ -60,7 +61,9 @@ async fn test_subscription_load() -> Result<(), BoxError> { for _ in 0..100 { let (_id, resp) = router .execute_query( - &json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}), + Query::builder() + .body(json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}})) + .build(), ) .await; assert!(resp.status().is_success()); diff --git a/apollo-router/tests/integration/supergraph.rs b/apollo-router/tests/integration/supergraph.rs index 97d5131d84..07b4c81089 100644 --- a/apollo-router/tests/integration/supergraph.rs +++ b/apollo-router/tests/integration/supergraph.rs @@ -1,8 +1,10 @@ +#[cfg(feature = "hyper_header_limits")] use std::collections::HashMap; use serde_json::json; use tower::BoxError; +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[cfg(not(feature = "hyper_header_limits"))] @@ -46,7 +48,12 @@ async fn test_supergraph_errors_on_http1_max_headers() -> Result<(), BoxError> { } let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .headers(headers) + .build(), + ) .await; 
assert_eq!(response.status(), 431); Ok(()) @@ -74,7 +81,12 @@ async fn test_supergraph_allow_to_change_http1_max_headers() -> Result<(), BoxEr } let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .headers(headers) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -100,11 +112,13 @@ async fn test_supergraph_errors_on_http1_header_that_does_not_fit_inside_buffer( router.start().await; router.assert_started().await; - let mut headers = HashMap::new(); - headers.insert("test-header".to_string(), "x".repeat(1048576 + 1)); - let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .header("test-header", "x".repeat(1048576 + 1)) + .build(), + ) .await; assert_eq!(response.status(), 431); Ok(()) @@ -125,11 +139,13 @@ async fn test_supergraph_allow_to_change_http1_max_buf_size() -> Result<(), BoxE router.start().await; router.assert_started().await; - let mut headers = HashMap::new(); - headers.insert("test-header".to_string(), "x".repeat(1048576 + 1)); - let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .header("test-header", "x".repeat(1048576 + 1)) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 6aed76ff6d..db33307ae9 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -1,63 +1,451 @@ extern crate core; -use std::collections::HashMap; use std::collections::HashSet; -use std::sync::atomic::AtomicBool; 
-use std::time::Duration; +use std::ops::Deref; use anyhow::anyhow; -use opentelemetry_api::trace::TraceContextExt; use opentelemetry_api::trace::TraceId; -use serde_json::json; use serde_json::Value; use tower::BoxError; -use tracing::Span; -use tracing_opentelemetry::OpenTelemetrySpanExt; -use wiremock::ResponseTemplate; use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::common::Telemetry; +use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::DatadogId; +use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; -#[derive(buildstructor::Builder)] -struct TraceSpec { - operation_name: Option, - version: Option, - services: HashSet<&'static str>, - span_names: HashSet<&'static str>, - measured_spans: HashSet<&'static str>, - unmeasured_spans: HashSet<&'static str>, -} - #[tokio::test(flavor = "multi_thread")] async fn test_no_sample() -> Result<(), BoxError> { if !graph_os_enabled() { return Ok(()); } - let subgraph_was_sampled = std::sync::Arc::new(AtomicBool::new(false)); - let subgraph_was_sampled_callback = subgraph_was_sampled.clone(); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!("fixtures/datadog_no_sample.router.yaml")) - .responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), + .build() + .await; + + router.start().await; + router.assert_started().await; + TraceSpec::builder() + .services(["router"].into()) + .subgraph_sampled(false) + .priority_sampled("0") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +// We want to check we're able to override the behavior of preview_datadog_agent_sampling configuration even if we set a datadog exporter 
+#[tokio::test(flavor = "multi_thread")] +async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_agent_sampling_disabled.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services([].into()) + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + router.graceful_shutdown().await; + + Ok(()) +} + +// We want to check we're able to override the behavior of preview_datadog_agent_sampling configuration even if we set a datadog exporter +#[tokio::test(flavor = "multi_thread")] +async fn test_sampling_datadog_agent_disabled_always_sample() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_agent_sampling_disabled_1.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) + .await?; + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_sampling_datadog_agent_disabled_never_sample() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + 
"fixtures/datadog_agent_sampling_disabled_0.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services([].into()) + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) + .await?; + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!("fixtures/datadog.router.yaml")) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // Parent based sampling. psr MUST be populated with the value that we pass in. 
+ TraceSpec::builder() + .services(["client", "router"].into()) + .subgraph_sampled(false) + .priority_sampled("-1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("-1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .subgraph_sampled(false) + .priority_sampled("0") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("0").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("2") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("2").build()) + .await?; + + // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. 
+ TraceSpec::builder() + .services(["router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_propagated_otel_request() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { endpoint: None }) + .extra_propagator(Telemetry::Datadog) + .config(include_str!("fixtures/datadog.router.yaml")) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_no_parent_sampler.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // The router will ignore the upstream PSR as parent based sampling is disabled. 
+ TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("-1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("0").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("2").build()) + .await?; + + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_parent_sampler_very_small() -> Result<(), BoxError> { + // Note that there is a very small chance this test will fail. We are trying to test a non-zero sampler. + + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_parent_sampler_very_small.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // The router should respect upstream but also almost never sample if left to its own devices. 
+ TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("-1") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("-1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("0").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("2") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("2").build()) + .await?; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_parent_sampler_very_small_no_parent() -> Result<(), BoxError> { + // Note that there is a very small chance this test will fail. We are trying to test a non-zero sampler. 
+ + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml" )) - .subgraph_callback(Box::new(move || { - let sampled = Span::current().context().span().span_context().is_sampled(); - subgraph_was_sampled_callback.store(sampled, std::sync::atomic::Ordering::SeqCst); - })) .build() .await; router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_id, result) = router.execute_untraced_query(&query).await; + // // The router should respect upstream but also almost never sample if left to its own devices. + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("-1").traced(true).build()) + .await?; + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("0").traced(true).build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("1").traced(true).build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("2").traced(true).build()) + .await?; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("2").traced(false).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn 
test_untraced_request() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_parent_sampler_very_small.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + router.graceful_shutdown().await; - assert!(result.status().is_success()); - assert!(!subgraph_was_sampled.load(std::sync::atomic::Ordering::SeqCst)); Ok(()) } @@ -78,20 +466,9 @@ async fn test_default_span_names() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") .span_names( [ "query_planning", @@ -109,8 +486,9 @@ async fn test_default_span_names() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -130,20 +508,9 @@ async fn test_override_span_names() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; 
TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") .span_names( [ "query_planning", @@ -161,8 +528,9 @@ async fn test_override_span_names() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -181,21 +549,9 @@ async fn test_override_span_names_late() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") .span_names( [ "query_planning", @@ -213,8 +569,44 @@ async fn test_override_span_names_late() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_header_propagator_override() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_header_propagator_override.router.yaml" + )) + .build() + .await; + + let trace_id = opentelemetry::trace::TraceId::from_u128(uuid::Uuid::new_v4().as_u128()); + + router.start().await; + router.assert_started().await; + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .subgraph_sampled(true) + .trace_id(format!("{:032x}", trace_id.to_datadog())) + .build() + .validate_datadog_trace( + &mut router, + Query::builder() + .header("trace-id", 
trace_id.to_string()) + .header("x-datadog-trace-id", "2") + .traced(false) + .build(), + ) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -232,20 +624,9 @@ async fn test_basic() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .operation_name("ExampleQuery") + .priority_sampled("1") .services(["client", "router", "subgraph"].into()) .span_names( [ @@ -276,8 +657,9 @@ async fn test_basic() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -295,23 +677,6 @@ async fn test_with_parent_span() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let mut headers = HashMap::new(); - headers.insert( - "traceparent".to_string(), - String::from("00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"), - ); - let (id, result) = router.execute_query_with_headers(&query, headers).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .operation_name("ExampleQuery") .services(["client", "router", "subgraph"].into()) @@ -344,8 +709,18 @@ async fn test_with_parent_span() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace( + &mut router, + Query::builder() + .traced(true) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01", + ) + .build(), 
+ ) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -365,13 +740,6 @@ async fn test_resource_mapping_default() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); TraceSpec::builder() .operation_name("ExampleQuery") .services(["client", "router", "subgraph"].into()) @@ -391,7 +759,7 @@ async fn test_resource_mapping_default() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; router.graceful_shutdown().await; Ok(()) @@ -413,14 +781,6 @@ async fn test_resource_mapping_override() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - router.graceful_shutdown().await; TraceSpec::builder() .services(["client", "router", "subgraph"].into()) .span_names( @@ -439,8 +799,9 @@ async fn test_resource_mapping_override() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -458,14 +819,6 @@ async fn test_span_metrics() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - router.graceful_shutdown().await; TraceSpec::builder() 
.operation_name("ExampleQuery") .services(["client", "router", "subgraph"].into()) @@ -486,59 +839,39 @@ async fn test_span_metrics() -> Result<(), BoxError> { .measured_span("subgraph") .unmeasured_span("supergraph") .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } -pub(crate) trait DatadogId { - fn to_datadog(&self) -> String; +struct DatadogTraceSpec { + trace_spec: TraceSpec, } -impl DatadogId for TraceId { - fn to_datadog(&self) -> String { - let bytes = &self.to_bytes()[std::mem::size_of::()..std::mem::size_of::()]; - u64::from_be_bytes(bytes.try_into().unwrap()).to_string() +impl Deref for DatadogTraceSpec { + type Target = TraceSpec; + + fn deref(&self) -> &Self::Target { + &self.trace_spec } } -impl TraceSpec { - #[allow(clippy::too_many_arguments)] - async fn validate_trace(&self, id: TraceId) -> Result<(), BoxError> { - let datadog_id = id.to_datadog(); - let url = format!("http://localhost:8126/test/traces?trace_ids={datadog_id}"); - for _ in 0..10 { - if self.find_valid_trace(&url).await.is_ok() { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(100)).await; - } - self.find_valid_trace(&url).await?; - Ok(()) +impl Verifier for DatadogTraceSpec { + fn spec(&self) -> &TraceSpec { + &self.trace_spec } - #[allow(clippy::too_many_arguments)] - async fn find_valid_trace(&self, url: &str) -> Result<(), BoxError> { - // A valid trace has: - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - - // For now just validate service name. 
- let trace: Value = reqwest::get(url) + async fn get_trace(&self, trace_id: TraceId) -> Result { + let datadog_id = trace_id.to_datadog(); + let url = format!("http://localhost:8126/test/traces?trace_ids={datadog_id}"); + println!("url: {}", url); + let value: serde_json::Value = reqwest::get(url) .await .map_err(|e| anyhow!("failed to contact datadog; {}", e))? .json() - .await?; - tracing::debug!("{}", serde_json::to_string_pretty(&trace)?); - self.verify_trace_participants(&trace)?; - self.verify_spans_present(&trace)?; - self.validate_measured_spans(&trace)?; - self.verify_operation_name(&trace)?; - self.verify_priority_sampled(&trace)?; - self.verify_version(&trace)?; - self.validate_span_kinds(&trace)?; - Ok(()) + .await + .map_err(|e| anyhow!("failed to contact datadog; {}", e))?; + Ok(value) } fn verify_version(&self, trace: &Value) -> Result<(), BoxError> { @@ -556,24 +889,6 @@ impl TraceSpec { Ok(()) } - fn validate_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { - for expected in &self.measured_spans { - assert!( - self.measured_span(trace, expected)?, - "missing measured span {}", - expected - ); - } - for unexpected in &self.unmeasured_spans { - assert!( - !self.measured_span(trace, unexpected)?, - "unexpected measured span {}", - unexpected - ); - } - Ok(()) - } - fn measured_span(&self, trace: &Value, name: &str) -> Result { let binding1 = trace.select_path(&format!( "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", @@ -591,15 +906,7 @@ impl TraceSpec { .unwrap_or_default()) } - fn validate_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { - // Validate that the span.kind has been propagated. We can just do this for a selection of spans. 
- self.validate_span_kind(trace, "router", "server")?; - self.validate_span_kind(trace, "supergraph", "internal")?; - self.validate_span_kind(trace, "http_request", "client")?; - Ok(()) - } - - fn verify_trace_participants(&self, trace: &Value) -> Result<(), BoxError> { + fn verify_services(&self, trace: &Value) -> Result<(), BoxError> { let actual_services: HashSet = trace .select_path("$..service")? .into_iter() @@ -627,7 +934,7 @@ impl TraceSpec { .filter_map(|span_name| span_name.as_string()) .collect(); let mut span_names: HashSet<&str> = self.span_names.clone(); - if self.services.contains("client") { + if self.services.contains(&"client") { span_names.insert("client_request"); } tracing::debug!("found spans {:?}", operation_names); @@ -652,19 +959,24 @@ impl TraceSpec { trace.select_path(&format!("$..[?(@.name == '{}')].meta.['span.kind']", name))?; let binding = binding1.first().or(binding2.first()); - assert!( - binding.is_some(), - "span.kind missing or incorrect {}, {}", - name, - trace - ); - assert_eq!( - binding - .expect("expected binding") - .as_str() - .expect("expected string"), - kind - ); + if binding.is_none() { + return Err(BoxError::from(format!( + "span.kind missing or incorrect {}, {}", + name, trace + ))); + } + + let binding = binding + .expect("expected binding") + .as_str() + .expect("expected string"); + if binding != kind { + return Err(BoxError::from(format!( + "span.kind mismatch, expected {} got {}", + kind, binding + ))); + } + Ok(()) } @@ -685,17 +997,39 @@ impl TraceSpec { } fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError> { - let binding = trace.select_path("$.._sampling_priority_v1")?; - let sampling_priority = binding.first(); - // having this priority set to 1.0 everytime is not a problem as we're doing pre sampling in the full telemetry stack - // So basically if the trace was not sampled it wouldn't get to this stage and so nothing would be sent - assert_eq!( - sampling_priority - .expect("sampling 
priority expected") - .as_f64() - .expect("sampling priority must be a number"), - 1.0 - ); + if let Some(psr) = self.priority_sampled { + let binding = + trace.select_path("$..[?(@.service=='router')].metrics._sampling_priority_v1")?; + if binding.is_empty() { + return Err(BoxError::from("missing sampling priority")); + } + for sampling_priority in binding { + assert_eq!( + sampling_priority + .as_f64() + .expect("psr not string") + .to_string(), + psr, + "psr mismatch" + ); + } + } + Ok(()) + } + + fn verify_span_attributes(&self, _trace: &Value) -> Result<(), BoxError> { Ok(()) } } + +impl TraceSpec { + async fn validate_datadog_trace( + self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { + DatadogTraceSpec { trace_spec: self } + .validate_trace(router, query) + .await + } +} diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml index d6ecc66607..c1c4b2096e 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml @@ -5,14 +5,12 @@ telemetry: enabled: true header_name: apollo-custom-trace-id format: datadog - propagation: - trace_context: true - jaeger: true common: service_name: router resource: env: local1 service.version: router_version_override + preview_datadog_agent_sampling: true datadog: enabled: true batch_processor: diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled.router.yaml new file mode 100644 index 0000000000..49b1528c94 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled.router.yaml @@ -0,0 +1,23 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + 
experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + # NOT always_off to allow us to test a sampling probability of zero + sampler: 0.0 + preview_datadog_agent_sampling: false + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + fixed_span_names: false + enable_span_mapping: false + instrumentation: + spans: + mode: spec_compliant + diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_0.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_0.router.yaml new file mode 100644 index 0000000000..42f56dd642 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_0.router.yaml @@ -0,0 +1,22 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + sampler: 0.0 + preview_datadog_agent_sampling: false + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + fixed_span_names: false + enable_span_mapping: false + instrumentation: + spans: + mode: spec_compliant + diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_1.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_1.router.yaml new file mode 100644 index 0000000000..2334508de4 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_1.router.yaml @@ -0,0 +1,22 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + sampler: 1.0 + preview_datadog_agent_sampling: false + datadog: + enabled: true + batch_processor: + scheduled_delay: 
100ms + fixed_span_names: false + enable_span_mapping: false + instrumentation: + spans: + mode: spec_compliant + diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml index 67c2c070e6..e874c00fab 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true batch_processor: diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_header_propagator_override.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_header_propagator_override.router.yaml new file mode 100644 index 0000000000..595639f1ff --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_header_propagator_override.router.yaml @@ -0,0 +1,29 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + propagation: + datadog: true + request: + header_name: trace-id + common: + service_name: router + parent_based_sampler: false + resource: + env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml new file mode 100644 index 0000000000..c6ec7c22b7 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml 
@@ -0,0 +1,25 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + common: + service_name: router + parent_based_sampler: false + resource: + env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml index d89d104346..19af041c56 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml @@ -11,6 +11,7 @@ telemetry: service_name: router # NOT always_off to allow us to test a sampling probability of zero sampler: 0.0 + preview_datadog_agent_sampling: true datadog: enabled: true batch_processor: diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml index 7d5e1ff2e1..bb793301d0 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true # Span mapping will always override the span name as far as the test agent is concerned diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml index dda383a784..821662b5be 
100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true # Span mapping will always override the span name as far as the test agent is concerned diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml new file mode 100644 index 0000000000..206e72d1b1 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml @@ -0,0 +1,26 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + common: + service_name: router + sampler: 0.00001 + parent_based_sampler: true + resource: + env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml new file mode 100644 index 0000000000..658b7d2361 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml @@ -0,0 +1,25 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + common: + service_name: router + sampler: 0.00001 + parent_based_sampler: false + resource: 
+ env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml index 96160b1831..0603e72c9c 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true enable_span_mapping: true diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml index a01c44fc61..5eba22068b 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true enable_span_mapping: true diff --git a/apollo-router/tests/integration/telemetry/fixtures/jaeger-advanced.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/jaeger-advanced.router.yaml index bb377026d7..c07050677a 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/jaeger-advanced.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/jaeger-advanced.router.yaml @@ -42,8 +42,8 @@ telemetry: request_header: "x-my-header" condition: eq: - - request_header: "head" - - "test" + - 
request_header: "x-my-header-condition" + - "condition" studio.operation.id: studio_operation_id: true supergraph: diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml index f4484786f4..aa56c66187 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml @@ -9,7 +9,7 @@ telemetry: otlp: enabled: true protocol: http - endpoint: /traces + endpoint: batch_processor: scheduled_delay: 10ms metrics: @@ -22,3 +22,15 @@ telemetry: batch_processor: scheduled_delay: 10ms + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_no_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_no_sample.router.yaml new file mode 100644 index 0000000000..77529f500d --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_no_sample.router.yaml @@ -0,0 +1,42 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + preview_datadog_agent_sampling: true + sampler: 0.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No 
newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample.router.yaml new file mode 100644 index 0000000000..6b1f32f71f --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample.router.yaml @@ -0,0 +1,42 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + preview_datadog_agent_sampling: true + sampler: 1.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample_no_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample_no_sample.router.yaml new file mode 100644 index 0000000000..77529f500d --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample_no_sample.router.yaml @@ -0,0 +1,42 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + preview_datadog_agent_sampling: true + sampler: 0.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + 
otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation.router.yaml new file mode 100644 index 0000000000..7352f3d620 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation.router.yaml @@ -0,0 +1,39 @@ +telemetry: + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + preview_datadog_agent_sampling: true + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_agent.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_agent.router.yaml new file mode 100644 index 0000000000..08323073f3 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_agent.router.yaml @@ -0,0 +1,38 @@ +telemetry: + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + otlp: + enabled: true + protocol: http + 
endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml new file mode 100644 index 0000000000..7fd47f096b --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml @@ -0,0 +1,40 @@ +telemetry: + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + parent_based_sampler: false + preview_datadog_agent_sampling: true + service_name: router + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml new file mode 100644 index 0000000000..3bcb4e5db5 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml @@ -0,0 +1,41 @@ +telemetry: + 
apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + zipkin: true + datadog: true + trace_context: true + common: + service_name: router + preview_datadog_agent_sampling: true + sampler: 1.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_no_parent_sampler.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_no_parent_sampler.router.yaml new file mode 100644 index 0000000000..5fdf22e0d6 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_no_parent_sampler.router.yaml @@ -0,0 +1,25 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + parent_based_sampler: false + otlp: + enabled: true + protocol: http + endpoint: /traces + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + diff --git a/apollo-router/tests/integration/telemetry/jaeger.rs b/apollo-router/tests/integration/telemetry/jaeger.rs index 7d9dc1bf46..8c38c59ec2 100644 --- a/apollo-router/tests/integration/telemetry/jaeger.rs +++ b/apollo-router/tests/integration/telemetry/jaeger.rs @@ -1,7 +1,7 @@ extern crate core; use std::collections::HashSet; -use std::time::Duration; +use std::ops::Deref; use anyhow::anyhow; use opentelemetry_api::trace::TraceId; @@ -9,7 +9,10 @@ use serde_json::json; use 
serde_json::Value; use tower::BoxError; +use crate::integration::common::Query; use crate::integration::common::Telemetry; +use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; @@ -24,22 +27,13 @@ async fn test_reload() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); for _ in 0..2 { - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_jaeger_trace(&mut router, Query::default()) + .await?; router.touch_config().await; router.assert_reloaded().await; } @@ -58,21 +52,11 @@ async fn test_remote_root() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .build() + .validate_jaeger_trace(&mut router, Query::default()) + .await?; router.graceful_shutdown().await; Ok(()) @@ -89,21 +73,12 @@ async fn test_local_root() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = 
router.execute_untraced_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_jaeger_trace(&mut router, Query::builder().traced(false).build()) + .await?; router.graceful_shutdown().await; Ok(()) @@ -120,8 +95,9 @@ async fn test_local_root_no_sample() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_, response) = router.execute_untraced_query(&query).await; + let (_, response) = router + .execute_query(Query::builder().traced(false).build()) + .await; assert!(response.headers().get("apollo-custom-trace-id").is_some()); router.graceful_shutdown().await; @@ -138,19 +114,13 @@ async fn test_local_root_50_percent_sample() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}\n","variables":{}, "operationName": "ExampleQuery"}); for _ in 0..100 { - let (id, result) = router.execute_untraced_query(&query).await; - - if result.headers().get("apollo-custom-trace-id").is_some() - && validate_trace( - id, - &query, - Some("ExampleQuery"), - &["router", "subgraph"], - false, - ) + if TraceSpec::builder() + .services(["router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_jaeger_trace(&mut router, Query::builder().traced(false).build()) .await .is_ok() { @@ -176,10 +146,11 @@ async fn test_no_telemetry() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_, response) = router.execute_untraced_query(&query).await; - 
assert!(response.headers().get("apollo-custom-trace-id").is_none()); - + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .build() + .validate_jaeger_trace(&mut router, Query::builder().traced(false).build()) + .await?; router.graceful_shutdown().await; Ok(()) } @@ -194,22 +165,11 @@ async fn test_default_operation() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery1 {topProducts{name}}","variables":{}}); - - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery1"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .build() + .validate_jaeger_trace(&mut router, Query::default()) + .await?; router.graceful_shutdown().await; Ok(()) } @@ -225,15 +185,11 @@ async fn test_anonymous_operation() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query {topProducts{name}}","variables":{}}); - - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace(id, &query, None, &["client", "router", "subgraph"], false).await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .build() + .validate_jaeger_trace(&mut router, Query::builder().build()) + .await?; router.graceful_shutdown().await; Ok(()) } @@ -248,28 +204,21 @@ async fn test_selected_operation() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery1 {topProducts{name}}\nquery ExampleQuery2 {topProducts{name}}","variables":{}, "operationName": "ExampleQuery2"}); - - let (id, result) = router.execute_query(&query).await; - 
assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery2"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder().services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery2") + .build() + .validate_jaeger_trace( + &mut router, + Query::builder() + .body(json!({"query":"query ExampleQuery1 {topProducts{name}}\nquery ExampleQuery2 {topProducts{name}}","variables":{}, "operationName": "ExampleQuery2"}) + ).build(), + ).await?; router.graceful_shutdown().await; Ok(()) } #[tokio::test(flavor = "multi_thread")] -async fn test_span_customization() -> Result<(), BoxError> { +async fn test_span_attributes() -> Result<(), BoxError> { if std::env::var("TEST_APOLLO_KEY").is_ok() && std::env::var("TEST_APOLLO_GRAPH_REF").is_ok() { let mut router = IntegrationTest::builder() .telemetry(Telemetry::Jaeger) @@ -280,16 +229,75 @@ async fn test_span_customization() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, _res) = router.execute_query(&query).await; - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - true, - ) - .await?; + // attributes: + // http.request.method: true + // http.response.status_code: true + // url.path: true + // "http.request.header.x-my-header": + // request_header: "x-my-header" + // "http.request.header.x-not-present": + // request_header: "x-not-present" + // default: nope + // "http.request.header.x-my-header-condition": + // request_header: "x-my-header" + // condition: + // eq: + // - request_header: "head" + // - "test" + // studio.operation.id: + // studio_operation_id: true + // supergraph: + // attributes: + // graphql.operation.name: true + // graphql.operation.type: true + // graphql.document: true + // subgraph: + // attributes: + // 
subgraph.graphql.operation.type: true + // subgraph.name: true + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery") + .span_attribute( + "router", + [ + ("http.request.method", "POST"), + ("http.response.status_code", "200"), + ("url.path", "/"), + ("http.request.header.x-my-header", "test"), + ("http.request.header.x-not-present", "nope"), + ("http.request.header.x-my-header-condition", "test"), + ("studio.operation.id", "*"), + ] + .into(), + ) + .span_attribute( + "supergraph", + [ + ("graphql.operation.name", "ExampleQuery"), + ("graphql.operation.type", "query"), + ("graphql.document", "query ExampleQuery {topProducts{name}}"), + ] + .into(), + ) + .span_attribute( + "subgraph", + [ + ("subgraph.graphql.operation.type", "query"), + ("subgraph.name", "products"), + ] + .into(), + ) + .build() + .validate_jaeger_trace( + &mut router, + Query::builder() + .header("x-my-header", "test") + .header("x-my-header-condition", "condition") + .build(), + ) + .await?; router.graceful_shutdown().await; } Ok(()) @@ -305,9 +313,8 @@ async fn test_decimal_trace_id() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery1 {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; + let (id, result) = router.execute_query(Query::default()).await; let id_from_router: u128 = result .headers() .get("apollo-custom-trace-id") @@ -317,341 +324,208 @@ async fn test_decimal_trace_id() -> Result<(), BoxError> { .parse() .expect("expected decimal trace ID"); assert_eq!(format!("{:x}", id_from_router), id.to_string()); - - validate_trace( - id, - &query, - Some("ExampleQuery1"), - &["client", "router", "subgraph"], - false, - ) - .await?; router.graceful_shutdown().await; Ok(()) } -async fn validate_trace( - id: TraceId, - query: &Value, - operation_name: Option<&str>, - services: &[&'static str], - 
custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - let params = url::form_urlencoded::Serializer::new(String::new()) - .append_pair("service", services.first().expect("expected root service")) - .finish(); - - let id = id.to_string(); - let url = format!("http://localhost:16686/api/traces/{id}?{params}"); - for _ in 0..10 { - if find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await - .is_ok() - { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(1000)).await; - } - find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await?; - Ok(()) +struct JaegerTraceSpec { + trace_spec: TraceSpec, } +impl Deref for JaegerTraceSpec { + type Target = TraceSpec; -async fn find_valid_trace( - url: &str, - query: &Value, - operation_name: Option<&str>, - services: &[&'static str], - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - // A valid trace has: - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - let trace: Value = reqwest::get(url) - .await - .map_err(|e| anyhow!("failed to contact jaeger; {}", e))? 
- .json() - .await?; - tracing::debug!("{}", serde_json::to_string_pretty(&trace)?); - - // Verify that we got all the participants in the trace - verify_trace_participants(&trace, services)?; - - // Verify that we got the expected span operation names - verify_spans_present(&trace, operation_name, services)?; - - // Verify that all spans have a path to the root 'client_request' span - verify_span_parenting(&trace, services)?; - - // Verify that root span fields are present - verify_root_span_fields(&trace, operation_name)?; - - // Verify that supergraph span fields are present - verify_supergraph_span_fields(&trace, query, operation_name, custom_span_instrumentation)?; - - // Verify that router span fields are present - verify_router_span_fields(&trace, custom_span_instrumentation)?; - - Ok(()) + fn deref(&self) -> &Self::Target { + &self.trace_spec + } } -fn verify_router_span_fields( - trace: &Value, - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - let router_span = trace.select_path("$..spans[?(@.operationName == 'router')]")?[0]; - // We can't actually assert the values on a span. Only that a field has been set. - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'client.name')].value")? - .first(), - Some(&&Value::String("custom_name".to_string())) - ); - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'client.version')].value")? - .first(), - Some(&&Value::String("1.0".to_string())) - ); - assert!(router_span - .select_path("$.logs[*].fields[?(@.key == 'histogram.apollo_router_span')].value")? - .is_empty(),); - assert!(router_span - .select_path("$.logs[*].fields[?(@.key == 'histogram.apollo_router_span')].value")? - .is_empty(),); - if custom_span_instrumentation { - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'http.request.method')].value")? 
- .first(), - Some(&&Value::String("POST".to_string())) - ); - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'http.request.header.x-not-present')].value")? - .first(), - Some(&&Value::String("nope".to_string())) - ); - assert_eq!( - router_span - .select_path( - "$.tags[?(@.key == 'http.request.header.x-my-header-condition')].value" - )? - .first(), - Some(&&Value::String("test".to_string())) - ); - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'studio.operation.id')].value")? - .first(), - Some(&&Value::String( - "f60e643d7f52ecda23216f86409d7e2e5c3aa68c".to_string() - )) - ); +impl Verifier for JaegerTraceSpec { + fn spec(&self) -> &TraceSpec { + &self.trace_spec } - Ok(()) -} - -fn verify_root_span_fields(trace: &Value, operation_name: Option<&str>) -> Result<(), BoxError> { - // We can't actually assert the values on a span. Only that a field has been set. - let root_span_name = operation_name - .map(|name| format!("query {}", name)) - .unwrap_or("query".to_string()); - let request_span = trace.select_path(&format!( - "$..spans[?(@.operationName == '{root_span_name}')]" - ))?[0]; - - if let Some(operation_name) = operation_name { - assert_eq!( - request_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? - .first(), - Some(&&Value::String(operation_name.to_string())) - ); - } else { - assert!(request_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? 
- .is_empty(),); + fn verify_span_attributes(&self, trace: &Value) -> Result<(), BoxError> { + for (span, attributes) in &self.span_attributes { + for (key, value) in attributes { + let binding = trace.select_path(&format!( + "$..spans[?(@.operationName == '{span}')]..tags..[?(@.key == '{key}')].value" + ))?; + + let actual_value = binding + .first() + .unwrap_or_else(|| panic!("could not find attribute {key} on {span}")); + match actual_value { + Value::String(_) if *value == "*" => continue, + Value::String(s) => { + assert_eq!(s, value, "unexpected attribute {key} on {span}") + } + Value::Number(_) if *value == "*" => continue, + Value::Number(n) => assert_eq!( + n.to_string(), + *value, + "unexpected attribute {key} on {span}" + ), + _ => panic!("unexpected value type"), + } + } + } + Ok(()) } - assert_eq!( - request_span - .select_path("$.tags[?(@.key == 'graphql.operation.type')].value")? - .first(), - Some(&&Value::String("query".to_string())) - ); + async fn get_trace(&self, trace_id: TraceId) -> Result { + let params = url::form_urlencoded::Serializer::new(String::new()) + .append_pair( + "service", + self.trace_spec + .services + .first() + .expect("expected root service"), + ) + .finish(); - Ok(()) -} + let id = trace_id.to_string(); + let url = format!("http://localhost:16686/api/traces/{id}?{params}"); + println!("url: {}", url); + let value: serde_json::Value = reqwest::get(url) + .await + .map_err(|e| anyhow!("failed to contact jaeger; {}", e))? + .json() + .await + .map_err(|e| anyhow!("failed to contact jaeger; {}", e))?; -fn verify_supergraph_span_fields( - trace: &Value, - query: &Value, - operation_name: Option<&str>, - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - // We can't actually assert the values on a span. Only that a field has been set. 
- let supergraph_span = trace.select_path("$..spans[?(@.operationName == 'supergraph')]")?[0]; - - if let Some(operation_name) = operation_name { - assert_eq!( - supergraph_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? - .first(), - Some(&&Value::String(operation_name.to_string())) - ); - } else { - assert!(supergraph_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? - .is_empty(),); - } - if custom_span_instrumentation { - assert_eq!( - supergraph_span - .select_path("$.tags[?(@.key == 'graphql.operation.type')].value")? - .first(), - Some(&&Value::String("query".to_string())) - ); + Ok(value) } - assert_eq!( - supergraph_span - .select_path("$.tags[?(@.key == 'graphql.document')].value")? - .first(), - Some(&&Value::String( - query - .as_object() - .expect("should have been an object") - .get("query") - .expect("must have a query") - .as_str() - .expect("must be a string") - .to_string() - )) - ); + fn verify_version(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_version) = &self.version { + let binding = trace.select_path("$..version")?; + let version = binding.first(); + assert_eq!( + version + .expect("version expected") + .as_str() + .expect("version must be a string"), + expected_version + ); + } + Ok(()) + } - Ok(()) -} + fn measured_span(&self, trace: &Value, name: &str) -> Result { + let binding1 = trace.select_path(&format!( + "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", + name + ))?; + let binding2 = trace.select_path(&format!( + "$..[?(@.name == '{}')].metrics.['_dd.measured']", + name + ))?; + Ok(binding1 + .first() + .or(binding2.first()) + .and_then(|v| v.as_f64()) + .map(|v| v == 1.0) + .unwrap_or_default()) + } -fn verify_trace_participants(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let actual_services: HashSet = trace - .select_path("$..serviceName")? 
- .into_iter() - .filter_map(|service| service.as_string()) - .collect(); - tracing::debug!("found services {:?}", actual_services); - - let expected_services = services - .iter() - .map(|s| s.to_string()) - .collect::>(); - if actual_services != expected_services { - return Err(BoxError::from(format!( - "incomplete traces, got {actual_services:?} expected {expected_services:?}" - ))); + fn verify_services(&self, trace: &Value) -> Result<(), BoxError> { + let actual_services: HashSet = trace + .select_path("$..serviceName")? + .into_iter() + .filter_map(|service| service.as_string()) + .collect(); + tracing::debug!("found services {:?}", actual_services); + + let expected_services = self + .trace_spec + .services + .iter() + .map(|s| s.to_string()) + .collect::>(); + if actual_services != expected_services { + return Err(BoxError::from(format!( + "incomplete traces, got {actual_services:?} expected {expected_services:?}" + ))); + } + Ok(()) } - Ok(()) -} -fn verify_spans_present( - trace: &Value, - operation_name: Option<&str>, - services: &[&'static str], -) -> Result<(), BoxError> { - let operation_names: HashSet = trace - .select_path("$..operationName")? - .into_iter() - .filter_map(|span_name| span_name.as_string()) - .collect(); - let mut expected_operation_names: HashSet = HashSet::from( - [ - "execution", - "subgraph server", - operation_name - .map(|name| format!("query {name}")) - .unwrap_or("query".to_string()) - .as_str(), - "supergraph", - "fetch", - //"parse_query", Parse query will only happen once - //"query_planning", query planning will only happen once - "subgraph", - ] - .map(|s| s.into()), - ); - if services.contains(&"client") { - expected_operation_names.insert("client_request".into()); + fn verify_spans_present(&self, trace: &Value) -> Result<(), BoxError> { + let operation_names: HashSet = trace + .select_path("$..operationName")? 
+ .into_iter() + .filter_map(|span_name| span_name.as_string()) + .collect(); + + let mut span_names: HashSet<&str> = self.span_names.clone(); + if self.services.contains(&"client") { + span_names.insert("client_request"); + } + tracing::debug!("found spans {:?}", operation_names); + let missing_operation_names: Vec<_> = span_names + .iter() + .filter(|o| !operation_names.contains(**o)) + .collect(); + if !missing_operation_names.is_empty() { + return Err(BoxError::from(format!( + "spans did not match, got {operation_names:?}, missing {missing_operation_names:?}" + ))); + } + Ok(()) } - tracing::debug!("found spans {:?}", operation_names); - let missing_operation_names: Vec<_> = expected_operation_names - .iter() - .filter(|o| !operation_names.contains(*o)) - .collect(); - if !missing_operation_names.is_empty() { - return Err(BoxError::from(format!( - "spans did not match, got {operation_names:?}, missing {missing_operation_names:?}" - ))); + + fn validate_span_kind(&self, _trace: &Value, _name: &str, _kind: &str) -> Result<(), BoxError> { + Ok(()) } - Ok(()) -} -fn verify_span_parenting(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let root_span = if services.contains(&"client") { - trace.select_path("$..spans[?(@.operationName == 'client_request')]")?[0] - } else { - trace.select_path("$..spans[?(@.operationName == 'query ExampleQuery')]")?[0] - }; - let spans = trace.select_path("$..spans[*]")?; - for span in spans { - let mut span_path = vec![span.select_path("$.operationName")?[0] - .as_str() - .expect("operation name not not found")]; - let mut current = span; - while let Some(parent) = parent_span(trace, current) { - span_path.push( - parent.select_path("$.operationName")?[0] + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_operation_name) = &self.operation_name { + let binding = + trace.select_path("$..spans[?(@.operationName == 'supergraph')]..tags[?(@.key == 
'graphql.operation.name')].value")?; + let operation_name = binding.first(); + if operation_name.is_none() { + return Err(BoxError::from("graphql.operation.name not found")); + } + assert_eq!( + operation_name + .expect("graphql.operation.name expected") .as_str() - .expect("operation name not not found"), + .expect("graphql.operation.name must be a string"), + expected_operation_name ); - current = parent; } - tracing::debug!("span path to root: '{:?}'", span_path); - if current != root_span { - return Err(BoxError::from(format!( - "span {:?} did not have a path to the root span", - span.select_path("$.operationName")?, - ))); + Ok(()) + } + + fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(psr) = self.priority_sampled { + let binding = + trace.select_path("$..[?(@.service=='router')].metrics._sampling_priority_v1")?; + if binding.is_empty() { + return Err(BoxError::from("missing sampling priority")); + } + for sampling_priority in binding { + assert_eq!( + sampling_priority + .as_f64() + .expect("psr not string") + .to_string(), + psr + ); + } } + Ok(()) } - Ok(()) } -fn parent_span<'a>(trace: &'a Value, span: &'a Value) -> Option<&'a Value> { - span.select_path("$.references[?(@.refType == 'CHILD_OF')].spanID") - .ok()? - .into_iter() - .filter_map(|id| id.as_str()) - .filter_map(|id| { - trace - .select_path(&format!("$..spans[?(@.spanID == '{id}')]")) - .ok()? 
- .into_iter() - .next() - }) - .next() +impl TraceSpec { + async fn validate_jaeger_trace( + self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { + JaegerTraceSpec { trace_spec: self } + .validate_trace(router, query) + .await + } } diff --git a/apollo-router/tests/integration/telemetry/logging.rs b/apollo-router/tests/integration/telemetry/logging.rs index 9e41160572..59dc1c7ccd 100644 --- a/apollo-router/tests/integration/telemetry/logging.rs +++ b/apollo-router/tests/integration/telemetry/logging.rs @@ -1,9 +1,9 @@ -use serde_json::json; use tower::BoxError; use uuid::Uuid; use crate::integration::common::graph_os_enabled; use crate::integration::common::IntegrationTest; +use crate::integration::common::Query; use crate::integration::common::Telemetry; #[tokio::test(flavor = "multi_thread")] @@ -22,16 +22,15 @@ async fn test_json() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""static_one":"test""#).await; #[cfg(unix)] { - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains( r#""schema.id":"dd8960ccefda82ca58e8ac0bc266459fd49ee8215fd6b3cc72e7bc3d7f3464b9""#, @@ -39,11 +38,11 @@ async fn test_json() -> Result<(), BoxError> { .await; } - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains(r#""on_supergraph_response_event":"on_supergraph_event""#) .await; - router.execute_query(&query).await; + router.execute_default_query().await; 
router.assert_log_contains(r#""response_status":200"#).await; router.graceful_shutdown().await; @@ -66,24 +65,23 @@ async fn test_json_promote_span_attributes() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""static_one":"test""#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""response_status":200"#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""too_big":true"#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""too_big":"nope""#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains(r#""graphql.document":"query ExampleQuery {topProducts{name}}""#) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_not_contains(r#""should_not_log""#).await; router.assert_log_not_contains(r#""another_one""#).await; router.graceful_shutdown().await; @@ -107,14 +105,13 @@ async fn test_json_uuid_format() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - let (trace_id, _) = router.execute_query(&query).await; + let (trace_id, _) = 
router.execute_default_query().await; router .assert_log_contains(&format!("{}", Uuid::from_bytes(trace_id.to_bytes()))) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; router.graceful_shutdown().await; @@ -137,14 +134,13 @@ async fn test_text_uuid_format() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - let (trace_id, _) = router.execute_query(&query).await; + let (trace_id, _) = router.execute_default_query().await; router .assert_log_contains(&format!("{}", Uuid::from_bytes(trace_id.to_bytes()))) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; router.graceful_shutdown().await; @@ -166,18 +162,17 @@ async fn test_json_sampler_off() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""static_one":"test""#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains(r#""on_supergraph_response_event":"on_supergraph_event""#) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""response_status":200"#).await; router.graceful_shutdown().await; @@ -200,17 +195,16 @@ async fn test_text() -> 
Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; + router.execute_query(Query::default()).await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; router.assert_log_contains("span_id").await; router .assert_log_contains(r#"on_supergraph_response_event=on_supergraph_event"#) .await; - router.execute_query(&query).await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; + router.execute_query(Query::default()).await; router.assert_log_contains("response_status=200").await; router.graceful_shutdown().await; Ok(()) @@ -231,14 +225,12 @@ async fn test_text_sampler_off() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; - router.execute_query(&query).await; + router.execute_default_query().await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("response_status=200").await; router.graceful_shutdown().await; Ok(()) diff --git a/apollo-router/tests/integration/telemetry/metrics.rs b/apollo-router/tests/integration/telemetry/metrics.rs index cd9c1c4550..56a5d6223d 100644 --- a/apollo-router/tests/integration/telemetry/metrics.rs +++ b/apollo-router/tests/integration/telemetry/metrics.rs @@ -3,6 +3,7 @@ use std::time::Duration; use serde_json::json; use crate::integration::common::graph_os_enabled; +use 
crate::integration::common::Query; use crate::integration::IntegrationTest; const PROMETHEUS_CONFIG: &str = include_str!("fixtures/prometheus.router.yaml"); @@ -106,9 +107,7 @@ async fn test_subgraph_auth_metrics() { router.update_config(PROMETHEUS_CONFIG).await; router.assert_reloaded().await; // This one will not be signed, counters shouldn't increment. - router - .execute_query(&json! {{ "query": "query { me { name } }"}}) - .await; + router.execute_query(Query::default()).await; // Get Prometheus metrics. let metrics_response = router.get_metrics_response().await.unwrap(); @@ -137,7 +136,9 @@ async fn test_metrics_bad_query() { router.start().await; router.assert_started().await; // This query won't make it to the supergraph service - router.execute_bad_query().await; + router + .execute_query(Query::default().with_bad_query()) + .await; router.assert_metrics_contains(r#"apollo_router_operations_total{http_response_status_code="400",otel_scope_name="apollo/router"} 1"#, None).await; } @@ -157,7 +158,9 @@ async fn test_bad_queries() { None, ) .await; - router.execute_bad_content_type().await; + router + .execute_query(Query::default().with_bad_content_type()) + .await; router .assert_metrics_contains( @@ -166,7 +169,9 @@ async fn test_bad_queries() { ) .await; - router.execute_bad_query().await; + router + .execute_query(Query::default().with_bad_query()) + .await; router .assert_metrics_contains( r#"apollo_router_http_requests_total{error="Must provide query string",status="400",otel_scope_name="apollo/router"}"#, @@ -174,7 +179,9 @@ async fn test_bad_queries() { ) .await; - router.execute_huge_query().await; + router + .execute_query(Query::default().with_huge_query()) + .await; router .assert_metrics_contains( r#"apollo_router_http_requests_total{error="Request body payload too large",status="413",otel_scope_name="apollo/router"} 1"#, @@ -259,14 +266,12 @@ async fn test_gauges_on_reload() { router.execute_default_query().await; // Introspection query - router 
- .execute_query(&json!({"query":"{__schema {types {name}}}","variables":{}})) - .await; + router.execute_query(Query::introspection()).await; // Persisted query router .execute_query( - &json!({"query": "{__typename}", "variables":{}, "extensions": {"persistedQuery":{"version" : 1, "sha256Hash" : "ecf4edb46db40b5132295c0291d62fb65d6759a9eedfa4d5d612dd5ec54a6b38"}}}) + Query::builder().body(json!({"query": "{__typename}", "variables":{}, "extensions": {"persistedQuery":{"version" : 1, "sha256Hash" : "ecf4edb46db40b5132295c0291d62fb65d6759a9eedfa4d5d612dd5ec54a6b38"}}})).build() ) .await; diff --git a/apollo-router/tests/integration/telemetry/mod.rs b/apollo-router/tests/integration/telemetry/mod.rs index 8df0a1d753..6319182e62 100644 --- a/apollo-router/tests/integration/telemetry/mod.rs +++ b/apollo-router/tests/integration/telemetry/mod.rs @@ -1,3 +1,8 @@ +use std::collections::HashMap; +use std::collections::HashSet; + +use opentelemetry_api::trace::TraceId; + #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] mod datadog; #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] @@ -6,5 +11,61 @@ mod logging; mod metrics; mod otlp; mod propagation; +mod verifier; #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] mod zipkin; + +struct TraceSpec { + operation_name: Option, + version: Option, + services: Vec<&'static str>, + span_names: HashSet<&'static str>, + measured_spans: HashSet<&'static str>, + unmeasured_spans: HashSet<&'static str>, + priority_sampled: Option<&'static str>, + subgraph_sampled: Option, + trace_id: Option, + span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>>, +} + +#[buildstructor::buildstructor] +impl TraceSpec { + #[allow(clippy::too_many_arguments)] + #[builder] + pub fn new( + operation_name: Option, + version: Option, + services: Vec<&'static str>, + span_names: HashSet<&'static str>, + measured_spans: HashSet<&'static str>, 
+ unmeasured_spans: HashSet<&'static str>, + priority_sampled: Option<&'static str>, + subgraph_sampled: Option, + trace_id: Option, + span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>>, + ) -> Self { + Self { + operation_name, + version, + services, + span_names, + measured_spans, + unmeasured_spans, + priority_sampled, + subgraph_sampled, + span_attributes, + trace_id, + } + } +} + +#[allow(dead_code)] +pub trait DatadogId { + fn to_datadog(&self) -> u64; +} +impl DatadogId for TraceId { + fn to_datadog(&self) -> u64 { + let bytes = &self.to_bytes()[std::mem::size_of::()..std::mem::size_of::()]; + u64::from_be_bytes(bytes.try_into().unwrap()) + } +} diff --git a/apollo-router/tests/integration/telemetry/otlp.rs b/apollo-router/tests/integration/telemetry/otlp.rs index 7eae04f567..af73bc32e8 100644 --- a/apollo-router/tests/integration/telemetry/otlp.rs +++ b/apollo-router/tests/integration/telemetry/otlp.rs @@ -1,15 +1,13 @@ extern crate core; use std::collections::HashSet; -use std::time::Duration; +use std::ops::Deref; use anyhow::anyhow; -use itertools::Itertools; use opentelemetry_api::trace::TraceId; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceResponse; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceResponse; use prost::Message; -use serde_json::json; use serde_json::Value; use tower::BoxError; use wiremock::matchers::method; @@ -18,37 +16,110 @@ use wiremock::Mock; use wiremock::MockServer; use wiremock::ResponseTemplate; +use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::common::Telemetry; +use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::DatadogId; +use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; #[tokio::test(flavor = "multi_thread")] async fn test_basic() -> Result<(), BoxError> { - let mock_server 
= wiremock::MockServer::start().await; - Mock::given(method("POST")) - .and(path("/traces")) - .respond_with(ResponseTemplate::new(200).set_body_raw( - ExportTraceServiceResponse::default().encode_to_vec(), - "application/x-protobuf", - )) - .expect(1..) - .mount(&mock_server) + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .config(&config) + .build() .await; - Mock::given(method("POST")) - .and(path("/metrics")) - .respond_with(ResponseTemplate::new(200).set_body_raw( - ExportMetricsServiceResponse::default().encode_to_vec(), - "application/x-protobuf", - )) - .expect(1..) - .mount(&mock_server) + + router.start().await; + router.assert_started().await; + + for _ in 0..2 { + TraceSpec::builder() + .operation_name("ExampleQuery") + .services(["client", "router", "subgraph"].into()) + .span_names( + [ + "query_planning", + "client_request", + "ExampleQuery__products__0", + "fetch", + "execution", + "query ExampleQuery", + "subgraph server", + "parse_query", + "http_request", + ] + .into(), + ) + .subgraph_sampled(true) + .build() + .validate_otlp_trace(&mut router, &mock_server, Query::default()) + .await?; + TraceSpec::builder() + .service("router") + .build() + .validate_otlp_metrics(&mock_server) + .await?; + router.touch_config().await; + router.assert_reloaded().await; + } + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_otlp_request_with_datadog_propagator() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = 
include_str!("fixtures/otlp_datadog_propagation.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .config(&config) + .build() .await; - let config = include_str!("fixtures/otlp.router.yaml") + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace(&mut router, &mock_server, Query::default()) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_otlp_request_with_datadog_propagator_no_agent() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_propagation_no_agent.router.yaml") .replace("", &mock_server.uri()); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { - endpoint: format!("{}/traces", mock_server.uri()), + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) .config(&config) .build() @@ -57,170 +128,668 @@ async fn test_basic() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - for _ in 0..2 { - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_telemetry( + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, &mock_server, - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, + 
Query::builder().traced(true).build(), ) .await?; - router.touch_config().await; - router.assert_reloaded().await; - } router.graceful_shutdown().await; Ok(()) } -async fn validate_telemetry( - mock_server: &MockServer, - _id: TraceId, - query: &Value, - operation_name: Option<&str>, - services: &[&'static str], - custom_span_instrumentation: bool, +#[tokio::test(flavor = "multi_thread")] +async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( ) -> Result<(), BoxError> { - for _ in 0..10 { - let trace_valid = find_valid_trace( - mock_server, - query, - operation_name, - services, - custom_span_instrumentation, + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .config(&config) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).build(), ) + .await?; + // ---------------------- zipkin propagator with unsampled trace + // Testing for an unsampled trace, so it should be sent to the otlp exporter with sampling priority set 0 + // But it shouldn't send the trace to subgraph as the trace is originally not sampled, the main goal is to measure it at the DD agent level + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(false) + .header("X-B3-TraceId", 
"80f198ee56343ba864fe8b2a57d3eff7") + .header("X-B3-ParentSpanId", "05e3ac9a4f6e3b90") + .header("X-B3-SpanId", "e457b5a2e4d86bd1") + .header("X-B3-Sampled", "0") + .build(), + ) + .await?; + // ---------------------- trace context propagation + // Testing for a trace containing the right tracestate with m and psr for DD and a sampled trace, so it should be sent to the otlp exporter with sampling priority set to 1 + // And it should also send the trace to subgraph as the trace is sampled + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(true) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01", + ) + .header("tracestate", "m=1,psr=1") + .build(), + ) + .await?; + // ---------------------- + // Testing for a trace containing the right tracestate with m and psr for DD and an unsampled trace, so it should be sent to the otlp exporter with sampling priority set to 0 + // But it shouldn't send the trace to subgraph as the trace is originally not sampled, the main goal is to measure it at the DD agent level + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(false) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-02", + ) + .header("tracestate", "m=1,psr=0") + .build(), + ) + .await?; + // ---------------------- + // Testing for a trace containing a tracestate m and psr with psr set to 1 for DD and an unsampled trace, so it should be sent to the otlp exporter with sampling priority set to 1 + // It should not send the trace to the subgraph as we didn't use the datadog propagator and therefore the trace will remain unsampled. 
+ TraceSpec::builder() + .services(["router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(false) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-03", + ) + .header("tracestate", "m=1,psr=1") + .build(), + ) + .await?; + + // Be careful if you add the same kind of test crafting your own trace id, make sure to increment the previous trace id by 1 if not you'll receive all the previous spans tested with the same trace id before + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_untraced_request_no_sample_datadog_agent() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_agent_no_sample.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .config(&config) + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .build() .await; - let metrics_valid = find_valid_metrics(mock_server, query, operation_name, services).await; + router.start().await; + router.assert_started().await; - if metrics_valid.is_ok() && trace_valid.is_ok() { - return Ok(()); - } + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(false).build(), + ) + .await?; + router.graceful_shutdown().await; + Ok(()) +} - tokio::time::sleep(Duration::from_millis(100)).await; +#[tokio::test(flavor = "multi_thread")] +async fn test_untraced_request_sample_datadog_agent() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is 
not enabled"); } - find_valid_trace( - mock_server, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await?; - find_valid_metrics(mock_server, query, operation_name, services).await?; + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_agent_sample.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .config(&config) + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .build() + .await; + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(false).build(), + ) + .await?; + router.graceful_shutdown().await; Ok(()) } -async fn find_valid_trace( - mock_server: &MockServer, - _query: &Value, - _operation_name: Option<&str>, - services: &[&'static str], - _custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - let requests = mock_server - .received_requests() - .await - .expect("Could not get otlp requests"); - - // A valid trace has: - // * A valid service name - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - let traces: Vec<_>= requests - .iter() - .filter_map(|r| { - if r.url.path().ends_with("/traces") { +#[tokio::test(flavor = "multi_thread")] +async fn test_untraced_request_sample_datadog_agent_unsampled() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_agent_sample_no_sample.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + 
.telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .config(&config) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(false).build(), + ) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_propagation.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + // We're using datadog propagation as this is what we are trying to test. + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .config(config) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // Parent based sampling. psr MUST be populated with the value that we pass in. 
+ TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("-1") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("-1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("0").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("2") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("2").build(), + ) + .await?; + + // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. 
+ TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).build(), + ) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .config(config) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // The router will ignore the upstream PSR as parent based sampling is disabled. 
+ + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("-1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("0").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("2").build(), + ) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).build(), + ) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +struct OtlpTraceSpec<'a> { + trace_spec: TraceSpec, + mock_server: &'a MockServer, +} +impl Deref for OtlpTraceSpec<'_> { + type Target = TraceSpec; + + fn deref(&self) -> &Self::Target { + &self.trace_spec + } +} + +impl Verifier for OtlpTraceSpec<'_> { + fn verify_span_attributes(&self, _span: &Value) -> Result<(), BoxError> { + // TODO + Ok(()) + } + fn spec(&self) -> &TraceSpec { + &self.trace_spec + } + + fn measured_span(&self, trace: &Value, name: &str) -> Result { + let binding1 = trace.select_path(&format!( + "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", + name + ))?; + let binding2 = trace.select_path(&format!( + 
"$..[?(@.name == '{}')].metrics.['_dd.measured']", + name + ))?; + Ok(binding1 + .first() + .or(binding2.first()) + .and_then(|v| v.as_f64()) + .map(|v| v == 1.0) + .unwrap_or_default()) + } + + async fn find_valid_metrics(&self) -> Result<(), BoxError> { + let requests = self + .mock_server + .received_requests() + .await + .expect("Could not get otlp requests"); + if let Some(metrics) = requests.iter().find(|r| r.url.path().ends_with("/metrics")) { + let metrics = opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest::decode(bytes::Bytes::copy_from_slice(&metrics.body))?; + let json_metrics = serde_json::to_value(metrics)?; + // For now just validate service name. + self.verify_services(&json_metrics)?; + + Ok(()) + } else { + Err(anyhow!("No metrics received").into()) + } + } + + async fn get_trace(&self, trace_id: TraceId) -> Result { + let requests = self.mock_server.received_requests().await; + let trace = Value::Array(requests.unwrap_or_default().iter().filter(|r| r.url.path().ends_with("/traces")) + .filter_map(|r| { match opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest::decode( bytes::Bytes::copy_from_slice(&r.body), ) { Ok(trace) => { match serde_json::to_value(trace) { - Ok(trace) => { Some(Ok(trace)) } - Err(e) => { - Some(Err(BoxError::from(format!("failed to decode trace: {}", e)))) + Ok(trace) => { + Some(trace) + } + Err(_) => { + None } } } - Err(e) => { - Some(Err(BoxError::from(format!("failed to decode trace: {}", e)))) + Err(_) => { + None } } - } - else { - None - } - }) - .try_collect()?; - if !traces.is_empty() { - let json_trace = serde_json::Value::Array(traces); - verify_trace_participants(&json_trace, services)?; + }).filter(|t| { + let datadog_trace_id = TraceId::from_u128(trace_id.to_datadog() as u128); + let trace_found1 = !t.select_path(&format!("$..[?(@.traceId == '{}')]", trace_id)).unwrap_or_default().is_empty(); + let trace_found2 = !t.select_path(&format!("$..[?(@.traceId == 
'{}')]", datadog_trace_id)).unwrap_or_default().is_empty(); + trace_found1 | trace_found2 + }).collect()); + Ok(trace) + } + fn verify_version(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_version) = &self.version { + let binding = trace.select_path("$..version")?; + let version = binding.first(); + assert_eq!( + version + .expect("version expected") + .as_str() + .expect("version must be a string"), + expected_version + ); + } Ok(()) - } else { - Err(anyhow!("No traces received").into()) } -} -fn verify_trace_participants(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let actual_services: HashSet = trace - .select_path("$..resource.attributes[?(@.key=='service.name')].value.stringValue")? - .into_iter() - .filter_map(|service| service.as_string()) - .collect(); - tracing::debug!("found services {:?}", actual_services); - - let expected_services = services - .iter() - .map(|s| s.to_string()) - .collect::>(); - if actual_services != expected_services { - return Err(BoxError::from(format!( - "incomplete traces, got {actual_services:?} expected {expected_services:?}" - ))); + fn verify_services(&self, trace: &Value) -> Result<(), axum::BoxError> { + let actual_services: HashSet = trace + .select_path("$..resource.attributes..[?(@.key == 'service.name')].value.stringValue")? + .into_iter() + .filter_map(|service| service.as_string()) + .collect(); + tracing::debug!("found services {:?}", actual_services); + let expected_services = self + .services + .iter() + .map(|s| s.to_string()) + .collect::>(); + if actual_services != expected_services { + return Err(BoxError::from(format!( + "incomplete traces, got {actual_services:?} expected {expected_services:?}" + ))); + } + Ok(()) + } + + fn verify_spans_present(&self, trace: &Value) -> Result<(), BoxError> { + let operation_names: HashSet = trace + .select_path("$..spans..name")? 
+ .into_iter() + .filter_map(|span_name| span_name.as_string()) + .collect(); + let mut span_names: HashSet<&str> = self.span_names.clone(); + if self.services.contains(&"client") { + span_names.insert("client_request"); + } + tracing::debug!("found spans {:?}", operation_names); + let missing_operation_names: Vec<_> = span_names + .iter() + .filter(|o| !operation_names.contains(**o)) + .collect(); + if !missing_operation_names.is_empty() { + return Err(BoxError::from(format!( + "spans did not match, got {operation_names:?}, missing {missing_operation_names:?}" + ))); + } + Ok(()) + } + + fn validate_span_kind(&self, trace: &Value, name: &str, kind: &str) -> Result<(), BoxError> { + let kind = match kind { + "internal" => 1, + "client" => 3, + "server" => 2, + _ => panic!("unknown kind"), + }; + let binding1 = trace.select_path(&format!( + "$..spans..[?(@.kind == {})]..[?(@.key == 'otel.original_name')].value..[?(@ == '{}')]", + kind, name + ))?; + let binding2 = trace.select_path(&format!( + "$..spans..[?(@.kind == {} && @.name == '{}')]", + kind, name + ))?; + let binding = binding1.first().or(binding2.first()); + + if binding.is_none() { + return Err(BoxError::from(format!( + "span.kind missing or incorrect {}, {}", + name, kind + ))); + } + Ok(()) + } + + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_operation_name) = &self.operation_name { + let binding = + trace.select_path("$..[?(@.name == 'supergraph')]..[?(@.key == 'graphql.operation.name')].value.stringValue")?; + let operation_name = binding.first(); + assert_eq!( + operation_name + .expect("graphql.operation.name expected") + .as_str() + .expect("graphql.operation.name must be a string"), + expected_operation_name + ); + } + Ok(()) + } + + fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(psr) = self.priority_sampled { + let binding = trace.select_path( + "$..[?(@.name == 'execution')]..[?(@.key == 
'sampling.priority')].value.intValue", + )?; + if binding.is_empty() { + return Err(BoxError::from("missing sampling priority")); + } + for sampling_priority in binding { + assert_eq!( + sampling_priority + .as_i64() + .expect("psr not an integer") + .to_string(), + psr + ); + } + } else { + assert!(trace.select_path("$..[?(@.name == 'execution')]..[?(@.key == 'sampling.priority')].value.intValue")?.is_empty()) + } + Ok(()) } - Ok(()) } -fn validate_service_name(trace: Value) -> Result<(), BoxError> { - let service_name = - trace.select_path("$..resource.attributes[?(@.key=='service.name')].value.stringValue")?; - assert_eq!( - service_name.first(), - Some(&&Value::String("router".to_string())) - ); - Ok(()) +async fn mock_otlp_server() -> MockServer { + let mock_server = wiremock::MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/traces")) + .respond_with(ResponseTemplate::new(200).set_body_raw( + ExportTraceServiceResponse::default().encode_to_vec(), + "application/x-protobuf", + )) + .expect(1..) + .mount(&mock_server) + .await; + Mock::given(method("POST")) + .and(path("/metrics")) + .respond_with(ResponseTemplate::new(200).set_body_raw( + ExportMetricsServiceResponse::default().encode_to_vec(), + "application/x-protobuf", + )) + .expect(1..) 
+ .mount(&mock_server) + .await; + mock_server } -async fn find_valid_metrics( - mock_server: &MockServer, - _query: &Value, - _operation_name: Option<&str>, - _services: &[&'static str], -) -> Result<(), BoxError> { - let requests = mock_server - .received_requests() +impl TraceSpec { + async fn validate_otlp_trace( + self, + router: &mut IntegrationTest, + mock_server: &MockServer, + query: Query, + ) -> Result<(), BoxError> { + OtlpTraceSpec { + trace_spec: self, + mock_server, + } + .validate_trace(router, query) + .await + } + async fn validate_otlp_metrics(self, mock_server: &MockServer) -> Result<(), BoxError> { + OtlpTraceSpec { + trace_spec: self, + mock_server, + } + .validate_metrics() .await - .expect("Could not get otlp requests"); - if let Some(metrics) = requests.iter().find(|r| r.url.path().ends_with("/metrics")) { - let metrics = opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest::decode(bytes::Bytes::copy_from_slice(&metrics.body))?; - let json_trace = serde_json::to_value(metrics)?; - // For now just validate service name. 
- validate_service_name(json_trace)?; - - Ok(()) - } else { - Err(anyhow!("No metrics received").into()) } } diff --git a/apollo-router/tests/integration/telemetry/propagation.rs b/apollo-router/tests/integration/telemetry/propagation.rs index e458f1986c..9505efa558 100644 --- a/apollo-router/tests/integration/telemetry/propagation.rs +++ b/apollo-router/tests/integration/telemetry/propagation.rs @@ -3,6 +3,7 @@ use tower::BoxError; use crate::integration::common::graph_os_enabled; use crate::integration::common::IntegrationTest; +use crate::integration::common::Query; use crate::integration::common::Telemetry; #[tokio::test(flavor = "multi_thread")] @@ -12,8 +13,7 @@ async fn test_trace_id_via_header() -> Result<(), BoxError> { return Ok(()); } async fn make_call(router: &mut IntegrationTest, trace_id: &str) { - let _ = router.execute_query_with_headers(&json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name}}","variables":{}}), - [("id_from_header".to_string(), trace_id.to_string())].into()).await; + let _ = router.execute_query(Query::builder().body(json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name}}","variables":{}})).header("id_from_header".to_string(), trace_id.to_string()).build()).await; } let mut router = IntegrationTest::builder() diff --git a/apollo-router/tests/integration/telemetry/verifier.rs b/apollo-router/tests/integration/telemetry/verifier.rs new file mode 100644 index 0000000000..3fe9fdabbd --- /dev/null +++ b/apollo-router/tests/integration/telemetry/verifier.rs @@ -0,0 +1,159 @@ +use std::time::Duration; + +use anyhow::anyhow; +use opentelemetry_api::trace::SpanContext; +use opentelemetry_api::trace::TraceId; +use serde_json::Value; +use tower::BoxError; + +use crate::integration::common::Query; +use crate::integration::telemetry::TraceSpec; +use crate::integration::IntegrationTest; + +pub trait Verifier { + fn spec(&self) -> &TraceSpec; + async fn 
validate_trace( + &self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { + let (id, response) = router.execute_query(query).await; + if let Some(spec_id) = &self.spec().trace_id { + assert_eq!(id.to_string(), *spec_id, "trace id"); + } + for _ in 0..20 { + if self.find_valid_trace(id).await.is_ok() { + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + self.find_valid_trace(id).await?; + let subgraph_context = router.subgraph_context(); + assert!(response.status().is_success()); + self.validate_subgraph(subgraph_context)?; + Ok(()) + } + + async fn validate_metrics(&self) -> Result<(), BoxError> { + for _ in 0..10 { + if self.find_valid_metrics().await.is_ok() { + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + self.find_valid_metrics().await?; + Ok(()) + } + + async fn find_valid_metrics(&self) -> Result<(), BoxError> { + unimplemented!("find_valid_metrics") + } + + fn validate_subgraph(&self, subgraph_context: SpanContext) -> Result<(), BoxError> { + self.validate_subgraph_priority_sampled(&subgraph_context)?; + self.validate_subgraph_sampled(&subgraph_context)?; + Ok(()) + } + fn validate_subgraph_sampled(&self, subgraph_context: &SpanContext) -> Result<(), BoxError> { + if let Some(sampled) = self.spec().priority_sampled { + assert_eq!( + subgraph_context.trace_state().get("psr"), + Some(sampled), + "subgraph psr" + ); + } + + Ok(()) + } + + fn validate_subgraph_priority_sampled( + &self, + subgraph_context: &SpanContext, + ) -> Result<(), BoxError> { + if let Some(sampled) = self.spec().subgraph_sampled { + assert_eq!(subgraph_context.is_sampled(), sampled, "subgraph sampled"); + } + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + async fn find_valid_trace(&self, trace_id: TraceId) -> Result<(), BoxError> { + // A valid trace has: + // * All three services + // * The correct spans + // * All spans are parented + // * Required attributes of 'router' span has been set + + // For 
now just validate service name. + let trace: Value = self.get_trace(trace_id).await?; + println!("trace: {}", trace_id); + self.verify_services(&trace)?; + println!("services verified"); + self.verify_spans_present(&trace)?; + println!("spans present verified"); + self.verify_measured_spans(&trace)?; + println!("measured spans verified"); + self.verify_operation_name(&trace)?; + println!("operation name verified"); + self.verify_priority_sampled(&trace)?; + println!("priority sampled verified"); + self.verify_version(&trace)?; + println!("version verified"); + self.verify_span_kinds(&trace)?; + println!("span kinds verified"); + self.verify_span_attributes(&trace)?; + println!("span attributes verified"); + Ok(()) + } + + async fn get_trace(&self, trace_id: TraceId) -> Result; + + fn verify_version(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { + for expected in &self.spec().measured_spans { + let measured = self.measured_span(trace, expected)?; + if !measured { + return Err(anyhow!("missing measured span {}", expected).into()); + } + } + for unexpected in &self.spec().unmeasured_spans { + let measured = self.measured_span(trace, unexpected)?; + if measured { + return Err(anyhow!("unexpected measured span {}", measured).into()); + } + } + Ok(()) + } + + fn measured_span(&self, trace: &Value, name: &str) -> Result; + + fn verify_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { + // Validate that the span.kind has been propagated. We can just do this for a selection of spans. 
+ if self.spec().span_names.contains("router") { + self.validate_span_kind(trace, "router", "server")?; + } + + if self.spec().span_names.contains("supergraph") { + self.validate_span_kind(trace, "supergraph", "internal")?; + } + + if self.spec().span_names.contains("http_request") { + self.validate_span_kind(trace, "http_request", "client")?; + } + + Ok(()) + } + + fn verify_services(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_spans_present(&self, trace: &Value) -> Result<(), BoxError>; + + fn validate_span_kind(&self, trace: &Value, name: &str, kind: &str) -> Result<(), BoxError>; + + fn verify_span_attributes(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError>; +} diff --git a/apollo-router/tests/integration/telemetry/zipkin.rs b/apollo-router/tests/integration/telemetry/zipkin.rs index c0d5e0a8d5..45f51620d0 100644 --- a/apollo-router/tests/integration/telemetry/zipkin.rs +++ b/apollo-router/tests/integration/telemetry/zipkin.rs @@ -1,15 +1,17 @@ extern crate core; use std::collections::HashSet; -use std::time::Duration; +use std::ops::Deref; use anyhow::anyhow; use opentelemetry_api::trace::TraceId; -use serde_json::json; use serde_json::Value; use tower::BoxError; +use crate::integration::common::Query; use crate::integration::common::Telemetry; +use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; @@ -24,22 +26,13 @@ async fn test_basic() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); for _ in 0..2 { - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - 
validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_zipkin_trace(&mut router, Query::default()) + .await?; router.touch_config().await; router.assert_reloaded().await; } @@ -47,85 +40,119 @@ async fn test_basic() -> Result<(), BoxError> { Ok(()) } -async fn validate_trace( - id: TraceId, - query: &Value, - operation_name: Option<&str>, - services: &[&'static str], - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - let params = url::form_urlencoded::Serializer::new(String::new()) - .append_pair("service", services.first().expect("expected root service")) - .finish(); - - let url = format!("http://localhost:9411/api/v2/trace/{id}?{params}"); - for _ in 0..10 { - if find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await - .is_ok() - { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(100)).await; +struct ZipkinTraceSpec { + trace_spec: TraceSpec, +} +impl Deref for ZipkinTraceSpec { + type Target = TraceSpec; + + fn deref(&self) -> &Self::Target { + &self.trace_spec } - find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await?; - Ok(()) } -async fn find_valid_trace( - url: &str, - _query: &Value, - _operation_name: Option<&str>, - services: &[&'static str], - _custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - // A valid trace has: - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - - // For now just validate service name. - let trace: Value = reqwest::get(url) - .await - .map_err(|e| anyhow!("failed to contact zipkin; {}", e))? 
- .json() - .await?; - tracing::debug!("{}", serde_json::to_string_pretty(&trace)?); - verify_trace_participants(&trace, services)?; +impl Verifier for ZipkinTraceSpec { + fn verify_span_attributes(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + fn verify_version(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } - Ok(()) + fn measured_span(&self, _trace: &Value, _name: &str) -> Result { + Ok(true) + } + + fn verify_span_kinds(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + + fn verify_services(&self, trace: &Value) -> Result<(), axum::BoxError> { + let actual_services: HashSet = trace + .select_path("$..serviceName")? + .into_iter() + .filter_map(|service| service.as_string()) + .collect(); + tracing::debug!("found services {:?}", actual_services); + + let expected_services = self + .trace_spec + .services + .iter() + .map(|s| s.to_string()) + .collect::>(); + if actual_services != expected_services { + return Err(BoxError::from(format!( + "incomplete traces, got {actual_services:?} expected {expected_services:?}" + ))); + } + Ok(()) + } + + fn verify_spans_present(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + + fn validate_span_kind(&self, _trace: &Value, _name: &str, _kind: &str) -> Result<(), BoxError> { + Ok(()) + } + + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_operation_name) = &self.operation_name { + let binding = trace + .select_path("$..[?(@.name == 'supergraph')].tags..['graphql.operation.name']")?; + let operation_name = binding.first(); + assert_eq!( + operation_name + .expect("graphql.operation.name expected") + .as_str() + .expect("graphql.operation.name must be a string"), + expected_operation_name + ); + } + Ok(()) + } + + fn verify_priority_sampled(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + + async fn get_trace(&self, trace_id: TraceId) -> Result { + let params = 
url::form_urlencoded::Serializer::new(String::new()) + .append_pair( + "service", + self.trace_spec + .services + .first() + .expect("expected root service"), + ) + .finish(); + + let id = trace_id.to_string(); + let url = format!("http://localhost:9411/api/v2/trace/{id}?{params}"); + println!("url: {}", url); + let value: serde_json::Value = reqwest::get(url) + .await + .map_err(|e| anyhow!("failed to contact datadog; {}", e))? + .json() + .await + .map_err(|e| anyhow!("failed to contact datadog; {}", e))?; + Ok(value) + } + + fn spec(&self) -> &TraceSpec { + &self.trace_spec + } } -fn verify_trace_participants(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let actual_services: HashSet = trace - .select_path("$..serviceName")? - .into_iter() - .filter_map(|service| service.as_string()) - .collect(); - tracing::debug!("found services {:?}", actual_services); - - let expected_services = services - .iter() - .map(|s| s.to_string()) - .collect::>(); - if actual_services != expected_services { - return Err(BoxError::from(format!( - "incomplete traces, got {actual_services:?} expected {expected_services:?}" - ))); +impl TraceSpec { + async fn validate_zipkin_trace( + self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { + ZipkinTraceSpec { trace_spec: self } + .validate_trace(router, query) + .await } - Ok(()) } diff --git a/apollo-router/tests/integration/traffic_shaping.rs b/apollo-router/tests/integration/traffic_shaping.rs index feb9a7e725..579cb2b2a5 100644 --- a/apollo-router/tests/integration/traffic_shaping.rs +++ b/apollo-router/tests/integration/traffic_shaping.rs @@ -6,6 +6,7 @@ use tower::BoxError; use wiremock::ResponseTemplate; use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::common::Telemetry; use crate::integration::IntegrationTest; @@ -99,9 +100,13 @@ async fn test_router_timeout_operation_name_in_tracing() -> Result<(), BoxError> 
router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ - "query": "query UniqueName { topProducts { name } }" - })) + .execute_query( + Query::builder() + .body(json!({ + "query": "query UniqueName { topProducts { name } }" + })) + .build(), + ) .await; assert_eq!(response.status(), 504); let response = response.text().await?; diff --git a/apollo-router/tests/samples_tests.rs b/apollo-router/tests/samples_tests.rs index 7f06f1d5cc..5beba9d4b5 100644 --- a/apollo-router/tests/samples_tests.rs +++ b/apollo-router/tests/samples_tests.rs @@ -31,6 +31,8 @@ use wiremock::ResponseTemplate; pub(crate) mod common; pub(crate) use common::IntegrationTest; +use crate::common::Query; + fn main() -> Result> { let args = Arguments::from_args(); let mut tests = Vec::new(); @@ -497,7 +499,12 @@ impl TestExecution { writeln!(out, "header: {:?}\n", headers).unwrap(); let (_, response) = router - .execute_query_with_headers(&request, headers.clone()) + .execute_query( + Query::builder() + .body(request) + .headers(headers.clone()) + .build(), + ) .await; writeln!(out, "response headers: {:?}", response.headers()).unwrap(); diff --git a/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx b/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx index 23cd378332..8d306fc6d2 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx @@ -58,11 +58,16 @@ Consequently you can filter for these operations in Datadog APM: ## OTLP configuration -To export traces to Datadog via OTLP, you must do the following: -- Configure the Datadog agent to accept OTLP traces. -- Configure the router to send traces to the Datadog agent. +[OpenTelemetry protocol (OTLP)](https://opentelemetry.io/docs/specs/otel/protocol/) is the recommended protocol for transmitting telemetry, including traces, to Datadog. 
-To configure the Datadog agent, add OTLP configuration to your `datadog.yaml`. For example: +To setup traces to Datadog via OTLP, you must do the following: + +- Modify the default configuration of the Datadog Agent to accept OTLP traces from the router. +- Configure the router to send traces to the configured Datadog Agent. + +### Datadog Agent configuration + +To configure the Datadog Agent, add OTLP configuration to your `datadog.yaml`. For example: ```yaml title="datadog.yaml" otlp_config: @@ -72,26 +77,68 @@ otlp_config: endpoint: :4317 ``` -To configure the router, enable the [OTLP exporter](/router/configuration/telemetry/exporters/tracing/otlp) and set `endpoint: `. For example: +For additional Datadog Agent configuration details, review Datadog's [Enabling OTLP Ingestion on the Datadog Agent](https://docs.datadoghq.com/opentelemetry/interoperability/otlp_ingest_in_the_agent/?tab=host#enabling-otlp-ingestion-on-the-datadog-agent) documentation. + +### Router configuration + +To configure the router, enable the [OTLP exporter](./otlp) and set `endpoint: `. For example: ```yaml title="router.yaml" telemetry: exporters: tracing: + common: + # Configured to forward 10 percent of spans from the Datadog Agent to Datadog. Experiment to find a value that is good for you. + preview_datadog_agent_sampling: true + sampler: 0.1 + otlp: enabled: true - # Optional endpoint, either 'default' or a URL (Defaults to http://127.0.0.1:4317) endpoint: "${env.DATADOG_AGENT_HOST}:4317" + # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. + batch_processor: + max_concurrent_exports: 100 +``` + +Adjusting the `sampler` controls the sampling decisions that the router makes on its own and decreases the rate at which you sample. Your sample rate can have a direct impact on your Datadog bill. 
+ + + +If you see warning messages from the router regarding the batch span processor, you may need to adjust your `batch_processor` settings in your `exporter` config to match the volume of spans being created in a router instance. This applies to both OTLP and the Datadog native exporters. + + + +### Enabling Datadog Agent sampling + +The Datadog APM view relies on traces to generate metrics. For these metrics to be accurate, all requests must be sampled and sent to the Datadog agent. +To prevent all traces from being sent to Datadog, in your router you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Configured to forward 10 percent of spans from the Datadog Agent to Datadog. Experiment to find a value that is good for you. + sampler: 0.1 + preview_datadog_agent_sampling: true ``` -For more details about Datadog configuration, see [Datadog Agent configuration](https://docs.datadoghq.com/opentelemetry/otlp_ingest_in_the_agent/?tab=host). + + + - The router doesn't support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). + + - Configuring `traces_per_second` in the Datadog Agent will not dynamically adjust the router's sampling rate to meet the target rate. + + - Using `preview_datadog_agent_sampling` will send _all_ spans to the Datadog Agent. This will have an impact on the resource usage and performance of both the router and Datadog Agent. 
+ + ### Enabling log correlation To enable Datadog log correlation, you must configure `dd.trace_id` to appear on the `router` span: - + ```yaml title="router.yaml" telemetry: instrumentation: @@ -118,10 +165,19 @@ The router can be configured to connect to either the native, default Datadog ag telemetry: exporters: tracing: - datadog: - enabled: true - # Optional endpoint, either 'default' or a URL (Defaults to http://127.0.0.1:8126) - endpoint: "http://${env.DATADOG_AGENT_HOST}:8126" + common: + # Configured to forward 10 percent of spans from the Datadog Agent to Datadog. Experiment to find a value that is good for you. + preview_datadog_agent_sampling: true + sampler: 0.1 + + datadog: + enabled: true + # Optional endpoint, either 'default' or a URL (Defaults to http://127.0.0.1:8126) + endpoint: "http://${env.DATADOG_AGENT_HOST}:8126" + + # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. + batch_processor: + max_concurrent_exports: 100 # Enable graphql.operation.name attribute on supergraph spans. instrumentation: @@ -132,6 +188,12 @@ telemetry: graphql.operation.name: true ``` + + +Depending on the volume of spans being created in a router instance, it will be necessary to adjust the `batch_processor` settings in your `exporter` config. This applies to both OTLP and the Datadog native exporter. + + + ### `enabled` Set to true to enable the Datadog exporter. Defaults to false. @@ -273,11 +335,11 @@ If you have introduced a new span in a custom build of the Router you can enable telemetry: exporters: tracing: - datadog: - batch_processor: + datadog: + batch_processor: max_export_batch_size: 512 max_concurrent_exports: 1 - max_export_timeout: 30s + max_export_timeout: 30s max_queue_size: 2048 scheduled_delay: 5s ``` @@ -297,3 +359,108 @@ telemetry: | `resource_mapping` | See [config](#resource_mapping) | A map of span names to attribute names. 
| | `span_metrics` | See [config](#span_metrics) | A map of span names to boolean. 
 +## Sampler configuration + +When using Datadog to gain insight into your router's performance, you need to decide whether to use the Datadog APM view or rely on OTLP metrics. +The Datadog APM view is driven by traces. In order for this view to be accurate, all requests must be sampled and sent to the Datadog Agent. + +Tracing is expensive both in terms of APM costs and router performance, so you typically will want to set the `sampler` to sample at low rates in production environments. +This, however, impacts the APM view, which will show only a small percentage of traces. + +To mitigate this, you can use Datadog Agent sampling mode, where _all_ traces are sent to the Datadog Agent but only a percentage of them are forwarded to Datadog. This keeps the APM view accurate while lowering costs. Note that the router will incur a performance cost of having an effective sample rate of 100%. + +Use the following guidelines on how to configure the `sampler` and `preview_datadog_agent_sampling` to get the desired behavior: + +**I want the APM view to show metrics for 100% of traffic, and I am OK with the performance impact on the router.** + +Set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # All requests will be traced and sent to the Datadog agent. + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. + preview_datadog_agent_sampling: true + sampler: 0.1 +``` + +**I want the Datadog Agent to be in control of the percentage of traces sent to Datadog.** + +Use the Datadog Agent's `probabilistic_sampling` sampler option and set the router's `sampler` to `always_on` to allow the agent to control the sampling rate. 
+ +Router config: +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # All requests will be traced and sent to the Datadog agent. + sampler: always_on +``` + +Datadog agent config: +```yaml +otlp_config: + traces: + probabilistic_sampling: + # Only 10 percent of spans will be forwarded to Datadog + sampling_percentage: 10 +``` + +**I want the best performance from the router and I'm not concerned with the APM view. I use metrics and traces to monitor my application.** + +Set the `sampler` to a low value to reduce the number of traces sent to Datadog. Leave `preview_datadog_agent_sampling` set to `false`. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of requests will be traced and sent to the Datadog agent. The APM view will only show a subset of total request data but the Router will perform better. + sampler: 0.1 + preview_datadog_agent_sampling: false +``` + +### `sampler` (default: `always_on`) + +The `sampler` configuration allows you to control the sampling decisions that the router will make on its own and decrease the rate at which you sample, which can have a direct impact on your Datadog bill. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded to the Datadog agent. Experiment to find a value that is good for you! + sampler: 0.1 +``` + +If you are using the Datadog APM view then you should set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +### `preview_datadog_agent_sampling` (default: `false`) + +The Datadog APM view relies on traces to generate metrics. For this to be accurate 100% of requests must be sampled and sent to the Datadog agent. +To prevent ALL traces from then being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. 
+ +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + preview_datadog_agent_sampling: true + sampler: 0.1 +``` + +Using `preview_datadog_agent_sampling` will send _all_ spans to the Datadog Agent, but only the percentage of traces configured by the `sampler` will be forwarded to Datadog. This means that your APM view will be accurate, but it will incur performance and resource usage costs for both the router and Datadog Agent to send and receive all spans. + +If your use case allows your APM view to show only a subset of traces, then you can set `preview_datadog_agent_sampling` to `false`. You should alternatively rely on OTLP metrics to gain insight into the router's performance. + + + +- The router doesn't support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). + +- Configuring `traces_per_second` in the Datadog Agent will not dynamically adjust the router's sampling rate to meet the target rate. + + diff --git a/docs/source/reference/router/telemetry/trace-exporters/overview.mdx b/docs/source/reference/router/telemetry/trace-exporters/overview.mdx index 8656a528f2..76f54d7e4d 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/overview.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/overview.mdx @@ -114,6 +114,33 @@ telemetry: - `parent_based_sampler` enables clients to make the sampling decision. This guarantees that a trace that starts at a client will also have spans at the router. You may wish to disable it (setting `parent_based_sampler: false`) if your router is exposed directly to the internet. + + +### `preview_datadog_agent_sampling` + + + + + + +Enable accurate Datadog APM views with the `preview_datadog_agent_sampling` option. + +The Datadog APM view relies on traces to generate metrics. 
For this to be accurate, all requests must be sampled and sent to the Datadog Agent. + +To both enable accurate APM views and prevent _all_ traces from being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + sampler: 0.1 + preview_datadog_agent_sampling: true +``` + +To learn more details and limitations about this option, go to [`preview_datadog_agent_sampling`](/router/configuration/telemetry/exporters/tracing/datadog#preview_datadog_agent_sampling) in DataDog trace exporter docs. + ### `propagation` The `telemetry.exporters.tracing.propagation` section allows you to configure which propagators are active in addition to those automatically activated by using an exporter. @@ -238,17 +265,21 @@ Using this configuration you will have a response header called `my-trace-id` co ## Tracing common reference -| Attribute | Default | Description | -|----------------------------------|--------------------------|-------------------------------------------------| -| `service_name` | `unknown_service:router` | The OpenTelemetry service name. | -| `service_namespace` | | The OpenTelemetry namespace. | -| `resource` | | The OpenTelemetry resource to attach to traces. | -| `experimental_response_trace_id` | | Return the trace ID in a response header. | -| `max_attributes_per_event` | 128 | The maximum number of attributes per event. | -| `max_attributes_per_link` | 128 | The maximum number of attributes per link. | -| `max_attributes_per_span` | 128 | The maximum number of attributes per span. | -| `max_events_per_span` | 128 | The maximum number of events per span. | -| `max_links_per_span` | 128 | The maximum links per span. 
| +| Attribute | Default | Description | +|----------------------------------|--------------------------|--------------------------------------------------| +| `parent_based_sampler` | `true` | Sampling decisions from upstream will be honored | +| `preview_datadog_agent_sampling` | `false` | Send all spans to the Datadog agent. | +| `propagation` | | The propagation configuration. | +| `sampler` | `always_on` | The sampling rate for traces. | +| `service_name` | `unknown_service:router` | The OpenTelemetry service name. | +| `service_namespace` | | The OpenTelemetry namespace. | +| `resource` | | The OpenTelemetry resource to attach to traces. | +| `experimental_response_trace_id` | | Return the trace ID in a response header. | +| `max_attributes_per_event` | 128 | The maximum number of attributes per event. | +| `max_attributes_per_link` | 128 | The maximum number of attributes per link. | +| `max_attributes_per_span` | 128 | The maximum number of attributes per span. | +| `max_events_per_span` | 128 | The maximum number of events per span. | +| `max_links_per_span` | 128 | The maximum links per span. | ## Related topics