Skip to content

Commit

Permalink
chore(inc-984): Add basic PII detector (#6712)
Browse files Browse the repository at this point in the history
In INC-984 we had an incident where IP addresses were copied into a new
unscrubbed field. It took us too long to detect that issue.

This metric could be used in combination with anomaly detection to
detect such faulty rollouts more quickly, but not prevent them.

TBD is who actually gets those alerts and how we should react to them.
  • Loading branch information
untitaker authored Jan 2, 2025
1 parent e45fa84 commit d78add3
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 15 deletions.
29 changes: 15 additions & 14 deletions rust_snuba/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions rust_snuba/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ zstd = "0.12.3"
serde_with = "3.8.1"
seq-macro = "0.3"
sentry_arroyo = "2.19.5"
regex = "1.11.1"


[dev-dependencies]
Expand Down
52 changes: 51 additions & 1 deletion rust_snuba/src/strategies/processor.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use std::collections::BTreeMap;
use std::sync::Arc;
use std::sync::{Arc, LazyLock};

use chrono::{DateTime, Utc};
use regex::Regex;
use sentry::{Hub, SentryFutureExt};
use sentry_arroyo::backends::kafka::types::KafkaPayload;
use sentry_arroyo::counter;
Expand Down Expand Up @@ -217,6 +218,8 @@ impl<TResult: Clone, TNext: Clone> MessageProcessor<TResult, TNext> {
return Err(maybe_err);
};

record_message_stats(payload);

self.process_payload(msg).map_err(|error| {
counter!("invalid_message");

Expand Down Expand Up @@ -335,6 +338,30 @@ fn _validate_schema(
}
}

static IP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
let ipv4 = r"(?x)(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\.
(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\.
(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\.
(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])";

Regex::new(ipv4).unwrap()
});

fn record_message_stats(payload: &[u8]) {
// for context see INC-984 -- the idea is to have a canary metric for whenever we inadvertently
// start ingesting much more PII into our systems. this does not prevent PII leakage but might
// improve our response time to the leak.
if let Ok(string) = std::str::from_utf8(payload) {
if IP_REGEX.is_match(string) {
counter!("message_stats.has_ipv4_pattern", 1, "outcome" => "true");
} else {
counter!("message_stats.has_ipv4_pattern", 1, "outcome" => "false");
}
} else {
counter!("message_stats.has_ipv4_pattern", 1, "outcome" => "invalid_payload");
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -386,4 +413,27 @@ mod tests {
strategy.submit(message).unwrap(); // Does not error
let _ = strategy.join(None);
}

#[test]
fn test_ip_addresses() {
let test_cases = [
("192.168.0.1", true),
("255.255.255.255", true),
("0.0.0.0", true),
("2001:0db8:85a3:0000:0000:8a2e:0370:7334", false), // Valid IPv6, but we don't handle IPv6 yet
("::1", false),
("fe80::1%lo0", false),
("invalid192.168.0.1", true),
("invalid", false),
];

for (address, is_ipv4) in test_cases {
assert_eq!(
IP_REGEX.is_match(address),
is_ipv4,
"{} failed IPv4 validation",
address
);
}
}
}

0 comments on commit d78add3

Please sign in to comment.