Skip to content

Commit

Permalink
fix(cloudwatch): limit each chunk to 50k characters
Browse files (browse the repository at this point in the history)
  • Loading branch information
puchy22 committed Oct 30, 2024
1 parent aae8f25 commit 22265e7
Showing 1 changed file with 76 additions and 13 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,6 +30,10 @@ def execute(self):
"US_PASSPORT",
],
)

# Maximum character count for each chunk
MAX_CHUNK_CHAR_COUNT = 50000 # Adjust based on performance needs

pii_language = logs_client.audit_config.get("pii_language", "en")
for log_group in logs_client.log_groups.values():
report = Check_Report_AWS(self.metadata())
Expand All @@ -43,6 +47,9 @@ def execute(self):
report.resource_tags = log_group.tags
log_group_pii = []

if log_group.name == "aws-waf-logs-prowler-pro-saas-dev-log-group":
pass

if log_group.log_streams:
for log_stream_name in log_group.log_streams:
log_stream_pii = {}
Expand All @@ -51,35 +58,92 @@ def execute(self):
for event in log_group.log_streams[log_stream_name]
]

# Process log data in chunks
chunk_size = 50000 # Adjust chunk size based on performance
for i in range(0, len(log_stream_events), chunk_size):
chunk = "\n".join(log_stream_events[i : i + chunk_size])
# Chunk the log_stream_events to avoid exceeding the PII detection character limit
chunk = ""
cumulative_char_count = 0
chunk_start_index = 0 # Track starting index of the chunk in the log_stream_events

for i, log_event in enumerate(log_stream_events):
# Check if adding this log_event would exceed the chunk character limit
if len(chunk) + len(log_event) + 1 > MAX_CHUNK_CHAR_COUNT:
# Perform PII detection on the current chunk
pii_detection_result = analyzer.analyze(
text=chunk,
entities=critical_pii_entities,
score_threshold=1,
language=pii_language,
)

# Track cumulative character count to map PII to log event within this chunk
cumulative_char_count = 0
for j, log_event_chunk in enumerate(
log_stream_events[chunk_start_index:i]
):
log_event_length = len(log_event_chunk)
for pii in pii_detection_result:
# Check if PII start position falls within this log event
if (
cumulative_char_count
<= pii.start
< cumulative_char_count + log_event_length
):
flagged_event = log_group.log_streams[
log_stream_name
][chunk_start_index + j]
cloudwatch_timestamp = (
convert_to_cloudwatch_timestamp_format(
flagged_event["timestamp"]
)
)
if (
cloudwatch_timestamp
not in log_stream_pii
):
log_stream_pii[cloudwatch_timestamp] = (
SecretsDict()
)

# Add the detected PII entity to log_stream_pii
log_stream_pii[
cloudwatch_timestamp
].add_secret(
pii.start - cumulative_char_count,
pii.entity_type,
)
cumulative_char_count += (
log_event_length + 1
) # +1 to account for '\n'

# Reset the chunk and counters for the next batch
chunk = ""
chunk_start_index = i

# Append current log event to chunk
chunk += log_event + "\n"

# PII detection for each chunk
# Handle the final chunk if it has remaining content
if chunk:
pii_detection_result = analyzer.analyze(
text=chunk,
entities=critical_pii_entities,
score_threshold=1,
language=pii_language,
)

# Track cumulative character count to map PII to log event
cumulative_char_count = 0
for j, log_event in enumerate(
log_stream_events[i : i + chunk_size]
for j, log_event_chunk in enumerate(
log_stream_events[chunk_start_index:]
):
log_event_length = len(log_event)
log_event_length = len(log_event_chunk)
for pii in pii_detection_result:
# Check if PII start position falls within this log event
if (
cumulative_char_count
<= pii.start
< cumulative_char_count + log_event_length
):
flagged_event = log_group.log_streams[
log_stream_name
][j]
][chunk_start_index + j]
cloudwatch_timestamp = (
convert_to_cloudwatch_timestamp_format(
flagged_event["timestamp"]
Expand All @@ -90,14 +154,13 @@ def execute(self):
SecretsDict()
)

# Add the detected PII entity to log_stream_pii
log_stream_pii[cloudwatch_timestamp].add_secret(
pii.start - cumulative_char_count,
pii.entity_type,
)
cumulative_char_count += (
log_event_length + 1
) # +1 to account for '\n'
) # +1 for '\n'

if log_stream_pii:
pii_string = "; ".join(
Expand Down

0 comments on commit 22265e7

Please sign in to comment.