diff --git a/prowler/providers/aws/services/cloudwatch/cloudwatch_log_group_no_critical_pii_in_logs/cloudwatch_log_group_no_critical_pii_in_logs.py b/prowler/providers/aws/services/cloudwatch/cloudwatch_log_group_no_critical_pii_in_logs/cloudwatch_log_group_no_critical_pii_in_logs.py
index 5e549eb394c..6d689f541a3 100644
--- a/prowler/providers/aws/services/cloudwatch/cloudwatch_log_group_no_critical_pii_in_logs/cloudwatch_log_group_no_critical_pii_in_logs.py
+++ b/prowler/providers/aws/services/cloudwatch/cloudwatch_log_group_no_critical_pii_in_logs/cloudwatch_log_group_no_critical_pii_in_logs.py
@@ -30,6 +30,10 @@ def execute(self):
                     "US_PASSPORT",
                 ],
             )
+
+            # Maximum character count for each chunk
+            MAX_CHUNK_CHAR_COUNT = 50000  # Adjust based on performance needs
+
             pii_language = logs_client.audit_config.get("pii_language", "en")
             for log_group in logs_client.log_groups.values():
                 report = Check_Report_AWS(self.metadata())
@@ -51,12 +55,71 @@ def execute(self):
                             for event in log_group.log_streams[log_stream_name]
                         ]
 
-                        # Process log data in chunks
-                        chunk_size = 50000  # Adjust chunk size based on performance
-                        for i in range(0, len(log_stream_events), chunk_size):
-                            chunk = "\n".join(log_stream_events[i : i + chunk_size])
+                        # Chunk the log_stream_events to avoid exceeding the PII detection character limit
+                        chunk = ""
+                        cumulative_char_count = 0
+                        chunk_start_index = 0  # Track starting index of the chunk in the log_stream_events
+
+                        for i, log_event in enumerate(log_stream_events):
+                            # Check if adding this log_event would exceed the chunk character limit
+                            if len(chunk) + len(log_event) + 1 > MAX_CHUNK_CHAR_COUNT:
+                                # Perform PII detection on the current chunk
+                                pii_detection_result = analyzer.analyze(
+                                    text=chunk,
+                                    entities=critical_pii_entities,
+                                    score_threshold=1,
+                                    language=pii_language,
+                                )
+
+                                # Track cumulative character count to map PII to log event within this chunk
+                                cumulative_char_count = 0
+                                for j, log_event_chunk in enumerate(
+                                    log_stream_events[chunk_start_index:i]
+                                ):
+                                    log_event_length = len(log_event_chunk)
+                                    for pii in pii_detection_result:
+                                        # Check if PII start position falls within this log event
+                                        if (
+                                            cumulative_char_count
+                                            <= pii.start
+                                            < cumulative_char_count + log_event_length
+                                        ):
+                                            flagged_event = log_group.log_streams[
+                                                log_stream_name
+                                            ][chunk_start_index + j]
+                                            cloudwatch_timestamp = (
+                                                convert_to_cloudwatch_timestamp_format(
+                                                    flagged_event["timestamp"]
+                                                )
+                                            )
+                                            if (
+                                                cloudwatch_timestamp
+                                                not in log_stream_pii
+                                            ):
+                                                log_stream_pii[cloudwatch_timestamp] = (
+                                                    SecretsDict()
+                                                )
+
+                                            # Add the detected PII entity to log_stream_pii
+                                            log_stream_pii[
+                                                cloudwatch_timestamp
+                                            ].add_secret(
+                                                pii.start - cumulative_char_count,
+                                                pii.entity_type,
+                                            )
+                                    cumulative_char_count += (
+                                        log_event_length + 1
+                                    )  # +1 to account for '\n'
+
+                                # Reset the chunk and counters for the next batch
+                                chunk = ""
+                                chunk_start_index = i
+
+                            # Append current log event to chunk
+                            chunk += log_event + "\n"
 
-                            # PII detection for each chunk
+                        # Handle the final chunk if it has remaining content
+                        if chunk:
                             pii_detection_result = analyzer.analyze(
                                 text=chunk,
                                 entities=critical_pii_entities,
@@ -64,14 +127,12 @@ def execute(self):
                                 score_threshold=1,
                                 language=pii_language,
                             )
-                            # Track cumulative character count to map PII to log event
                             cumulative_char_count = 0
-                            for j, log_event in enumerate(
-                                log_stream_events[i : i + chunk_size]
+                            for j, log_event_chunk in enumerate(
+                                log_stream_events[chunk_start_index:]
                             ):
-                                log_event_length = len(log_event)
+                                log_event_length = len(log_event_chunk)
                                 for pii in pii_detection_result:
-                                    # Check if PII start position falls within this log event
                                     if (
                                         cumulative_char_count
                                         <= pii.start
@@ -79,7 +140,7 @@ def execute(self):
                                     ):
                                         flagged_event = log_group.log_streams[
                                             log_stream_name
-                                        ][j]
+                                        ][chunk_start_index + j]
                                         cloudwatch_timestamp = (
                                             convert_to_cloudwatch_timestamp_format(
                                                 flagged_event["timestamp"]
@@ -90,14 +151,13 @@ def execute(self):
                                                 SecretsDict()
                                             )
 
-                                        # Add the detected PII entity to log_stream_pii
                                         log_stream_pii[cloudwatch_timestamp].add_secret(
                                             pii.start - cumulative_char_count,
                                             pii.entity_type,
                                         )
                                     cumulative_char_count += (
                                         log_event_length + 1
-                                    )  # +1 to account for '\n'
+                                    )  # +1 for '\n'
 
                         if log_stream_pii:
                             pii_string = "; ".join(
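A minimal, self-contained sketch of the offset-to-event mapping that the chunking above performs inline (illustrative only, not part of the diff): FakeResult stands in for the two fields of a presidio RecognizerResult that the check reads, and map_offsets_to_events mirrors the cumulative_char_count arithmetic, assuming each chunk is built by joining log events with a newline.

from dataclasses import dataclass


@dataclass
class FakeResult:
    """Stand-in for the two RecognizerResult fields the check uses."""

    start: int
    entity_type: str


def map_offsets_to_events(events, results):
    """Map chunk-level PII offsets back to the events that produced them."""
    findings = {}
    cumulative_char_count = 0
    for index, event in enumerate(events):
        event_length = len(event)
        for result in results:
            # Same containment test as the check: the match starts inside this event
            if (
                cumulative_char_count
                <= result.start
                < cumulative_char_count + event_length
            ):
                findings.setdefault(index, []).append(
                    (result.start - cumulative_char_count, result.entity_type)
                )
        cumulative_char_count += event_length + 1  # +1 for the joining '\n'
    return findings


events = ["user logged in", "card=4111111111111111", "ok"]
print(map_offsets_to_events(events, [FakeResult(start=20, entity_type="CREDIT_CARD")]))
# -> {1: [(5, 'CREDIT_CARD')]}: the match falls in events[1], 5 characters in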