From 6774cce09995cb874bab32bd0c980b4e271c5093 Mon Sep 17 00:00:00 2001 From: Jan-Philipp Litza Date: Tue, 4 Jan 2022 11:04:03 +0100 Subject: [PATCH] Support multiple regexp matches per line By using the match group's indices, this also avoids replacing unrelated chunks of the line. --- anonip.py | 47 ++++++++++++++++++++++++++++++----------------- tests.py | 6 ++++++ 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/anonip.py b/anonip.py index 727f422..d51c150 100755 --- a/anonip.py +++ b/anonip.py @@ -180,6 +180,35 @@ def process_ip(self, ip): ) return trunc_ip + def process_regex_match(self, match): + """ + This function processes a single regex match. + + It returns the anonymized match as string and can be called with re.sub. + + :param match: re.Match + :return: str + """ + ret = [] + last_pos = 0 + + for i, g in enumerate(match.groups(), start=1): + if not g: + continue + ip_str, ip = self.extract_ip(g) + replacement = ( + self.process_ip(ip) if ip + else self.replace or g + ) + ret.extend(( + match.group(0)[last_pos:match.start(i) - match.start(0)], + str(replacement), + )) + last_pos = match.end(i) - match.start(0) + + ret.append(match[0][last_pos:]) + return "".join(ret) + def process_line_regex(self, line): """ This function processes a single line based on the provided regex. @@ -189,23 +218,7 @@ def process_line_regex(self, line): :param line: str :return: str """ - match = re.match(self.regex, line) - if not match: - logger.debug("Regex did not match!") - return line - groups = match.groups() - - for m in set(groups): - if not m: - continue - ip_str, ip = self.extract_ip(m) - if ip: - trunc_ip = self.process_ip(ip) - line = line.replace(ip_str, str(trunc_ip)) - elif self.replace: - line = line.replace(m, self.replace) - - return line + return re.sub(self.regex, self.process_regex_match, line) def process_line_column(self, line): """ diff --git a/tests.py b/tests.py index d54fac1..4f73a4a 100755 --- a/tests.py +++ b/tests.py @@ -140,6 +140,12 @@ def test_column(line, columns, expected): '3.3.0.0 - - [20/May/2015:21:05:01 +0000] "GET /723.3.3.357 HTTP/1.1" 200 13358 "-" "useragent"', None, ), + ( + '3.3.3.3 - - [20/May/2015:21:05:01 +0000] "GET /723.3.3.357 HTTP/1.1" 200 13358 "-" "useragent [ip:1.2.3.4]"', + re.compile(r"\b([0-9a-fA-F][0-9a-fA-F:\.]*|::[0-9a-fA-F:\.]+)\b"), + '3.3.0.0 - - [20/May/2015:21:05:01 +0000] "GET /723.3.3.357 HTTP/1.1" 200 13358 "-" "useragent [ip:1.2.0.0]"', + None, + ), ( "blabla/ 3.3.3.3 /blublu", re.compile(r"^blabla/ ([^,]+) /blublu"),