From 602d67e2ddaf51b752cf096fcd70a90ce6bc7ab3 Mon Sep 17 00:00:00 2001
From: "Kamil Mankowski (kam193)"
Date: Wed, 18 Sep 2024 21:45:50 +0200
Subject: [PATCH] Extended support for attributions

---
 file-similarity/VERSION            |  2 +-
 file-similarity/service/al_run.py  | 23 +++++++++++++----
 file-similarity/service/helpers.py |  4 ++-
 file-similarity/service/updater.py | 40 +++++++++++++++++++++---------
 4 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/file-similarity/VERSION b/file-similarity/VERSION
index 45a4fb7..ec63514 100644
--- a/file-similarity/VERSION
+++ b/file-similarity/VERSION
@@ -1 +1 @@
-8
+9
diff --git a/file-similarity/service/al_run.py b/file-similarity/service/al_run.py
index a464c74..b008423 100644
--- a/file-similarity/service/al_run.py
+++ b/file-similarity/service/al_run.py
@@ -56,7 +56,15 @@ def _load_tlsh_data_from_csv(self, path: str):
             for row in reader:
                 t = tlsh.Tlsh()
                 t.fromTlshStr(row["tlsh"])
-                self.tlsh_data[row["file_type"]].add(TLSHData(t, row["reference"], row.get("attribution.campaign", "")))
+                self.tlsh_data[row["file_type"]].add(
+                    TLSHData(
+                        t,
+                        row["reference"],
+                        row.get("attribution.campaign", "").split(","),
+                        row.get("attribution.family", "").split(","),
+                        row.get("attribution.actor", "").split(","),
+                    )
+                )
                 hashes_count += 1
 
         self.log.info(f"Loaded {hashes_count} TLSH hashes for {len(self.tlsh_data)} extensions")
@@ -151,12 +159,17 @@ def execute(self, request: ServiceRequest) -> None:
         for similar in similars:
             similar: TLSHResult
             main_section.add_line(f"({similar.distance}) {similar.similar_to.hash.hexdigest()}")
-            main_section.add_line(f" {similar.similar_to.reference}")
-            if similar.similar_to.campaigns:
-                main_section.add_tag("attribution.campaign", similar.similar_to.campaigns)
+            main_section.add_line(f"  {similar.similar_to.reference}")
+            for campaign in similar.similar_to.campaigns or []:
+                main_section.add_tag("attribution.campaign", campaign)
+            for family in similar.similar_to.families or []:
+                main_section.add_tag("attribution.family", family)
+            for actor in similar.similar_to.actors or []:
+                main_section.add_tag("attribution.actor", actor)
+
         main_section.set_heuristic(
             HEURISTIC_BY_SEVERITY[severity],
-            signature=f"similarity/tlsh/{severity.value}",
+            signature=f"file-similarity.{severity.value}",
         )
 
         result.add_section(main_section)
diff --git a/file-similarity/service/helpers.py b/file-similarity/service/helpers.py
index da2114e..98d6250 100644
--- a/file-similarity/service/helpers.py
+++ b/file-similarity/service/helpers.py
@@ -9,7 +9,9 @@
 class TLSHData:
     hash: tlsh.Tlsh
     reference: str
-    campaigns: str = None
+    campaigns: list[str] = None
+    families: list[str] = None
+    actors: list[str] = None
 
     def get_distance(self, hash: tlsh.Tlsh):
         return self.hash.diff(hash)
diff --git a/file-similarity/service/updater.py b/file-similarity/service/updater.py
index 7102f6d..779f82b 100644
--- a/file-similarity/service/updater.py
+++ b/file-similarity/service/updater.py
@@ -13,7 +13,14 @@
 
 BADLIST_NAME = "Badlist"
 BADLIST_QUERY = "hashes.tlsh:* AND enabled:true"
-HEADERS = ["tlsh", "file_type", "reference", "attribution.campaign"]
+HEADERS = [
+    "tlsh",
+    "file_type",
+    "reference",
+    "attribution.campaign",
+    "attribution.family",
+    "attribution.actor",
+]
 
 HASH_FILE_NAME = "hashes.csv"
 
@@ -50,9 +57,10 @@ def _load_hashes_set(self, file_path: str) -> set[str]:
         return hashes
 
     def _describe_source(self, source: Source) -> str:
-        reason = (
-            f" ({self._safe_get(source, 'reason')})" if self._safe_get(source, "reason") else ""
-        )
+        reason = ", ".join(self._safe_get(source, "reason") or [])
+        if reason:
+            reason = f" ({reason})"
+
         return f"{self._safe_get(source, 'name') or ''}{reason}"
 
     def _update_badlist(self):
@@ -69,9 +77,9 @@ def _update_badlist(self):
         hashes = set()
         self.push_status("UPDATING", "Pulling currently badlisted files..")
         # TODO: streaming results and configurable limit
-        results: Iterable[Badlist] = self.datastore.badlist.search(BADLIST_QUERY, rows=10000).get(
-            "items", []
-        )
+        results: Iterable[Badlist] = self.datastore.badlist.search(
+            BADLIST_QUERY, fl="*", rows=10000
+        ).get("items", [])
 
         with tempfile.TemporaryDirectory() as tmpdir, open(f"{tmpdir}/{HASH_FILE_NAME}", "w+") as f:
             writer = csv.DictWriter(f, fieldnames=HEADERS)
@@ -88,23 +96,31 @@ def _update_badlist(self):
                 try:
                     t.fromTlshStr(result.hashes.tlsh)
                 except ValueError:
-                    self.log.warning(
-                        "Invalid TLSH hash found in Badlist [%s]", result.hashes.tlsh, exc_info=True
-                    )
+                    # self.log.warning(
+                    #     "Invalid TLSH hash found in Badlist [%s]", result.hashes.tlsh, exc_info=True
+                    # )
                     continue
                 if result.hashes.tlsh in hashes:
                     continue
                 hashes.add(result.hashes.tlsh)
 
                 sources = self._safe_get(result, "sources") or []
-                reference = f"Marked by ({len(sources)}: {', '.join(self._describe_source(source) for source in sources)})"
+                reference = (
+                    f"Marked by {', '.join(self._describe_source(source) for source in sources)}"
+                )
+                self.log.info(self._safe_get(result, "attribution"))
+                self.log.info(result.as_primitives())
                 campaigns = self._safe_get(result, "attribution.campaign")
+                family = self._safe_get(result, "attribution.family")
+                actor = self._safe_get(result, "attribution.actor")
                 writer.writerow(
                     {
                         "tlsh": result.hashes.tlsh,
                         "file_type": type_,
                         "reference": reference,
-                        "attribution.campaign": campaigns,
+                        "attribution.campaign": ",".join(campaigns) if campaigns else None,
+                        "attribution.family": ",".join(family) if family else None,
+                        "attribution.actor": ",".join(actor) if actor else None,
                     }
                 )
        self.log.info(f"Loaded {len(hashes)} TLSH hashes")