From 874f8dee4db6c1ed9988e8591c00e0fbebef4d9e Mon Sep 17 00:00:00 2001
From: Jean-Louis Huynen
Date: Mon, 5 Feb 2024 16:11:49 +0100
Subject: [PATCH] chg: [library] bloomfilter absclass, checking entries against PSS files

---
 private_search_set/bloom_filter_base.py |  28 +++++++
 private_search_set/bloom_filter_dcso.py |  26 ++++++
 private_search_set/cli.py               |  23 +++++-
 private_search_set/main.py              | 105 +++++++++++++++++++++---
 4 files changed, 168 insertions(+), 14 deletions(-)
 create mode 100644 private_search_set/bloom_filter_base.py
 create mode 100644 private_search_set/bloom_filter_dcso.py

diff --git a/private_search_set/bloom_filter_base.py b/private_search_set/bloom_filter_base.py
new file mode 100644
index 0000000..1e0f33b
--- /dev/null
+++ b/private_search_set/bloom_filter_base.py
@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+
+class BloomFilterBase(ABC):
+    @abstractmethod
+    def __init__(self, parameters):
+        """Initialize the Bloom filter with given parameters."""
+        # Set on the instance: callers read .loaded even if load() never ran
+        self.loaded = False
+
+    @abstractmethod
+    def add(self, data):
+        """Add data to the Bloom filter."""
+        pass
+
+    @abstractmethod
+    def check(self, data):
+        """Return whether data is (probably) present in the Bloom filter."""
+        pass
+
+    @abstractmethod
+    def load(self, fd):
+        """Load a Bloom filter from the open file object fd."""
+        pass
+
+    @abstractmethod
+    def write(self, fd):
+        """Write the serialized bloom filter to a file descriptor."""
+        pass
diff --git a/private_search_set/bloom_filter_dcso.py b/private_search_set/bloom_filter_dcso.py
new file mode 100644
index 0000000..5f678fe
--- /dev/null
+++ b/private_search_set/bloom_filter_dcso.py
@@ -0,0 +1,26 @@
+from flor import BloomFilter
+from private_search_set.bloom_filter_base import BloomFilterBase
+
+class BloomFilterDCSO(BloomFilterBase):
+    def __init__(self, parameters):
+        super().__init__(parameters)
+        self.bf = BloomFilter(n=parameters['capacity'], p=parameters['fp-probability'])
+
+    def add(self, data):
+        self.bf.add(data)
+        pass
+
+    def check(self, data):
+        return data in self.bf
+
+    def load(self, fd):
+        self.bf.read(fd)
+        if self.bf.N == 0:
+            self.loaded = False
+        else:
+            self.loaded = True
+        pass
+
+    def write(self, fd):
+        self.bf.write(fd)
+        pass
\ No newline at end of file
diff --git a/private_search_set/cli.py b/private_search_set/cli.py
index 03c9d9e..8dae371 100644
--- a/private_search_set/cli.py
+++ b/private_search_set/cli.py
@@ -1,4 +1,6 @@
 import click
+
+import pdb
 from private_search_set.main import PrivateSearchSet
 
 @click.pass_context
@@ -8,18 +10,31 @@ def ingest_stdin(ctx):
     pss.ingest_stdin()
     pss.write_to_files(ctx.params["pss_home"])
 
+@click.pass_context
+def check_stdin(ctx):
+    pss = ctx.obj
+    pss.check_stdin()
 
 @click.command()
 @click.option('--pss-home', required=True, type=click.Path(exists=False) , help='PSS working folder.')
-@click.option('--json-file', required=True, type=click.Path(exists=True), help='Path to the PSS JSON file.')
-@click.option('--ingest', required=True, type=click.BOOL , help='ingest stdin to PSS file')
+@click.option('--json-file', required=False, type=click.Path(exists=True), help='Path to the PSS JSON file.')
+@click.option('--ingest', required=True, type=click.BOOL , help='ingest stdin to PSS files')
 @click.option('--debug/--no-debug', default=False)
 @click.pass_context
 def cli(ctx, json_file, pss_home, ingest, debug):
-    ctx.obj = PrivateSearchSet.load_from_json_specs(json_file)
-    ctx.obj.init_filter()
+    if json_file and pss_home:
+        # TODO maybe allow conversion in the future
+        raise ValueError("Please provide either a json-file or a pss-home, not both.")
+    # If a json-file with PSS metadata is provided, load the PSS from the JSON file
+    if json_file:
+        ctx.obj = PrivateSearchSet.load_from_json_specs(json_file)
+    # If pss_home is provided, load the PSS from the files in the folder
+    if pss_home:
+        ctx.obj = PrivateSearchSet.load_from_pss_home(pss_home)
     if ingest:
         ingest_stdin()
+    else:
+        check_stdin()
     pass
 
 def main():
diff --git a/private_search_set/main.py b/private_search_set/main.py
index 4f879da..b7da7ce 100644
--- a/private_search_set/main.py
+++ b/private_search_set/main.py
@@ -2,7 +2,7 @@ import os
 import sys
 import hashlib
 
-from flor import BloomFilter
+from private_search_set.bloom_filter_dcso import BloomFilterDCSO
 
 class PrivateSearchSet:
     def __init__(self, algorithm, bloomfilter, canonicalization_format, description, generated_timestamp, keyid, misp_attribute_types, version):
@@ -31,26 +31,111 @@ def load_from_json_specs(json_file):
         data = {k.replace('-', '_'): v for k, v in json_data.items()}
         pss = PrivateSearchSet(**data) # Create an instance of the PrivateSearchSet class
         if set(data.keys()) == set(pss.__dict__.keys()):
-            PrivateSearchSet.print_private_search_set(pss)
+            # PrivateSearchSet.print_private_search_set(pss)
+            pss.init_filter_and_set()
             return pss
         else:
             raise ValueError("JSON file does not match the expected format.")
 
-    def init_filter(self):
+    def load_from_pss_home(pss_home):
+        # Fail loudly on a missing folder or JSON file instead of returning None
+        file_path = os.path.join(pss_home, 'private-search-set.json')
+        if not os.path.exists(file_path):
+            raise ValueError("No JSON file found in the PSS home.")
+        pss = PrivateSearchSet.load_from_json_specs(file_path)
+        # The bloom filter and the hash set are both optional on disk
+        file_path = os.path.join(pss_home, 'private-search-set.bloom')
+        pss.load_bf_from_file(file_path)
+        file_path = os.path.join(pss_home, 'private-search-set.pss')
+        pss._ps = pss.load_pss_from_file(file_path)
+        return pss
+
+    def load_bf_from_file(self, file_path):
+        if os.path.exists(file_path):
+            with open(file_path, 'rb') as f:
+                self._bf.load(f)
+
+    def load_pss_from_file(self, file_path):
+        if os.path.exists(file_path):
+            with open(file_path, 'r') as f:
+                return set(f.read().splitlines())
+        else:
+            return None
+
+    def init_filter_and_set(self):
+        # init bloom filter
         if self.bloomfilter['format'] == 'dcso-v1':
-            self._bf = BloomFilter(n=self.bloomfilter['capacity'], p=self.bloomfilter['fp-probability'])
+            self._bf = BloomFilterDCSO(self.bloomfilter)
         else:
             raise ValueError("Bloomfilter format not supported.")
+
+        # init the private search set
+        self._ps = set()
 
     def ingest_stdin(self):
         # Read bytes from stdin
-        self._ps = set()
         for line in sys.stdin.buffer.read().splitlines():
-            if self.bloomfilter['format'] == 'dcso-v1':
-                self._bf.add(line)
-            if self.algorithm == 'Blake2':
-                # TODO use a salt
-                self._ps.add(hashlib.blake2b(line, key=self.keyid.encode()).hexdigest())
+            self.ingest(line)
+
+    def ingest(self, data):
+        # Keyed-hash the data (hashlib.blake2b with key=), so only digests
+        # -- never the raw input -- reach the set and the bloom filter
+        if self.algorithm == 'Blake2':
+            # TODO Use a salt
+            # TODO We use the keyid as the key for the time being
+            hashed_string = hashlib.blake2b(data, key=self.keyid.encode()).hexdigest()
+            hashed_bytes = hashed_string.encode()
+        else:
+            raise ValueError("HMAC algorithm not supported.")
+
+        # add the string digest to the private search set
+        self._ps.add(hashed_string)
+        # add the utf8 encoded bytes representation of the hexdigest to the bloom filter
+        if self.bloomfilter['format'] == 'dcso-v1':
+            self._bf.add(hashed_bytes)
+
+    def check_stdin(self):
+        # Read bytes from stdin
+        for line in sys.stdin.buffer.read().splitlines():
+            # check hashset in priority
+            if self._ps is not None:
+                if self.check_pss(line):
+                    print(line)
+            elif self._bf.loaded:
+                if self.check_bf(line):
+                    print(line)
+            else:
+                raise ValueError("No private search set or bloom filter loaded.")
+
+    def check_pss(self, data):
+        # Keyed-hash the data exactly as ingest() does, then look the hex
+        # digest up in the private search set
+        if self.algorithm == 'Blake2':
+            # TODO Use a salt
+            # TODO We use the keyid as the key for the time being
+            hashed_string = hashlib.blake2b(data, key=self.keyid.encode()).hexdigest()
+        else:
+            raise ValueError("HMAC algorithm not supported.")
+        if hashed_string in self._ps:
+            return True
+        else:
+            return False
+
+    def check_bf(self, data):
+        # HMAC the data
+        hashed_bytes = b''
+        if self.algorithm == 'Blake2':
+            # TODO Use a salt
+            # TODO We use the keyid as the key for the time being
+            hashed_bytes = hashlib.blake2b(data, key=self.keyid.encode()).hexdigest().encode()
+        else:
+            raise ValueError("HMAC algorithm not supported.")
+
+        if self.bloomfilter['format'] == 'dcso-v1':
+            return self._bf.check(hashed_bytes)
+        else:
+            raise ValueError("Bloomfilter format not supported.")
+
 
     def write_to_files(self, pss_home):
         if not os.path.exists(pss_home):