From d2d06a09b8201d05feecd9e9cd3167f6bb6d3045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20Vask=C3=B3?= Date: Wed, 30 Nov 2022 00:31:38 +0100 Subject: [PATCH] hyperscan: replacing wrapper --- .github/actions/setup-dependencies/action.yml | 4 +- Dockerfile | 2 - default.nix | 16 ----- poetry.lock | 33 ++++----- pyproject.toml | 2 +- tests/test_finder.py | 26 +++---- unblob/finder.py | 72 +++++++------------ unblob/handlers/compression/bzip2.py | 45 +++++------- unblob/handlers/compression/xz.py | 48 +++++-------- 9 files changed, 88 insertions(+), 160 deletions(-) diff --git a/.github/actions/setup-dependencies/action.yml b/.github/actions/setup-dependencies/action.yml index 378e3bcd92..80b532df97 100644 --- a/.github/actions/setup-dependencies/action.yml +++ b/.github/actions/setup-dependencies/action.yml @@ -1,13 +1,13 @@ inputs: python-version: - description: 'Python version to setup' + description: "Python version to setup" required: false default: 3.8 runs: using: "composite" steps: - name: Install 3rd party from apt - run: sudo apt install e2fsprogs p7zip-full unar zlib1g-dev liblzo2-dev lz4 lzop lziprecover img2simg libhyperscan5 libhyperscan-dev zstd + run: sudo apt install e2fsprogs p7zip-full unar zlib1g-dev liblzo2-dev lz4 lzop lziprecover img2simg zstd shell: bash - name: Install sasquatch diff --git a/Dockerfile b/Dockerfile index 1d3c96fc0d..26cb285d00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,8 +21,6 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ xz-utils \ zlib1g-dev \ libmagic1 \ - libhyperscan5 \ - libhyperscan-dev \ zstd RUN curl -L -o sasquatch_1.0_amd64.deb https://github.com/onekey-sec/sasquatch/releases/download/sasquatch-v1.0/sasquatch_1.0_amd64.deb \ && dpkg -i sasquatch_1.0_amd64.deb \ diff --git a/default.nix b/default.nix index c0628e13b3..49c7c4920e 100644 --- a/default.nix +++ b/default.nix @@ -16,7 +16,6 @@ , simg2img , unar , file -, hyperscan , zstd }: @@ -72,21 +71,6 @@ let ''; }); - hyperscan = super.hyperscan.overridePythonAttrs (_: { - buildInputs = [ - hyperscan - self.poetry - self.setuptools - ]; - nativeBuildInputs = [ - pkg-config - ]; - - installPhase = '' - ${self.python.pythonForBuild.interpreter} -m pip install --no-build-isolation --no-index --prefix=$out --ignore-installed --no-dependencies --no-cache . - ''; - }); - arpy = overrideWithSetuptools super.arpy { }; yaffshiv = overrideWithSetuptools super.yaffshiv { }; ubi-reader = overrideWithSetuptools super.ubi-reader { }; diff --git a/poetry.lock b/poetry.lock index 73356815d5..81a7efe2da 100644 --- a/poetry.lock +++ b/poetry.lock @@ -138,14 +138,6 @@ python-dateutil = ">=2.8.1" [package.extras] dev = ["wheel", "flake8", "markdown", "twine"] -[[package]] -name = "hyperscan" -version = "0.2.0" -description = "Python bindings for Hyperscan." -category = "main" -optional = false -python-versions = ">=3.6.1,<4.0" - [[package]] name = "identify" version = "2.5.3" @@ -443,6 +435,17 @@ python-versions = ">=3.6.8" [package.extras] diagrams = ["railroad-diagrams", "jinja2"] +[[package]] +name = "pyperscan" +version = "0.1.0" +description = "" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.extras] +test = ["mypy (>=0.991,<1.0)", "pytest (>=7.0.0,<7.1.0)"] + [[package]] name = "pyright" version = "0.0.12" @@ -724,7 +727,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "8dd6bf0e0f37c704f198f69a44bf6f771cf389bc3d5870a4ab3ab93c9bc317fe" +content-hash = "1417d2dc68de89b38cda34dcbf0e4af4e2b8a196f160f01be502aef010a51648" [metadata.files] arpy = [ @@ -829,13 +832,6 @@ ghp-import = [ {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, ] -hyperscan = [ - {file = "hyperscan-0.2.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:47ef10b4297f9976d257b7f260ae4ae8834e87e1abb7f46cf0707ba496fb6e49"}, - {file = "hyperscan-0.2.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:78e97de71896b9fda4368c185e6609e53bb240c85302909ac46e738f14621f40"}, - {file = "hyperscan-0.2.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:794eecc13fa9bcf061004340582aab342471fb22b710f92020e3ea508776ff53"}, - {file = "hyperscan-0.2.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:fd0d0fe64484443b9e5ee1e8b156a30f2ba91494b1343fae5c943130ff847607"}, - {file = "hyperscan-0.2.0.tar.gz", hash = "sha256:10cb8939d7db85d522ed319031ff5ab86fd0133126b986290f01aa83dbfb9ff7"}, -] identify = [ {file = "identify-2.5.3-py2.py3-none-any.whl", hash = "sha256:25851c8c1370effb22aaa3c987b30449e9ff0cece408f810ae6ce408fdd20893"}, {file = "identify-2.5.3.tar.gz", hash = "sha256:887e7b91a1be152b0d46bbf072130235a8117392b9f1828446079a816a05ef44"}, @@ -1016,6 +1012,11 @@ pyparsing = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, ] +pyperscan = [ + {file = "pyperscan-0.1.0-cp38-abi3-macosx_10_9_x86_64.macosx_10_9_arm64.macosx_10_9_universal2.whl", hash = "sha256:d036c3465e6d7e47dec2c494a7aa3f888d4749156b57738fdae174d34951ba1c"}, + {file = "pyperscan-0.1.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1497035ece80a10ac0faae3f9122ff73c5d2eff20af8c7353b9a964695779fb8"}, + {file = "pyperscan-0.1.0.tar.gz", hash = "sha256:cd557e2e97a3a99cacaba1a022fc47a23832da33a1be46cad8b49d4c244e4945"}, +] pyright = [ {file = "pyright-0.0.12-py3-none-any.whl", hash = "sha256:2c829291e23589ce52f2051e9c46830bf067a620caef95818200fa5d0cc2527f"}, {file = "pyright-0.0.12.tar.gz", hash = "sha256:a80003bc10e6400ee235b77bf30f87cb442611e8bd6f496fe4afd4ee44cda580"}, diff --git a/pyproject.toml b/pyproject.toml index 255ad95cd4..09fa3de118 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ yaffshiv = { git = "https://github.com/onekey-sec/yaffshiv.git", rev = "a8f21283 plotext = "^4.1.5" pluggy = "^1.0.0" python-magic = "^0.4.27" -hyperscan = "0.2.0" +pyperscan = "^0.1.0" lark = "^1.1.2" lz4 = "^4.0.0" lief = "^0.12.3" diff --git a/tests/test_finder.py b/tests/test_finder.py index 8dbd47117e..4d098effe0 100644 --- a/tests/test_finder.py +++ b/tests/test_finder.py @@ -1,5 +1,6 @@ import attr import pytest +from pyperscan import Scan from unblob.file_utils import InvalidInputFormat from unblob.finder import build_hyperscan_database, search_chunks @@ -58,33 +59,26 @@ def calculate_chunk(self, file, start_offset: int): def test_build_hyperscan_database(): - db, handler_map = build_hyperscan_database((TestHandlerA, TestHandlerB)) + db = build_hyperscan_database((TestHandlerA, TestHandlerB)) matches = [] - db.scan( - [bytearray(b"A123456789BB")], - match_event_handler=lambda pattern_id, start, end, flags, m: m.append( - (pattern_id, start, end) - ), - context=matches, - ) - assert len(handler_map) == 3 + def on_match(m, pattern_id, start, end): + m.append((pattern_id, start, end)) + return Scan.Continue + + db.build(matches, on_match).scan(b"A123456789BB") assert len(matches) == 2 - assert isinstance(handler_map[matches[0][0]], TestHandlerA) - assert isinstance(handler_map[matches[1][0]], TestHandlerB) assert matches[0][1] == 0 assert matches[1][1] == 10 def test_db_and_handler_map_instances_are_cached(): - db1, handler_map1 = build_hyperscan_database((TestHandlerA, TestHandlerB)) - db2, handler_map2 = build_hyperscan_database((TestHandlerA, TestHandlerB)) - db3, handler_map3 = build_hyperscan_database((TestHandlerA,)) + db1 = build_hyperscan_database((TestHandlerA, TestHandlerB)) + db2 = build_hyperscan_database((TestHandlerA, TestHandlerB)) + db3 = build_hyperscan_database((TestHandlerA,)) assert db1 is db2 - assert handler_map1 is handler_map2 assert db1 is not db3 - assert handler_map1 is not handler_map3 def test_invalid_hexstring_pattern_raises(): diff --git a/unblob/finder.py b/unblob/finder.py index 7c9aba5d15..3bf5a3f808 100644 --- a/unblob/finder.py +++ b/unblob/finder.py @@ -2,12 +2,11 @@ Searching Chunk related functions. The main "entry point" is search_chunks_by_priority. """ -from enum import Flag from functools import lru_cache -from typing import Dict, List, Optional, Tuple +from typing import List, Optional import attr -import hyperscan +from pyperscan import BlockDatabase, Flag, Pattern, Scan from structlog import get_logger from .file_utils import InvalidInputFormat, SeekError @@ -21,18 +20,12 @@ @attr.define class HyperscanMatchContext: - handler_map: Dict[int, Handler] file: File file_size: int all_chunks: List task_result: TaskResult -class _HyperscanScan(Flag): - Continue = False - Terminate = True - - def _calculate_chunk( handler: Handler, file: File, real_offset, task_result: TaskResult ) -> Optional[ValidChunk]: @@ -74,13 +67,12 @@ def _calculate_chunk( def _hyperscan_match( - pattern_id: int, offset: int, end: int, flags: int, context: HyperscanMatchContext -) -> _HyperscanScan: - handler = context.handler_map[pattern_id] + context: HyperscanMatchContext, handler: Handler, offset: int, end: int +) -> Scan: real_offset = offset + handler.PATTERN_MATCH_OFFSET if real_offset < 0: - return _HyperscanScan.Continue + return Scan.Continue # Skip chunk calculation if this would start inside another one, # similar to remove_inner_chunks, but before we even begin calculating. @@ -91,7 +83,7 @@ def _hyperscan_match( offset=real_offset, _verbosity=2, ) - return _HyperscanScan.Continue + return Scan.Continue logger.debug( "Calculating chunk for pattern match", @@ -104,11 +96,11 @@ def _hyperscan_match( # We found some random bytes this handler couldn't parse if chunk is None: - return _HyperscanScan.Continue + return Scan.Continue if chunk.end_offset > context.file_size: logger.debug("Chunk overflows file", chunk=chunk, _verbosity=2) - return _HyperscanScan.Continue + return Scan.Continue chunk.handler = handler logger.debug("Found valid chunk", chunk=chunk, handler=handler.NAME, _verbosity=2) @@ -117,9 +109,9 @@ def _hyperscan_match( # Terminate scan if we match till the end of the file if chunk.end_offset == context.file_size: logger.debug("Chunk covers till end of the file", chunk=chunk) - return _HyperscanScan.Terminate + return Scan.Terminate - return _HyperscanScan.Continue + return Scan.Continue def search_chunks( # noqa: C901 @@ -135,33 +127,28 @@ def search_chunks( # noqa: C901 """ all_chunks = [] - hyperscan_db, handler_map = build_hyperscan_database(handlers) + hyperscan_db = build_hyperscan_database(handlers) hyperscan_context = HyperscanMatchContext( - handler_map=handler_map, file=file, file_size=file_size, all_chunks=all_chunks, task_result=task_result, ) + scanner = hyperscan_db.build(hyperscan_context, _hyperscan_match) + try: - hyperscan_db.scan( - [file], - match_event_handler=_hyperscan_match, - context=hyperscan_context, - ) - except hyperscan.error as e: - if e.args and e.args[0] == f"error code {hyperscan.HS_SCAN_TERMINATED}": + if scanner.scan(file) == Scan.Terminate: logger.debug( "Scanning terminated as chunk matches till end of file", ) return all_chunks - else: - logger.error( - "Error scanning for patterns", - error=e, - ) + except Exception as e: + logger.error( + "Error scanning for patterns", + error=e, + ) logger.debug( "Ended searching for chunks", @@ -172,21 +159,18 @@ def search_chunks( # noqa: C901 @lru_cache -def build_hyperscan_database(handlers: Handlers) -> Tuple[hyperscan.Database, Dict]: - db = hyperscan.Database(mode=hyperscan.HS_MODE_VECTORED) - handler_map = dict() - - pattern_id = 0 +def build_hyperscan_database(handlers: Handlers): patterns = [] for handler_class in handlers: handler = handler_class() for pattern in handler.PATTERNS: try: patterns.append( - ( + Pattern( pattern.as_regex(), - pattern_id, - hyperscan.HS_FLAG_SOM_LEFTMOST | hyperscan.HS_FLAG_DOTALL, + Flag.SOM_LEFTMOST, + Flag.DOTALL, + tag=handler, ) ) except InvalidHexString as e: @@ -197,10 +181,4 @@ def build_hyperscan_database(handlers: Handlers) -> Tuple[hyperscan.Database, Di error=str(e), ) raise - handler_map[pattern_id] = handler - pattern_id += 1 - - expressions, ids, flags = zip(*patterns) - db.compile(expressions=expressions, ids=ids, elements=len(patterns), flags=flags) - - return db, handler_map + return BlockDatabase(*patterns) diff --git a/unblob/handlers/compression/bzip2.py b/unblob/handlers/compression/bzip2.py index a3f2092b37..58a1245185 100644 --- a/unblob/handlers/compression/bzip2.py +++ b/unblob/handlers/compression/bzip2.py @@ -1,7 +1,7 @@ from typing import Optional import attr -import hyperscan +from pyperscan import BlockDatabase, Flag, Pattern, Scan from structlog import get_logger from unblob.extractors import Command @@ -54,20 +54,9 @@ def build_stream_end_scan_db(pattern_list): - patterns = [] - for pattern_id, pattern in enumerate(pattern_list): - patterns.append( - ( - pattern.as_regex(), - pattern_id, - hyperscan.HS_FLAG_SOM_LEFTMOST | hyperscan.HS_FLAG_DOTALL, - ) - ) - - expressions, ids, flags = zip(*patterns) - db = hyperscan.Database(mode=hyperscan.HS_MODE_VECTORED) - db.compile(expressions=expressions, ids=ids, elements=len(patterns), flags=flags) - return db + return BlockDatabase( + *(Pattern(p.as_regex(), Flag.SOM_LEFTMOST, Flag.DOTALL) for p in pattern_list) + ) hyperscan_stream_end_magic_db = build_stream_end_scan_db(STREAM_END_MAGIC_PATTERNS) @@ -104,11 +93,11 @@ def _validate_block_header(file: File): def _hyperscan_match( - pattern_id: int, offset: int, end: int, flags: int, context: Bzip2SearchContext -) -> bool: + context: Bzip2SearchContext, pattern_id: int, offset: int, end: int +) -> Scan: # Ignore any match before the start of this chunk if offset < context.start_offset: - return False + return Scan.Continue last_block_end = offset + STREAM_FOOTER_SIZE if pattern_id > 3: @@ -118,16 +107,16 @@ def _hyperscan_match( try: context.file.seek(last_block_end) except SeekError: - return True + return Scan.Terminate context.end_block_offset = last_block_end # Check if there is a next stream starting after the end of this stream # and try to continue processing that as well if _validate_stream_header(context.file) and _validate_block_header(context.file): - return False + return Scan.Continue else: - return True + return Scan.Terminate class BZip2Handler(Handler): @@ -148,16 +137,14 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] context = Bzip2SearchContext( start_offset=start_offset, file=file, end_block_offset=-1 ) + try: - hyperscan_stream_end_magic_db.scan( - [file], match_event_handler=_hyperscan_match, context=context + hyperscan_stream_end_magic_db.build(context, _hyperscan_match).scan(file) + except Exception as e: + logger.debug( + "Error scanning for bzip2 patterns", + error=e, ) - except hyperscan.error as e: - if e.args and e.args[0] != f"error code {hyperscan.HS_SCAN_TERMINATED}": - logger.debug( - "Error scanning for bzip2 patterns", - error=e, - ) if context.end_block_offset > 0: return ValidChunk( diff --git a/unblob/handlers/compression/xz.py b/unblob/handlers/compression/xz.py index b0cda505c3..f44448c414 100644 --- a/unblob/handlers/compression/xz.py +++ b/unblob/handlers/compression/xz.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple import attr -import hyperscan +from pyperscan import BlockDatabase, Flag, Pattern, Scan from structlog import get_logger from unblob.extractors import Command @@ -52,20 +52,9 @@ def build_stream_end_scan_db(pattern_list): - patterns = [] - for pattern_id, pattern in enumerate(pattern_list): - patterns.append( - ( - pattern.as_regex(), - pattern_id, - hyperscan.HS_FLAG_SOM_LEFTMOST | hyperscan.HS_FLAG_DOTALL, - ) - ) - - expressions, ids, flags = zip(*patterns) - db = hyperscan.Database(mode=hyperscan.HS_MODE_VECTORED) - db.compile(expressions=expressions, ids=ids, elements=len(patterns), flags=flags) - return db + return BlockDatabase( + *(Pattern(p.as_regex(), Flag.SOM_LEFTMOST, Flag.DOTALL) for p in pattern_list) + ) hyperscan_stream_end_magic_db = build_stream_end_scan_db(STREAM_END_MAGIC_PATTERNS) @@ -136,21 +125,21 @@ def get_stream_size(footer_offset: int, file: File) -> int: def _hyperscan_match( - pattern_id: int, offset: int, end: int, flags: int, context: XZSearchContext -) -> bool: + context: XZSearchContext, pattern_id: int, offset: int, end: int +) -> Scan: # if we matched before our start offset, continue looking end_offset = offset + FLAG_LEN + EOS_MAGIC_LEN if end_offset < context.start_offset: - return False + return Scan.Continue try: stream_size = get_stream_size(offset, context.file) except InvalidInputFormat: - return False + return Scan.Continue # stream_size does not match, we continue our search if stream_size != (end_offset - context.start_offset): - return False + return Scan.Continue # stream padding validation # padding MUST contain only null bytes and be 4 bytes aligned @@ -159,7 +148,7 @@ def _hyperscan_match( padding_size = end_padding_offset - end_offset if padding_size % 4 != 0: context.end_streams_offset = end_offset - return False + return Scan.Continue # next magic validation context.end_streams_offset = end_padding_offset @@ -167,8 +156,8 @@ def _hyperscan_match( magic = context.file.read(len(STREAM_START_MAGIC)) if magic == STREAM_START_MAGIC: context.start_offset = end_padding_offset - return False - return True + return Scan.Continue + return Scan.Terminate class XZHandler(Handler): @@ -193,15 +182,12 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] ) try: - hyperscan_stream_end_magic_db.scan( - [file], match_event_handler=_hyperscan_match, context=context + hyperscan_stream_end_magic_db.build(context, _hyperscan_match).scan(file) + except Exception as e: + logger.debug( + "Error scanning for xz patterns", + error=e, ) - except hyperscan.error as e: - if e.args and e.args[0] != f"error code {hyperscan.HS_SCAN_TERMINATED}": - logger.debug( - "Error scanning for xz patterns", - error=e, - ) if context.end_streams_offset > 0: return ValidChunk( start_offset=start_offset, end_offset=context.end_streams_offset