Skip to content

Commit

Permalink
Merge pull request #475 from onekey-sec/pyperscan
Browse files (browse the repository at this point in the history)
hyperscan: replacing wrapper
  • Loading branch information
qkaiser authored Dec 5, 2022
2 parents 414bb7a + d2d06a0 commit 4982c33
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 160 deletions.
4 changes: 2 additions & 2 deletions .github/actions/setup-dependencies/action.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
inputs:
python-version:
description: 'Python version to setup'
description: "Python version to setup"
required: false
default: 3.8
runs:
using: "composite"
steps:
- name: Install 3rd party from apt
run: sudo apt install e2fsprogs p7zip-full unar zlib1g-dev liblzo2-dev lz4 lzop lziprecover img2simg libhyperscan5 libhyperscan-dev zstd
run: sudo apt install e2fsprogs p7zip-full unar zlib1g-dev liblzo2-dev lz4 lzop lziprecover img2simg zstd
shell: bash

- name: Install sasquatch
Expand Down
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
xz-utils \
zlib1g-dev \
libmagic1 \
libhyperscan5 \
libhyperscan-dev \
zstd
RUN curl -L -o sasquatch_1.0_amd64.deb https://github.com/onekey-sec/sasquatch/releases/download/sasquatch-v1.0/sasquatch_1.0_amd64.deb \
&& dpkg -i sasquatch_1.0_amd64.deb \
Expand Down
16 changes: 0 additions & 16 deletions default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
, simg2img
, unar
, file
, hyperscan
, zstd
}:

Expand Down Expand Up @@ -72,21 +71,6 @@ let
'';
});

hyperscan = super.hyperscan.overridePythonAttrs (_: {
buildInputs = [
hyperscan
self.poetry
self.setuptools
];
nativeBuildInputs = [
pkg-config
];

installPhase = ''
${self.python.pythonForBuild.interpreter} -m pip install --no-build-isolation --no-index --prefix=$out --ignore-installed --no-dependencies --no-cache .
'';
});

arpy = overrideWithSetuptools super.arpy { };
yaffshiv = overrideWithSetuptools super.yaffshiv { };
ubi-reader = overrideWithSetuptools super.ubi-reader { };
Expand Down
33 changes: 17 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ yaffshiv = { git = "https://github.com/onekey-sec/yaffshiv.git", rev = "a8f21283
plotext = "^4.1.5"
pluggy = "^1.0.0"
python-magic = "^0.4.27"
hyperscan = "0.2.0"
pyperscan = "^0.1.0"
lark = "^1.1.2"
lz4 = "^4.0.0"
lief = "^0.12.3"
Expand Down
26 changes: 10 additions & 16 deletions tests/test_finder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import attr
import pytest
from pyperscan import Scan

from unblob.file_utils import InvalidInputFormat
from unblob.finder import build_hyperscan_database, search_chunks
Expand Down Expand Up @@ -58,33 +59,26 @@ def calculate_chunk(self, file, start_offset: int):


def test_build_hyperscan_database():
db, handler_map = build_hyperscan_database((TestHandlerA, TestHandlerB))
db = build_hyperscan_database((TestHandlerA, TestHandlerB))
matches = []
db.scan(
[bytearray(b"A123456789BB")],
match_event_handler=lambda pattern_id, start, end, flags, m: m.append(
(pattern_id, start, end)
),
context=matches,
)

assert len(handler_map) == 3
def on_match(m, pattern_id, start, end):
m.append((pattern_id, start, end))
return Scan.Continue

db.build(matches, on_match).scan(b"A123456789BB")

assert len(matches) == 2
assert isinstance(handler_map[matches[0][0]], TestHandlerA)
assert isinstance(handler_map[matches[1][0]], TestHandlerB)
assert matches[0][1] == 0
assert matches[1][1] == 10


def test_db_and_handler_map_instances_are_cached():
db1, handler_map1 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db2, handler_map2 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db3, handler_map3 = build_hyperscan_database((TestHandlerA,))
db1 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db2 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db3 = build_hyperscan_database((TestHandlerA,))
assert db1 is db2
assert handler_map1 is handler_map2
assert db1 is not db3
assert handler_map1 is not handler_map3


def test_invalid_hexstring_pattern_raises():
Expand Down
72 changes: 25 additions & 47 deletions unblob/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@
Searching Chunk related functions.
The main "entry point" is search_chunks_by_priority.
"""
from enum import Flag
from functools import lru_cache
from typing import Dict, List, Optional, Tuple
from typing import List, Optional

import attr
import hyperscan
from pyperscan import BlockDatabase, Flag, Pattern, Scan
from structlog import get_logger

from .file_utils import InvalidInputFormat, SeekError
Expand All @@ -21,18 +20,12 @@

@attr.define
class HyperscanMatchContext:
handler_map: Dict[int, Handler]
file: File
file_size: int
all_chunks: List
task_result: TaskResult


class _HyperscanScan(Flag):
Continue = False
Terminate = True


def _calculate_chunk(
handler: Handler, file: File, real_offset, task_result: TaskResult
) -> Optional[ValidChunk]:
Expand Down Expand Up @@ -74,13 +67,12 @@ def _calculate_chunk(


def _hyperscan_match(
pattern_id: int, offset: int, end: int, flags: int, context: HyperscanMatchContext
) -> _HyperscanScan:
handler = context.handler_map[pattern_id]
context: HyperscanMatchContext, handler: Handler, offset: int, end: int
) -> Scan:
real_offset = offset + handler.PATTERN_MATCH_OFFSET

if real_offset < 0:
return _HyperscanScan.Continue
return Scan.Continue

# Skip chunk calculation if this would start inside another one,
# similar to remove_inner_chunks, but before we even begin calculating.
Expand All @@ -91,7 +83,7 @@ def _hyperscan_match(
offset=real_offset,
_verbosity=2,
)
return _HyperscanScan.Continue
return Scan.Continue

logger.debug(
"Calculating chunk for pattern match",
Expand All @@ -104,11 +96,11 @@ def _hyperscan_match(

# We found some random bytes this handler couldn't parse
if chunk is None:
return _HyperscanScan.Continue
return Scan.Continue

if chunk.end_offset > context.file_size:
logger.debug("Chunk overflows file", chunk=chunk, _verbosity=2)
return _HyperscanScan.Continue
return Scan.Continue

chunk.handler = handler
logger.debug("Found valid chunk", chunk=chunk, handler=handler.NAME, _verbosity=2)
Expand All @@ -117,9 +109,9 @@ def _hyperscan_match(
# Terminate scan if we match till the end of the file
if chunk.end_offset == context.file_size:
logger.debug("Chunk covers till end of the file", chunk=chunk)
return _HyperscanScan.Terminate
return Scan.Terminate

return _HyperscanScan.Continue
return Scan.Continue


def search_chunks( # noqa: C901
Expand All @@ -135,33 +127,28 @@ def search_chunks( # noqa: C901
"""
all_chunks = []

hyperscan_db, handler_map = build_hyperscan_database(handlers)
hyperscan_db = build_hyperscan_database(handlers)

hyperscan_context = HyperscanMatchContext(
handler_map=handler_map,
file=file,
file_size=file_size,
all_chunks=all_chunks,
task_result=task_result,
)

scanner = hyperscan_db.build(hyperscan_context, _hyperscan_match)

try:
hyperscan_db.scan(
[file],
match_event_handler=_hyperscan_match,
context=hyperscan_context,
)
except hyperscan.error as e:
if e.args and e.args[0] == f"error code {hyperscan.HS_SCAN_TERMINATED}":
if scanner.scan(file) == Scan.Terminate:
logger.debug(
"Scanning terminated as chunk matches till end of file",
)
return all_chunks
else:
logger.error(
"Error scanning for patterns",
error=e,
)
except Exception as e:
logger.error(
"Error scanning for patterns",
error=e,
)

logger.debug(
"Ended searching for chunks",
Expand All @@ -172,21 +159,18 @@ def search_chunks( # noqa: C901


@lru_cache
def build_hyperscan_database(handlers: Handlers) -> Tuple[hyperscan.Database, Dict]:
db = hyperscan.Database(mode=hyperscan.HS_MODE_VECTORED)
handler_map = dict()

pattern_id = 0
def build_hyperscan_database(handlers: Handlers):
patterns = []
for handler_class in handlers:
handler = handler_class()
for pattern in handler.PATTERNS:
try:
patterns.append(
(
Pattern(
pattern.as_regex(),
pattern_id,
hyperscan.HS_FLAG_SOM_LEFTMOST | hyperscan.HS_FLAG_DOTALL,
Flag.SOM_LEFTMOST,
Flag.DOTALL,
tag=handler,
)
)
except InvalidHexString as e:
Expand All @@ -197,10 +181,4 @@ def build_hyperscan_database(handlers: Handlers) -> Tuple[hyperscan.Database, Di
error=str(e),
)
raise
handler_map[pattern_id] = handler
pattern_id += 1

expressions, ids, flags = zip(*patterns)
db.compile(expressions=expressions, ids=ids, elements=len(patterns), flags=flags)

return db, handler_map
return BlockDatabase(*patterns)
Loading

0 comments on commit 4982c33

Please sign in to comment.