Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

hyperscan: replacing wrapper #475

Merged
merged 1 commit into from
Dec 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/actions/setup-dependencies/action.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
inputs:
python-version:
description: 'Python version to setup'
description: "Python version to setup"
required: false
default: 3.8
runs:
using: "composite"
steps:
- name: Install 3rd party from apt
run: sudo apt install e2fsprogs p7zip-full unar zlib1g-dev liblzo2-dev lz4 lzop lziprecover img2simg libhyperscan5 libhyperscan-dev zstd
run: sudo apt install e2fsprogs p7zip-full unar zlib1g-dev liblzo2-dev lz4 lzop lziprecover img2simg zstd
shell: bash

- name: Install sasquatch
Expand Down
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
xz-utils \
zlib1g-dev \
libmagic1 \
libhyperscan5 \
libhyperscan-dev \
zstd
RUN curl -L -o sasquatch_1.0_amd64.deb https://github.com/onekey-sec/sasquatch/releases/download/sasquatch-v1.0/sasquatch_1.0_amd64.deb \
&& dpkg -i sasquatch_1.0_amd64.deb \
Expand Down
16 changes: 0 additions & 16 deletions default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
, simg2img
, unar
, file
, hyperscan
, zstd
}:

Expand Down Expand Up @@ -72,21 +71,6 @@ let
'';
});

hyperscan = super.hyperscan.overridePythonAttrs (_: {
buildInputs = [
hyperscan
self.poetry
self.setuptools
];
nativeBuildInputs = [
pkg-config
];

installPhase = ''
${self.python.pythonForBuild.interpreter} -m pip install --no-build-isolation --no-index --prefix=$out --ignore-installed --no-dependencies --no-cache .
'';
});

arpy = overrideWithSetuptools super.arpy { };
yaffshiv = overrideWithSetuptools super.yaffshiv { };
ubi-reader = overrideWithSetuptools super.ubi-reader { };
Expand Down
33 changes: 17 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ yaffshiv = { git = "https://github.com/onekey-sec/yaffshiv.git", rev = "a8f21283
plotext = "^4.1.5"
pluggy = "^1.0.0"
python-magic = "^0.4.27"
hyperscan = "0.2.0"
pyperscan = "^0.1.0"
lark = "^1.1.2"
lz4 = "^4.0.0"
lief = "^0.12.3"
Expand Down
26 changes: 10 additions & 16 deletions tests/test_finder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import attr
import pytest
from pyperscan import Scan

from unblob.file_utils import InvalidInputFormat
from unblob.finder import build_hyperscan_database, search_chunks
Expand Down Expand Up @@ -58,33 +59,26 @@ def calculate_chunk(self, file, start_offset: int):


def test_build_hyperscan_database():
db, handler_map = build_hyperscan_database((TestHandlerA, TestHandlerB))
db = build_hyperscan_database((TestHandlerA, TestHandlerB))
matches = []
db.scan(
[bytearray(b"A123456789BB")],
match_event_handler=lambda pattern_id, start, end, flags, m: m.append(
(pattern_id, start, end)
),
context=matches,
)

assert len(handler_map) == 3
def on_match(m, pattern_id, start, end):
m.append((pattern_id, start, end))
return Scan.Continue

db.build(matches, on_match).scan(b"A123456789BB")

assert len(matches) == 2
assert isinstance(handler_map[matches[0][0]], TestHandlerA)
assert isinstance(handler_map[matches[1][0]], TestHandlerB)
assert matches[0][1] == 0
assert matches[1][1] == 10


def test_db_and_handler_map_instances_are_cached():
db1, handler_map1 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db2, handler_map2 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db3, handler_map3 = build_hyperscan_database((TestHandlerA,))
db1 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db2 = build_hyperscan_database((TestHandlerA, TestHandlerB))
db3 = build_hyperscan_database((TestHandlerA,))
assert db1 is db2
assert handler_map1 is handler_map2
assert db1 is not db3
assert handler_map1 is not handler_map3


def test_invalid_hexstring_pattern_raises():
Expand Down
72 changes: 25 additions & 47 deletions unblob/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@
Searching Chunk related functions.
The main "entry point" is search_chunks_by_priority.
"""
from enum import Flag
from functools import lru_cache
from typing import Dict, List, Optional, Tuple
from typing import List, Optional

import attr
import hyperscan
from pyperscan import BlockDatabase, Flag, Pattern, Scan
from structlog import get_logger

from .file_utils import InvalidInputFormat, SeekError
Expand All @@ -21,18 +20,12 @@

@attr.define
class HyperscanMatchContext:
handler_map: Dict[int, Handler]
file: File
file_size: int
all_chunks: List
task_result: TaskResult


class _HyperscanScan(Flag):
Continue = False
Terminate = True


def _calculate_chunk(
handler: Handler, file: File, real_offset, task_result: TaskResult
) -> Optional[ValidChunk]:
Expand Down Expand Up @@ -74,13 +67,12 @@ def _calculate_chunk(


def _hyperscan_match(
pattern_id: int, offset: int, end: int, flags: int, context: HyperscanMatchContext
) -> _HyperscanScan:
handler = context.handler_map[pattern_id]
context: HyperscanMatchContext, handler: Handler, offset: int, end: int
) -> Scan:
real_offset = offset + handler.PATTERN_MATCH_OFFSET

if real_offset < 0:
return _HyperscanScan.Continue
return Scan.Continue

# Skip chunk calculation if this would start inside another one,
# similar to remove_inner_chunks, but before we even begin calculating.
Expand All @@ -91,7 +83,7 @@ def _hyperscan_match(
offset=real_offset,
_verbosity=2,
)
return _HyperscanScan.Continue
return Scan.Continue

logger.debug(
"Calculating chunk for pattern match",
Expand All @@ -104,11 +96,11 @@ def _hyperscan_match(

# We found some random bytes this handler couldn't parse
if chunk is None:
return _HyperscanScan.Continue
return Scan.Continue

if chunk.end_offset > context.file_size:
logger.debug("Chunk overflows file", chunk=chunk, _verbosity=2)
return _HyperscanScan.Continue
return Scan.Continue

chunk.handler = handler
logger.debug("Found valid chunk", chunk=chunk, handler=handler.NAME, _verbosity=2)
Expand All @@ -117,9 +109,9 @@ def _hyperscan_match(
# Terminate scan if we match till the end of the file
if chunk.end_offset == context.file_size:
logger.debug("Chunk covers till end of the file", chunk=chunk)
return _HyperscanScan.Terminate
return Scan.Terminate

return _HyperscanScan.Continue
return Scan.Continue


def search_chunks( # noqa: C901
Expand All @@ -135,33 +127,28 @@ def search_chunks( # noqa: C901
"""
all_chunks = []

hyperscan_db, handler_map = build_hyperscan_database(handlers)
hyperscan_db = build_hyperscan_database(handlers)

hyperscan_context = HyperscanMatchContext(
handler_map=handler_map,
file=file,
file_size=file_size,
all_chunks=all_chunks,
task_result=task_result,
)

scanner = hyperscan_db.build(hyperscan_context, _hyperscan_match)

try:
hyperscan_db.scan(
[file],
match_event_handler=_hyperscan_match,
context=hyperscan_context,
)
except hyperscan.error as e:
if e.args and e.args[0] == f"error code {hyperscan.HS_SCAN_TERMINATED}":
if scanner.scan(file) == Scan.Terminate:
logger.debug(
"Scanning terminated as chunk matches till end of file",
)
return all_chunks
else:
logger.error(
"Error scanning for patterns",
error=e,
)
except Exception as e:
logger.error(
"Error scanning for patterns",
error=e,
)

logger.debug(
"Ended searching for chunks",
Expand All @@ -172,21 +159,18 @@ def search_chunks( # noqa: C901


@lru_cache
def build_hyperscan_database(handlers: Handlers) -> Tuple[hyperscan.Database, Dict]:
db = hyperscan.Database(mode=hyperscan.HS_MODE_VECTORED)
handler_map = dict()

pattern_id = 0
def build_hyperscan_database(handlers: Handlers):
patterns = []
for handler_class in handlers:
handler = handler_class()
for pattern in handler.PATTERNS:
try:
patterns.append(
(
Pattern(
pattern.as_regex(),
pattern_id,
hyperscan.HS_FLAG_SOM_LEFTMOST | hyperscan.HS_FLAG_DOTALL,
Flag.SOM_LEFTMOST,
Flag.DOTALL,
tag=handler,
)
)
except InvalidHexString as e:
Expand All @@ -197,10 +181,4 @@ def build_hyperscan_database(handlers: Handlers) -> Tuple[hyperscan.Database, Di
error=str(e),
)
raise
handler_map[pattern_id] = handler
pattern_id += 1

expressions, ids, flags = zip(*patterns)
db.compile(expressions=expressions, ids=ids, elements=len(patterns), flags=flags)

return db, handler_map
return BlockDatabase(*patterns)
Loading