Skip to content

Commit

Permalink
Merge pull request #18 from glmcdona/glmcdona/improve_tests
Browse files Browse the repository at this point in the history
Improve unit tests, require min_chars be 1 or larger
  • Loading branch information
glmcdona authored Jul 12, 2023
2 parents 080a585 + aa4172e commit 63e7f6c
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 1 deletion.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup, Extension, find_packages

__version__ = "0.1.8"
__version__ = "0.1.11"

ext_modules = [
Pybind11Extension("binary2strings",
Expand Down
6 changes: 6 additions & 0 deletions src/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ PYBIND11_MODULE(binary2strings, m)

if(info.ndim != 1)
throw std::runtime_error("Only 1-dimensional arrays are supported");

if(min_chars < 1)
throw std::runtime_error("min_chars must be 1 or larger");

const unsigned char *data = reinterpret_cast<const unsigned char *>(info.ptr);

Expand All @@ -28,6 +31,9 @@ PYBIND11_MODULE(binary2strings, m)

if(info.ndim != 1)
throw std::runtime_error("Only 1-dimensional arrays are supported");

if(min_chars < 1)
throw std::runtime_error("min_chars must be 1 or larger");

const unsigned char *data = reinterpret_cast<const unsigned char *>(info.ptr);
return try_extract_string_tuple(data, info.shape[0], 0, min_chars, only_interesting);
Expand Down
106 changes: 106 additions & 0 deletions tests/test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest
# import our `pybind11`-based extension module
import binary2strings
import os
import sys

class MainTest(unittest.TestCase):
def test_extract_all_ascii_unicode(self):
Expand Down Expand Up @@ -129,6 +131,110 @@ def test_is_interesting(self):
self.assertEqual(result[i][0], interesting_strings[i])
self.assertEqual(result[i][3], True)

def test_min_chars(self):
    """Check that ``min_chars`` gates extraction of 5-character strings.

    For each ``min_chars`` from 2 through 9, a run of five identical
    characters ("a" and "한") is encoded as UTF-8 and as UTF-16 (with the
    leading 2-byte BOM removed) and handed to both ``extract_all_strings``
    and ``extract_string``. The original string is expected back while
    ``min_chars <= 5``; once ``min_chars`` exceeds 5, ``extract_all_strings``
    must return an empty result and ``extract_string`` an empty string in
    position 0.
    """
    for min_chars in range(2, 10):
        # Every sample is exactly 5 characters long, so 5 is the cut-off.
        should_match = min_chars <= 5

        for test_string in ["a"*5, "한"*5]:
            for encoding in ["utf-8", "utf-16"]:
                encoded = bytes(test_string, encoding)
                # Python's "utf-16" codec emits a 2-byte BOM first; drop it
                # so only the character data is fed to the extractor.
                data = encoded[2:] if encoding == "utf-16" else encoded

                # Failure message shared by every assertion in this iteration.
                description = f"min_chars={min_chars}, test_string={test_string}, encoding={encoding}, hex={data.hex()}"

                # extract_all_strings: first hit's text must be the sample,
                # or there must be no hits at all.
                result = binary2strings.extract_all_strings(data, min_chars=min_chars)
                if not should_match:
                    self.assertEqual(len(result), 0, description)
                else:
                    self.assertEqual(result[0][0], test_string, description)

                # extract_string: element 0 is the text, "" when nothing matched.
                result = binary2strings.extract_string(data, min_chars=min_chars)
                expected_text = test_string if should_match else ""
                self.assertEqual(result[0], expected_text, description)

def test_corner_cases(self):
    """Exercise degenerate inputs against both extraction entry points.

    Covers, in order: a run of ten ASCII "0" digits with default arguments
    (exactly one result expected); then, for ``min_chars`` in 1, 2, 10 and
    1000: buffers of five nulls, one null and zero bytes (nothing expected),
    the two-byte string ``b"aa"`` (one result only while ``min_chars <= 2``),
    and single characters "a" / "한" in UTF-8 and UTF-16 (one result only
    while ``min_chars <= 1``).
    """
    string = b"0000000000"
    result = binary2strings.extract_all_strings(string)
    self.assertEqual(len(result), 1, f"result={result}")

    for min_chars in [1,2,10,1000]:
        # String of nulls, a single null, and the empty string: none of
        # these may yield a string from either entry point.
        for string in (b"\x00\x00\x00\x00\x00", b"\x00", b""):
            result = binary2strings.extract_all_strings(string, min_chars=min_chars)
            self.assertEqual(len(result), 0, f"min_chars={min_chars}, result={result}")
            result = binary2strings.extract_string(string, min_chars=min_chars)
            self.assertEqual(len(result[0]), 0, f"min_chars={min_chars}, result={result}")

        # Short string: two characters are reported only when min_chars allows it.
        string = b"aa"
        result = binary2strings.extract_all_strings(string, min_chars=min_chars)
        expected_count = 1 if min_chars <= 2 else 0
        self.assertEqual(len(result), expected_count, f"min_chars={min_chars}, result={result}")

        # Single character test: one character is reported only for min_chars == 1.
        expected_single = 1 if min_chars <= 1 else 0
        for string in ['a', "한"]:
            for encoding in ["utf-8", "utf-16"]:
                data = bytes(string, encoding)
                # NOTE(review): for "utf-16" this call still sees the BOM;
                # it is only stripped before extract_string below. Confirm
                # that is intentional.
                result = binary2strings.extract_all_strings(data, min_chars=min_chars)
                self.assertEqual(len(result), expected_single, f"min_chars={min_chars}, result={result}")

                # Strip the BOM if present
                if encoding == "utf-16":
                    data = data[2:]

                result = binary2strings.extract_string(data, min_chars=min_chars)
                self.assertEqual(len(result[0]), expected_single, f"min_chars={min_chars}, result={result}")

@unittest.skipIf(sys.platform != "win32", "requires Windows")
def test_bulk_files(self):
    """Smoke-test ``extract_all_strings`` on real executables under %windir%.

    Walks the directory named by the ``windir`` environment variable,
    reads each file whose name ends in ".exe", and runs
    ``extract_all_strings`` on its bytes. Every readable executable must
    yield at least one string. After ``N_FILES`` executables the
    accumulated string count must exceed 800000 and the test ends. If the
    walk finishes before ``N_FILES`` executables were processed, the test
    fails.

    Files that cannot be opened or read (``OSError``) are skipped, so one
    locked or access-restricted system file does not error the whole test.
    """
    # Extract strings from exe's in %windir% to test for crashes
    N_FILES = 200 # Number of exe's to extract strings from for testing
    n_files = 0
    n_strings = 0
    for root, _dirs, files in os.walk(os.environ["windir"]):
        for file in files:
            if not file.endswith(".exe"):
                continue

            path = os.path.join(root, file)
            try:
                with open(path, "rb") as f:
                    data = f.read()
            except OSError:
                # An unreadable file says nothing about the extractor; move on.
                continue

            result = binary2strings.extract_all_strings(data)
            self.assertGreater(len(result), 0, path)

            n_files += 1
            n_strings += len(result)
            if n_files >= N_FILES:
                # Actual value locally: 1,583,765
                self.assertGreater(n_strings, 800000, "Not enough strings extracted")
                return

    # Reaching this point means the walk ran out of executables early.
    self.fail("Not enough files tested")



if __name__ == '__main__':
Expand Down

0 comments on commit 63e7f6c

Please sign in to comment.