From c06557cb5ba7d3c9cf67e7e376fc9cbdac1c951d Mon Sep 17 00:00:00 2001 From: Geoff McDonald Date: Tue, 11 Jul 2023 20:55:41 -0700 Subject: [PATCH 1/3] Add check requiring min_chars >= 1. --- src/bindings.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/bindings.cpp b/src/bindings.cpp index 75acba9..766a210 100644 --- a/src/bindings.cpp +++ b/src/bindings.cpp @@ -14,6 +14,9 @@ PYBIND11_MODULE(binary2strings, m) if(info.ndim != 1) throw std::runtime_error("Only 1-dimensional arrays are supported"); + + if(min_chars < 1) + throw std::runtime_error("min_chars must be 1 or larger"); const unsigned char *data = reinterpret_cast(info.ptr); @@ -28,6 +31,9 @@ PYBIND11_MODULE(binary2strings, m) if(info.ndim != 1) throw std::runtime_error("Only 1-dimensional arrays are supported"); + + if(min_chars < 1) + throw std::runtime_error("min_chars must be 1 or larger"); const unsigned char *data = reinterpret_cast(info.ptr); return try_extract_string_tuple(data, info.shape[0], 0, min_chars, only_interesting); From 1ee14b75fb6ee513f790834186037dc198c8e8af Mon Sep 17 00:00:00 2001 From: Geoff McDonald Date: Tue, 11 Jul 2023 21:20:19 -0700 Subject: [PATCH 2/3] Improve unit tests. 
--- tests/test.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/tests/test.py b/tests/test.py index 5439fdd..aa12dac 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,6 +1,8 @@ import unittest # import our `pybind11`-based extension module import binary2strings +import os +import sys class MainTest(unittest.TestCase): def test_extract_all_ascii_unicode(self): @@ -129,6 +131,110 @@ def test_is_interesting(self): self.assertEqual(result[i][0], interesting_strings[i]) self.assertEqual(result[i][3], True) + def test_min_chars(self): + for min_chars in range(2,10): + for test_string in ["a"*5, "한"*5]: + for encoding in ["utf-8", "utf-16"]: + data = bytes(test_string, encoding) + + # Strip the BOM if present + if encoding == "utf-16": + data = data[2:] + + description = f"min_chars={min_chars}, test_string={test_string}, encoding={encoding}, hex={data.hex()}" + + # Test extraction of single strings + result = binary2strings.extract_all_strings(data, min_chars=min_chars) + if min_chars <= 5: + self.assertEqual(result[0][0], test_string, description) + else: + self.assertEqual(len(result), 0, description) + + # Same test for extract_string + result = binary2strings.extract_string(data, min_chars=min_chars) + if min_chars <= 5: + self.assertEqual(result[0], test_string, description) + else: + self.assertEqual(result[0], "", description) + + def test_corner_cases(self): + string = b"0000000000" + result = binary2strings.extract_all_strings(string) + self.assertEqual(len(result), 1, f"result={result}") + + for min_chars in [1,2,10,1000]: + # String of nulls + string = b"\x00\x00\x00\x00\x00" + result = binary2strings.extract_all_strings(string, min_chars=min_chars) + self.assertEqual(len(result), 0, f"min_chars={min_chars}, result={result}") + result = binary2strings.extract_string(string, min_chars=min_chars) + self.assertEqual(len(result[0]), 0, f"min_chars={min_chars}, result={result}") + + string = b"\x00" + 
result = binary2strings.extract_all_strings(string, min_chars=min_chars) + self.assertEqual(len(result), 0, f"min_chars={min_chars}, result={result}") + result = binary2strings.extract_string(string, min_chars=min_chars) + self.assertEqual(len(result[0]), 0, f"min_chars={min_chars}, result={result}") + + # Empty string + string = b"" + result = binary2strings.extract_all_strings(string, min_chars=min_chars) + self.assertEqual(len(result), 0, f"min_chars={min_chars}, result={result}") + result = binary2strings.extract_string(string, min_chars=min_chars) + self.assertEqual(len(result[0]), 0, f"min_chars={min_chars}, result={result}") + + # Short string + string = b"aa" + result = binary2strings.extract_all_strings(string, min_chars=min_chars) + if min_chars <= 2: + self.assertEqual(len(result), 1, f"min_chars={min_chars}, result={result}") + else: + self.assertEqual(len(result), 0, f"min_chars={min_chars}, result={result}") + + # Single character test + for string in ['a', "한"]: + for encoding in ["utf-8", "utf-16"]: + data = bytes(string, encoding) + result = binary2strings.extract_all_strings(data, min_chars=min_chars) + if min_chars <= 1: + self.assertEqual(len(result), 1, f"min_chars={min_chars}, result={result}") + else: + self.assertEqual(len(result), 0, f"min_chars={min_chars}, result={result}") + + # Strip the BOM if present + if encoding == "utf-16": + data = data[2:] + + result = binary2strings.extract_string(data, min_chars=min_chars) + if min_chars <= 1: + self.assertEqual(len(result[0]), 1, f"min_chars={min_chars}, result={result}") + else: + self.assertEqual(len(result[0]), 0, f"min_chars={min_chars}, result={result}") + + @unittest.skipIf(sys.platform != "win32", "requires Windows") + def test_bulk_files(self): + # Extract strings from exe's in %windir% to test for crashes + N_FILES = 200 # Number of exe's to extract strings from for testing + n_files = 0 + n_strings = 0 + for root, dirs, files in os.walk(os.environ["windir"]): + for file in files: + 
if file.endswith(".exe"): + path = os.path.join(root, file) + with open(path, "rb") as f: + data = f.read() + result = binary2strings.extract_all_strings(data) + self.assertGreater(len(result), 0, path) + + n_files += 1 + n_strings += len(result) + if n_files >= N_FILES: + # Actual value locally: 1,583,765 + self.assertGreater(n_strings, 800000, "Not enough strings extracted") + return + + self.assertEqual(True, False, "Not enough files tested") + if __name__ == '__main__': From aa4172e1ab27256844432d0903afeefb872a348d Mon Sep 17 00:00:00 2001 From: Geoff McDonald Date: Tue, 11 Jul 2023 21:21:04 -0700 Subject: [PATCH 3/3] Version bump. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fe02e6f..a1101b3 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from pybind11.setup_helpers import Pybind11Extension, build_ext from setuptools import setup, Extension, find_packages -__version__ = "0.1.8" +__version__ = "0.1.11" ext_modules = [ Pybind11Extension("binary2strings",