Add index_chunk_size option to CLI
RecRanger committed Jun 5, 2024
1 parent 73f983a commit 37f585b
Showing 7 changed files with 32 additions and 9 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "used_addr_check"
version = "0.1.4"
version = "0.1.5"
authors = [
{ name="RecRanger", email="[email protected]" },
]
2 changes: 1 addition & 1 deletion src/used_addr_check/__init__.py
@@ -1,4 +1,4 @@
__VERSION__ = "0.1.4"
__VERSION__ = "0.1.5"
__AUTHOR__ = "RecRanger"

from .index_create import ( # noqa F401
17 changes: 16 additions & 1 deletion src/used_addr_check/cli.py
@@ -3,6 +3,7 @@
from used_addr_check.index_create import load_or_generate_index
from used_addr_check.index_search import search_multiple_in_file
from used_addr_check.scan_file import scan_file_for_used_addresses
+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE

import argparse
import sys
@@ -20,6 +21,14 @@ def main_cli():
action="store_true",
help="Print version to stdout and exit",
)
+parser.add_argument(
+"-i",
+"--index-chunk-size",
+dest="index_chunk_size",
+type=int,
+default=DEFAULT_INDEX_CHUNK_SIZE,
+help="Size of chunks to store in the parquet index file",
+)
subparsers = parser.add_subparsers(dest="command")

# # Subparser for the 'download' command
@@ -108,15 +117,21 @@ def main_cli():
load_or_generate_index(
haystack_file_path=Path(args.haystack_file_path),
force_recreate=True,
+index_chunk_size=args.index_chunk_size,
)
elif args.command == "search":
-search_multiple_in_file(Path(args.haystack_file_path), args.needles)
+search_multiple_in_file(
+Path(args.haystack_file_path),
+args.needles,
+index_chunk_size=args.index_chunk_size,
+)
# elif args.command == "download":
# download_list(Path(args.output_path))
elif args.command == "scan_file":
scan_file_for_used_addresses(
Path(args.haystack_file_path),
Path(args.needle_haystack_file_path),
+index_chunk_size=args.index_chunk_size,
)
else:
parser.print_help()
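For orientation, here is a hedged sketch of how the new option might be exercised end-to-end. Because `--index-chunk-size` is registered on the top-level parser, it must appear before the subcommand; the `search` subcommand and the `haystack_file_path` / `needles` destinations appear in this diff, but their exact positional form is collapsed out of it and is assumed below.

```python
# Hedged sketch only: drives main_cli() with a synthetic argv.
# How "search" accepts its haystack path and needles (assumed positional
# here) is defined in code not shown in this diff.
import sys

from used_addr_check.cli import main_cli

sys.argv = [
    "used_addr_check",
    "--index-chunk-size", "50000",  # option added in this commit (default 10_000)
    "search",
    "haystack_addresses_sorted.txt",            # assumed: haystack_file_path
    "1ExampleNeedleAddressXXXXXXXXXXXXXXXXXX",  # assumed: a single needle
]
main_cli()
```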
2 changes: 2 additions & 0 deletions src/used_addr_check/defaults.py
@@ -0,0 +1,2 @@
+# Tested 150, 1000, 10_000, 50_000, and found 10_000 to be optimal speed
+DEFAULT_INDEX_CHUNK_SIZE = 10_000
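A rough sketch of how the chunk-size comparison mentioned in that comment could be reproduced; the original benchmark is not part of this commit, and the haystack path and the choice to time only index generation are assumptions.

```python
# Hedged benchmark sketch: times generate_index() at each chunk size
# the comment above mentions. Placeholder file path.
import time
from pathlib import Path

from used_addr_check.index_create import generate_index

haystack = Path("haystack_addresses_sorted.txt")  # placeholder path
for chunk_size in (150, 1_000, 10_000, 50_000):
    start = time.perf_counter()
    generate_index(haystack, index_chunk_size=chunk_size)
    print(f"index_chunk_size={chunk_size:>6}: {time.perf_counter() - start:.2f} s")
```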
5 changes: 3 additions & 2 deletions src/used_addr_check/index_create.py
@@ -7,10 +7,11 @@
from tqdm import tqdm

from used_addr_check.index_types import IndexEntry
+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE


def generate_index(
-haystack_file_path: Path, index_chunk_size: int = 1000
+haystack_file_path: Path, index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE
) -> List[IndexEntry]:
"""
Generates an index for a large sorted text file, storing every
@@ -116,7 +117,7 @@ def load_index_parquet(index_parquet_file_path: Path) -> List[IndexEntry]:

def load_or_generate_index(
haystack_file_path: Path,
-index_chunk_size: int = 1000,
+index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE,
force_recreate: bool = False,
) -> List[IndexEntry]:
"""Attempts to load an index from a file, or generates one if it doesn't,
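The collapsed docstrings above describe a sparse index over a large sorted text file. For intuition, here is a conceptual sketch of that idea; it is not the project's actual generate_index implementation, and IndexEntry's real fields are not shown in this diff, so the (value, byte offset) shape is an assumption.

```python
# Conceptual sketch: record every index_chunk_size-th line of a sorted file
# together with its byte offset, so a later lookup can seek near a needle
# and scan only one chunk instead of the whole file.
from pathlib import Path
from typing import List, NamedTuple


class SparseEntry(NamedTuple):  # stand-in for IndexEntry (assumed shape)
    value: str        # first address of the chunk
    byte_offset: int  # where that line starts in the file


def build_sparse_index(path: Path, index_chunk_size: int = 10_000) -> List[SparseEntry]:
    entries: List[SparseEntry] = []
    offset = 0
    with path.open("rb") as f:
        for line_num, raw in enumerate(f):
            if line_num % index_chunk_size == 0:
                entries.append(SparseEntry(raw.decode().rstrip("\n"), offset))
            offset += len(raw)
    return entries
```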
3 changes: 2 additions & 1 deletion src/used_addr_check/index_search.py
@@ -7,6 +7,7 @@

from used_addr_check.index_create import load_or_generate_index
from used_addr_check.index_types import IndexEntry
+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE


def _binary_search_index(index: List[IndexEntry], needle: str) -> int:
@@ -93,7 +94,7 @@ def search_in_file_with_index(
def search_multiple_in_file(
haystack_file_path: Path | str,
needles: List[str] | str,
-index_chunk_size: int = 1000,
+index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE,
) -> List[str]:
"""
Searches for multiple needle strings in the file by pre-building an index
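With this change, search_multiple_in_file exposes the chunk size directly, so library callers can tune it without going through the CLI. A minimal usage sketch, with placeholder file path and needle value:

```python
# Minimal usage sketch; the haystack path and needle are placeholders.
from pathlib import Path

from used_addr_check.index_search import search_multiple_in_file

matches = search_multiple_in_file(
    Path("haystack_addresses_sorted.txt"),        # placeholder haystack file
    ["1ExampleNeedleAddressXXXXXXXXXXXXXXXXXX"],  # placeholder needle list
    index_chunk_size=50_000,                      # overrides the 10_000 default
)
print(f"{len(matches)} needle(s) found in the haystack")
```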
10 changes: 7 additions & 3 deletions src/used_addr_check/scan_file.py
@@ -6,7 +6,7 @@
from ripgrepy import Ripgrepy, RipGrepNotFound

from used_addr_check.index_search import search_multiple_in_file

+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE

# BITCOIN_ADDR_REGEX = r"[13][a-km-zA-HJ-NP-Z1-9]{25,34}"

@@ -119,7 +119,9 @@ def extract_addresses_from_file(


def scan_file_for_used_addresses(
-haystack_file_path: Path, needle_file_path: Path
+haystack_file_path: Path,
+needle_file_path: Path,
+index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE,
):
"""
Scans a file for bitcoin addresses, and sees which ones have been used.
@@ -151,6 +153,8 @@ def scan_file_for_used_addresses(
)

matched_addresses = search_multiple_in_file(
-haystack_file_path, needle_addresses
+haystack_file_path,
+needles=needle_addresses,
+index_chunk_size=index_chunk_size,
)
logger.info(f"Found {len(matched_addresses):,} used addresses in the file")
