Add index_chunk_size option to CLI
RecRanger committed Jun 5, 2024
1 parent 73f983a commit 37f585b
Showing 7 changed files with 32 additions and 9 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "used_addr_check"
version = "0.1.4"
version = "0.1.5"
authors = [
{ name="RecRanger", email="[email protected]" },
]
2 changes: 1 addition & 1 deletion src/used_addr_check/__init__.py
@@ -1,4 +1,4 @@
__VERSION__ = "0.1.4"
__VERSION__ = "0.1.5"
__AUTHOR__ = "RecRanger"

from .index_create import ( # noqa F401
17 changes: 16 additions & 1 deletion src/used_addr_check/cli.py
@@ -3,6 +3,7 @@
from used_addr_check.index_create import load_or_generate_index
from used_addr_check.index_search import search_multiple_in_file
from used_addr_check.scan_file import scan_file_for_used_addresses
+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE

import argparse
import sys
@@ -20,6 +21,14 @@ def main_cli():
action="store_true",
help="Print version to stdout and exit",
)
+parser.add_argument(
+"-i",
+"--index-chunk-size",
+dest="index_chunk_size",
+type=int,
+default=DEFAULT_INDEX_CHUNK_SIZE,
+help="Size of chunks to store in the parquet index file",
+)
subparsers = parser.add_subparsers(dest="command")

# # Subparser for the 'download' command
@@ -108,15 +117,21 @@ def main_cli():
load_or_generate_index(
haystack_file_path=Path(args.haystack_file_path),
force_recreate=True,
+index_chunk_size=args.index_chunk_size,
)
elif args.command == "search":
-search_multiple_in_file(Path(args.haystack_file_path), args.needles)
+search_multiple_in_file(
+Path(args.haystack_file_path),
+args.needles,
+index_chunk_size=args.index_chunk_size,
+)
# elif args.command == "download":
# download_list(Path(args.output_path))
elif args.command == "scan_file":
scan_file_for_used_addresses(
Path(args.haystack_file_path),
Path(args.needle_haystack_file_path),
+index_chunk_size=args.index_chunk_size,
)
else:
parser.print_help()
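For orientation, here is a hedged sketch of how the new option might be exercised end-to-end. Because `--index-chunk-size` is registered on the top-level parser, it must appear before the subcommand; the `search` subcommand and the `haystack_file_path` / `needles` destinations appear in this diff, but their exact positional form is collapsed out of it and is assumed below.

```python
# Hedged sketch only: drives main_cli() with a synthetic argv.
# How "search" accepts its haystack path and needles (assumed positional
# here) is defined in code not shown in this diff.
import sys

from used_addr_check.cli import main_cli

sys.argv = [
    "used_addr_check",
    "--index-chunk-size", "50000",  # option added in this commit (default 10_000)
    "search",
    "haystack_addresses_sorted.txt",            # assumed: haystack_file_path
    "1ExampleNeedleAddressXXXXXXXXXXXXXXXXXX",  # assumed: a single needle
]
main_cli()
```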
2 changes: 2 additions & 0 deletions src/used_addr_check/defaults.py
@@ -0,0 +1,2 @@
+# Tested 150, 1000, 10_000, 50_000, and found 10_000 to be optimal speed
+DEFAULT_INDEX_CHUNK_SIZE = 10_000
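A rough sketch of how the chunk-size comparison mentioned in that comment could be reproduced; the original benchmark is not part of this commit, and the haystack path and the choice to time only index generation are assumptions.

```python
# Hedged benchmark sketch: times generate_index() at each chunk size
# the comment above mentions. Placeholder file path.
import time
from pathlib import Path

from used_addr_check.index_create import generate_index

haystack = Path("haystack_addresses_sorted.txt")  # placeholder path
for chunk_size in (150, 1_000, 10_000, 50_000):
    start = time.perf_counter()
    generate_index(haystack, index_chunk_size=chunk_size)
    print(f"index_chunk_size={chunk_size:>6}: {time.perf_counter() - start:.2f} s")
```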
5 changes: 3 additions & 2 deletions src/used_addr_check/index_create.py
@@ -7,10 +7,11 @@
from tqdm import tqdm

from used_addr_check.index_types import IndexEntry
+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE


def generate_index(
-haystack_file_path: Path, index_chunk_size: int = 1000
+haystack_file_path: Path, index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE
) -> List[IndexEntry]:
"""
Generates an index for a large sorted text file, storing every
@@ -116,7 +117,7 @@ def load_index_parquet(index_parquet_file_path: Path) -> List[IndexEntry]:

def load_or_generate_index(
haystack_file_path: Path,
-index_chunk_size: int = 1000,
+index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE,
force_recreate: bool = False,
) -> List[IndexEntry]:
"""Attempts to load an index from a file, or generates one if it doesn't,
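The collapsed docstrings above describe a sparse index over a large sorted text file. For intuition, here is a conceptual sketch of that idea; it is not the project's actual generate_index implementation, and IndexEntry's real fields are not shown in this diff, so the (value, byte offset) shape is an assumption.

```python
# Conceptual sketch: record every index_chunk_size-th line of a sorted file
# together with its byte offset, so a later lookup can seek near a needle
# and scan only one chunk instead of the whole file.
from pathlib import Path
from typing import List, NamedTuple


class SparseEntry(NamedTuple):  # stand-in for IndexEntry (assumed shape)
    value: str        # first address of the chunk
    byte_offset: int  # where that line starts in the file


def build_sparse_index(path: Path, index_chunk_size: int = 10_000) -> List[SparseEntry]:
    entries: List[SparseEntry] = []
    offset = 0
    with path.open("rb") as f:
        for line_num, raw in enumerate(f):
            if line_num % index_chunk_size == 0:
                entries.append(SparseEntry(raw.decode().rstrip("\n"), offset))
            offset += len(raw)
    return entries
```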
3 changes: 2 additions & 1 deletion src/used_addr_check/index_search.py
@@ -7,6 +7,7 @@

from used_addr_check.index_create import load_or_generate_index
from used_addr_check.index_types import IndexEntry
+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE


def _binary_search_index(index: List[IndexEntry], needle: str) -> int:
@@ -93,7 +94,7 @@ def search_in_file_with_index(
def search_multiple_in_file(
haystack_file_path: Path | str,
needles: List[str] | str,
-index_chunk_size: int = 1000,
+index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE,
) -> List[str]:
"""
Searches for multiple needle strings in the file by pre-building an index
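With this change, search_multiple_in_file exposes the chunk size directly, so library callers can tune it without going through the CLI. A minimal usage sketch, with placeholder file path and needle value:

```python
# Minimal usage sketch; the haystack path and needle are placeholders.
from pathlib import Path

from used_addr_check.index_search import search_multiple_in_file

matches = search_multiple_in_file(
    Path("haystack_addresses_sorted.txt"),        # placeholder haystack file
    ["1ExampleNeedleAddressXXXXXXXXXXXXXXXXXX"],  # placeholder needle list
    index_chunk_size=50_000,                      # overrides the 10_000 default
)
print(f"{len(matches)} needle(s) found in the haystack")
```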
10 changes: 7 additions & 3 deletions src/used_addr_check/scan_file.py
@@ -6,7 +6,7 @@
from ripgrepy import Ripgrepy, RipGrepNotFound

from used_addr_check.index_search import search_multiple_in_file

+from used_addr_check.defaults import DEFAULT_INDEX_CHUNK_SIZE

# BITCOIN_ADDR_REGEX = r"[13][a-km-zA-HJ-NP-Z1-9]{25,34}"

@@ -119,7 +119,9 @@ def extract_addresses_from_file(


def scan_file_for_used_addresses(
-haystack_file_path: Path, needle_file_path: Path
+haystack_file_path: Path,
+needle_file_path: Path,
+index_chunk_size: int = DEFAULT_INDEX_CHUNK_SIZE,
):
"""
Scans a file for bitcoin addresses, and sees which ones have been used.
@@ -151,6 +153,8 @@ def scan_file_for_used_addresses(
)

matched_addresses = search_multiple_in_file(
-haystack_file_path, needle_addresses
+haystack_file_path,
+needles=needle_addresses,
+index_chunk_size=index_chunk_size,
)
logger.info(f"Found {len(matched_addresses):,} used addresses in the file")
