diff --git a/scrapy_webarchive/cdxj.py b/scrapy_webarchive/cdxj.py
index 7a5b2b1..9dc927b 100644
--- a/scrapy_webarchive/cdxj.py
+++ b/scrapy_webarchive/cdxj.py
@@ -1,20 +1,16 @@
-# based on https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/parser.py
 import json
 import re
+from typing import List
 
 from cdxj_indexer.main import CDXJIndexer
 
+# based on https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/parser.py
 CDXREC = re.compile(
     r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
     r"\s(?P<datetime>(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})(?:\d{3})?)"
     r"\s(?P<data>{.*})"
 )
 
-def write_cdxj_index(output: str, inputs: list[str]) -> str:
-    wacz_indexer = CDXJIndexer(output=output, inputs=inputs)
-    wacz_indexer.process_all()
-    return output
-
 
 class CdxjRecord:
     def _parse(self, line):
@@ -34,3 +30,9 @@ def __init__(self, cdxline):
 
     def __str__(self):
         return str(self.__dict__)
+
+
+def write_cdxj_index(output: str, inputs: List[str]) -> str:
+    wacz_indexer = CDXJIndexer(output=output, inputs=inputs)
+    wacz_indexer.process_all()
+    return output