Skip to content

Commit

Permalink
Make typing compatible with older python versions
Browse files: browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 11, 2024
1 parent 106ebb0 commit 9df5451
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions scrapy_webarchive/cdxj.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,16 @@
# based on https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/parser.py
import json
import re
from typing import List

from cdxj_indexer.main import CDXJIndexer

# based on https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/parser.py
# Compiled pattern for one index line. It is anchored at the start of the
# line only (no trailing ``$``) and expects three fields, each separated by a
# single whitespace character:
#   1. ``surt``     - the key: ``host`` up to the first ")", then an optional
#                     ``path`` and an optional "?"-prefixed ``query``.
#   2. ``datetime`` - 14 digits exposed as ``year`` .. ``second``, optionally
#                     followed by three extra digits (presumably
#                     milliseconds -- TODO confirm).
#   3. ``data``     - a brace-delimited blob (presumably JSON, given the
#                     ``json`` import -- TODO confirm against ``_parse``).
CDXREC = re.compile(
# surt: host (no ")" or whitespace), literal ")", optional path (no "?" or
# whitespace), optional "?" + query (any run of non-whitespace).
r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
# datetime: YYYYMMDDhhmmss with an optional, uncaptured 3-digit suffix.
r"\s(?P<datetime>(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})(?:\d{3})?)"
# data: greedy, so it runs from the first "{" after the timestamp to the
# last "}" on the line.
r"\s(?P<data>{.*})"
)

def write_cdxj_index(output: str, inputs: List[str]) -> str:
    """Run ``CDXJIndexer`` over ``inputs`` and return ``output``.

    Args:
        output: Forwarded to ``CDXJIndexer`` as its ``output`` argument
            (presumably where the index is written -- confirm against
            ``cdxj_indexer``).
        inputs: Forwarded to ``CDXJIndexer`` as its ``inputs`` argument
            (presumably the archive files to index -- confirm against
            ``cdxj_indexer``).

    Returns:
        The ``output`` argument, unchanged.
    """
    # The annotation uses ``typing.List`` rather than ``list[str]``:
    # subscripting the builtin ``list`` in a signature is evaluated at
    # definition time and raises ``TypeError`` on Python < 3.9.
    wacz_indexer = CDXJIndexer(output=output, inputs=inputs)
    wacz_indexer.process_all()
    return output


class CdxjRecord:
def _parse(self, line):
Expand All @@ -34,3 +30,9 @@ def __init__(self, cdxline):

def __str__(self):
return str(self.__dict__)


def write_cdxj_index(output: str, inputs: List[str]) -> str:
    """Index ``inputs`` with a ``CDXJIndexer`` and hand ``output`` back.

    A ``CDXJIndexer`` is constructed with the given ``output`` and ``inputs``
    and its ``process_all()`` method is invoked once. The ``output`` argument
    is then returned as-is so callers can keep using the same value.
    """
    CDXJIndexer(inputs=inputs, output=output).process_all()
    return output

0 comments on commit 9df5451

Please sign in to comment.