-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwarc.py
127 lines (95 loc) · 3.83 KB
/
warc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: UTF-8 -*-
import os
from tempfile import NamedTemporaryFile
import shutil
from datetime import datetime
from hanzo.warctools import warc, WarcRecord
from hanzo.httptools import ResponseMessage
# URL written into the "conformsTo" field of the warcinfo record that
# Warc._init_file() emits at the start of every newly created archive.
CONFORMS_TO = "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"
class Warc(object):
    """Thin wrapper around a WARC archive file on disk.

    An instance holds a read handle and, unless opened read-only, a
    separate write handle on the same file.  Three modes are supported:

    * temporary  -- records are written to a ``NamedTemporaryFile``; call
      :meth:`make_permanent` to copy the data to ``file_name``.
    * read-only  -- ``file_name`` is opened for reading only and
      :meth:`write_record` raises ``RuntimeError``.
    * read/write -- ``file_name`` is opened for reading and for appending.

    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`close`.
    """

    # Name of the custom warcinfo header that stores the archive's main URL.
    MAIN_URL = "WARC-X-Main-URL"

    # Class-level defaults so every attribute exists even when __init__
    # fails part-way through (close() relies on this).
    _main_url = None
    _file_name = None
    _warc_file_read = None
    _warc_file_write = None
    _temporary = None
    _read_only = False

    def __init__(self, file_name, temporary=False, read_only=False, **kwargs):
        """Open or create a WARC archive.

        Args:
            file_name: Path of the (eventual) archive file.
            temporary: When true, write to a temporary file first; requires
                the ``main_url`` keyword argument.
            read_only: When true, open without a write handle.  Ignored
                (forced to ``False``) for temporary archives.
            **kwargs: ``main_url`` -- URL stored in the warcinfo record of a
                newly created (temporary) archive.

        Raises:
            ValueError: ``temporary`` is true but ``main_url`` is missing.
        """
        self._main_url = kwargs.get("main_url")
        self._file_name = file_name
        self._temporary = temporary
        # A temporary archive must be writable, so read_only is overridden.
        self._read_only = read_only if not self._temporary else False

        if self._temporary and not self._main_url:
            raise ValueError("Missing required argument: main_url")

        try:
            if self._temporary:
                self._warc_file_read = NamedTemporaryFile("rb")
                # Second handle on the same temp file, used only for writes.
                self._warc_file_write = open(self._warc_file_read.name, "wb")
                self._init_file()
            else:
                self._warc_file_read = open(file_name, "rb")
                if not self._read_only:
                    self._warc_file_write = open(file_name, "ab")
        except Exception:
            # Do not leak a half-opened pair of handles.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying file handles.  Safe to call repeatedly."""
        # The write handle is closed first so buffered data reaches the file
        # before the read handle goes away.
        for attr in ("_warc_file_write", "_warc_file_read"):
            handle = getattr(self, attr)
            if handle is not None:
                setattr(self, attr, None)
                handle.close()

    def find_record(self, url):
        """Return the first HTTP response record whose URL equals *url*.

        The archive is scanned from the beginning.  Returns ``None`` when no
        matching record exists.
        """
        self._warc_file_read.seek(0)
        stream = WarcRecord.open_archive(file_handle=self._warc_file_read,
                                         gzip="record")
        for (_offset, record, _errors) in stream.read_records(limit=None):
            if not record:
                continue
            if (record.type == WarcRecord.RESPONSE
                    and record.content[0] == ResponseMessage.CONTENT_TYPE
                    and record.url == url):
                return record
        return None

    def write_record(self, record):
        """Append *record* (gzip-compressed) to the archive and flush.

        Raises:
            RuntimeError: The archive was opened read-only.
        """
        if self._read_only:
            raise RuntimeError("WARC opened for read-only access")
        self._warc_file_write.seek(0, os.SEEK_END)
        record.write_to(self._warc_file_write, gzip=True)
        # Flush so the separate read handle sees the new record.
        self._warc_file_write.flush()

    def make_permanent(self):
        """Copy a temporary archive to ``file_name`` and switch over to it.

        Afterwards the instance reads from and writes to the permanent file
        and :attr:`temporary` is ``False``.  The temporary handles are
        closed.

        Raises:
            RuntimeError: The archive is not temporary.
        """
        if not self._temporary:
            raise RuntimeError("This WARC is not temporary")

        permanent_write = open(self._file_name, "wb")
        try:
            self._warc_file_read.seek(0)
            # copy temp file to its permanent location
            shutil.copyfileobj(self._warc_file_read, permanent_write)
            permanent_write.flush()
            permanent_read = open(self._file_name, "rb")
        except Exception:
            permanent_write.close()
            raise

        old_read = self._warc_file_read
        old_write = self._warc_file_write
        self._warc_file_read = permanent_read
        self._warc_file_write = permanent_write
        self._temporary = False

        # Release the temporary handles that the new ones replace.
        old_write.close()
        old_read.close()

    @property
    def main_url(self):
        """The ``main_url`` passed to the constructor, or ``None``."""
        return self._main_url

    @property
    def temporary(self):
        """Whether the archive is still backed by a temporary file."""
        return self._temporary

    @property
    def read_only(self):
        """Whether the archive was opened without write access."""
        return self._read_only

    def _init_file(self):
        """Write the leading warcinfo record of a newly created archive."""
        warcinfo_headers = [
            (WarcRecord.TYPE, WarcRecord.WARCINFO),
            (WarcRecord.ID, WarcRecord.random_warc_uuid()),
            (WarcRecord.DATE, warc.warc_datetime_str(datetime.utcnow())),
            (WarcRecord.FILENAME, os.path.basename(self._file_name)),
            (Warc.MAIN_URL, self._main_url),
        ]
        warcinfo_fields = "\r\n".join([
            "software: bardo",
            "format: WARC File Format 1.0",
            "conformsTo: " + CONFORMS_TO,
            "robots: unknown",
        ])
        warcinfo_content = ("application/warc-fields", warcinfo_fields)
        warcinfo_record = WarcRecord(headers=warcinfo_headers,
                                     content=warcinfo_content)
        self.write_record(warcinfo_record)

    def _load_warc_info(self):
        """Return the warcinfo record at the start of the archive.

        Raises:
            ValueError: The first record is missing or is not a warcinfo
                record.
        """
        self._warc_file_read.seek(0)
        stream = WarcRecord.open_archive(file_handle=self._warc_file_read,
                                         gzip="record")
        # read_records() yields (offset, record, errors) tuples, exactly as
        # consumed in find_record(), so unpack the first one instead of
        # indexing the result as if it were a list of records.
        for (_offset, record, _errors) in stream.read_records(limit=1):
            if record and record.type == WarcRecord.WARCINFO:
                return record
            break
        raise ValueError("WARC info not found")