-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrilldecode.py
executable file
·82 lines (63 loc) · 2.71 KB
/
brilldecode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/python3
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
# SPDX-License-Identifier: GPL-3.0-or-later
import bz2
import bs4
import html
from pathlib import Path, PurePath
from tqdm import tqdm
from brillcode import *
import argparse
# Command-line interface: the CD-ROM root is mandatory, the output path is not.
parser = argparse.ArgumentParser()
parser.add_argument("cdrom", help="path to the CDROM root")
# argparse only applies ``default`` to a positional argument when that
# argument may be omitted, i.e. when it is declared with nargs="?".  Without
# it the argument stays required and the advertised fallback to ei.slob can
# never take effect.
parser.add_argument(
    "output",
    nargs="?",
    default="ei.slob",
    help="output .slob file. Defaults to ei.slob",
)
args = parser.parse_args()
import slob
# Output container; entry pages and linked files are added to it further down
# and it is finalized at the very end of the script.
w = slob.create(PurePath(args.output))

# Files to embed next to the entries.  The stylesheet is always included; the
# entry loop appends the figures it encounters.
linkedfiles = ["EncIslam.css"]

# Counter for linked files that could not be added to the output.
missingfiles = 0

# Entry pages sit one level below the data directory, in sub-directories named
# with one of the letters C, D or S followed by a single digit.  They are
# processed in sorted path order.
entries = list(
    Path(PurePath(args.cdrom + "/Brill/Data/EncIslam")).glob("[CDS][0-9]/*.html")
)
entries.sort()
# ``re`` is needed for the title split below.  This script has no other
# explicit import of it, so import it here instead of relying on the star
# import from brillcode to leak the name.
import re

# CSS classes whose text is run through the ``brillcode`` translation table.
_BRILLCODE_CLASSES = ["Ba02", "Ba02SC", "mainentry"]
# A space followed by a bracketed run.  The title text in front of the first
# match is used as the primary lookup key of an entry.
_BRACKETED_PART = re.compile(r"( \[.*\])")


def _read_entry(path):
    """Return the raw bytes of one entry page.

    The file is first tried as a bz2 stream.  If the first read shows that
    it is not one, the file is read unmodified instead.  Every handle is
    closed before returning, including the bz2 handle on the fallback path.

    Raises:
        OSError: if the file cannot be opened or read at all.
    """
    with bz2.open(path, mode="rb") as packed:
        try:
            # Forces the start of the stream to be decoded, which is where
            # non-bz2 data is detected.  EOFError is caught alongside OSError
            # so that an empty or truncated file also takes the fallback.
            packed.peek(0)
        except (OSError, EOFError):
            is_bz2 = False
        else:
            is_bz2 = True
        if is_bz2:
            return packed.read()
    with open(path, mode="rb") as plain:
        return plain.read()


# Convert every entry page and add it to the output container.
for entry in tqdm(entries, unit="entries"):
    soup = bs4.BeautifulSoup(
        _read_entry(entry).decode("raw_unicode_escape"), "html.parser"
    )

    # Rename every <form> element to <span>.
    for tag in soup.find_all("form"):
        tag.name = "span"
    # Translate the text of the Brill-coded classes.  ``string=True`` limits
    # the match to tags that have a single string to replace.
    for tag in soup.find_all(class_=_BRILLCODE_CLASSES, string=True):
        tag.string = tag.string.translate(brillcode)
    # Resolve HTML character references in contributor text.
    for tag in soup.find_all(class_="contributor", string=True):
        tag.string = html.unescape(tag.string)

    # Point the page at the stylesheet that is embedded under this name.
    soup.find("link")["href"] = "EncIslam.css"

    # Queue each inline figure for embedding and reference it by its bare
    # file name, which is the key it is stored under.
    for inlfig in soup.find_all(class_="inlFig"):
        linkedfiles.append(str(PurePath(args.cdrom + "/Brill" + inlfig["src"])))
        inlfig["src"] = PurePath(inlfig["src"]).name

    # The title comes from the "blob" <meta> tag.  Its content is parsed as
    # markup so that Brill-coded spans inside it can be translated as well.
    title = soup.find("meta", attrs={"name": "blob"})["content"]
    title = title.translate(specialchars)
    title = bs4.BeautifulSoup(title, "html.parser")
    for tag in title.find_all(class_=_BRILLCODE_CLASSES, string=True):
        tag.string = tag.string.translate(brillcode)
    soup.find("title").string = title.text

    # Each page is stored under three keys: the title without its bracketed
    # part, the whitespace-normalised text of the "fat" element, and the file
    # name without the .html suffix.
    w.add(
        soup.encode("utf-8"),
        _BRACKETED_PART.split(title.text)[0],
        " ".join(soup.find(class_="fat").text.split()),
        entry.name.removesuffix(".html"),
        content_type=mimetypes[".html"],
    )
import sys  # only needed for the summary written to stderr below

# Embed every queued file once, keyed by its bare file name.  Entries in
# ``linkedfiles`` that are bare names (the stylesheet) are opened relative to
# the current working directory.
for resource in tqdm(sorted(set(linkedfiles)), unit="figures"):
    try:
        with open(resource, mode="rb") as f:
            payload = f.read()
        # NOTE(review): ``mimetypes`` is subscripted by file suffix, so it is
        # presumably a suffix -> content-type mapping provided by brillcode.
        content_type = mimetypes[PurePath(resource).suffix]
    except (OSError, KeyError):
        # Unreadable file or unknown suffix: skip it and keep going, but do
        # not hide interrupts or failures of the output container itself.
        missingfiles += 1
        continue
    w.add(payload, Path(resource).name, content_type=content_type)

w.finalize()

# The count used to be collected and then dropped; tell the user about it.
if missingfiles:
    print(f"{missingfiles} linked file(s) could not be added", file=sys.stderr)