forked from archlinux/archmanweb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update.py
executable file
·401 lines (330 loc) · 16.4 KB
/
update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
#! /usr/bin/env python3
import argparse
import os.path
import logging
import datetime
from pathlib import PurePath
import subprocess
import chardet
import pyalpm
from finder import MANDIR, ManPagesFinder
# init django
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mysite.settings")
import django
django.setup()
from django.db import connection, transaction
from django.db.models import Count
from archweb_manpages.models import Package, Content, ManPage, SymbolicLink, UpdateLog, SoelimError
logger = logging.getLogger(__name__)
class UnknownManPath(Exception):
    """Raised when a path cannot be interpreted as the location of a man page."""
def decode(text, *, encoding_hint=None):
    """Decode the raw bytes of a man page into a ``str``.

    The candidates are tried in order and the first one that decodes the
    whole input without an error wins.

    :param text: the raw content as ``bytes``
    :param encoding_hint:
        optional name of an encoding to try before the built-in candidates
        (e.g. ``KOI8-R`` taken from a directory name such as ``ru.KOI8-R``);
        names unknown to Python are silently ignored
    :returns: the decoded text
    """
    # NOTE: iso-8859-1 maps every possible byte value, so it never fails;
    # the candidates listed after it and the chardet fallback below are
    # effectively a safety net only.
    charsets = ["utf-8", "ascii", "iso-8859-1", "iso-8859-9", "iso-8859-15", "cp1250", "cp1252"]
    if encoding_hint is not None:
        charsets.insert(0, encoding_hint)

    for charset in charsets:
        try:
            return text.decode(charset)
        except UnicodeDecodeError:
            continue
        except LookupError:
            # ignore invalid encoding_hint
            continue

    # fall back to chardet and errors="replace"; chardet may report no
    # encoding at all (None) or a name Python has no codec for, in which
    # case utf-8 with replacement characters is the last resort
    encoding = chardet.detect(text)["encoding"] or "utf-8"
    try:
        return text.decode(encoding, errors="replace")
    except LookupError:
        return text.decode("utf-8", errors="replace")
def parse_man_path(path):
    """Extract the name, section and language of a man page from its path.

    The file name is split into ``<name>.<section>``. The language is taken
    from the directory layout below ``MANDIR``: a page directly inside a
    ``man*`` directory is reported as ``"en"``, otherwise the first directory
    below ``MANDIR`` is the language, provided that the next one is a
    ``man*`` directory. The language is returned verbatim, so it may still
    carry an encoding suffix (e.g. ``ru.KOI8-R``).

    :param path: path of the file as a string
    :returns: tuple ``(man_name, man_section, man_lang)``
    :raises UnknownManPath:
        if the file name has no section suffix, the path is not located
        under ``MANDIR`` or the directory layout is not recognized
    """
    pp = PurePath(path)
    man_name = pp.stem
    man_section = pp.suffix[1:]  # strip the dot
    if not man_section:
        raise UnknownManPath("empty section number")

    # cheap textual check first; it is necessary but not sufficient, because
    # a string prefix does not have to end on a path component boundary
    if not path.startswith(MANDIR):
        raise UnknownManPath
    try:
        relative = pp.relative_to(MANDIR)
    except ValueError:
        # report it the same way as any other path outside of MANDIR so
        # that callers can skip the entry instead of crashing
        raise UnknownManPath from None

    parts = relative.parts
    if parts and parts[0].startswith("man"):
        man_lang = "en"
    elif len(parts) > 1 and parts[1].startswith("man"):
        man_lang = parts[0]
    else:
        raise UnknownManPath
    return man_name, man_section, man_lang
def update_packages(finder, *, force=False, only_repos=None):
    """Synchronize the ``Package`` table with the pacman sync databases.

    Packages which are new, or whose version in the sync database is newer
    than the stored one, are created/updated in the django database.
    Packages which no longer exist according to ``finder.pkg_exists`` are
    deleted from the django database.

    :param finder: object providing ``sync_db.get_syncdbs()`` and ``pkg_exists(repo, name)``
    :param force:
        when ``True``, packages whose version did not increase are
        refreshed and reported as updated, too
    :param only_repos:
        optional collection of repository names; when given (and non-empty),
        all other repositories are skipped during the update phase
    :returns: list of the pyalpm package objects that were added or updated
    """
    updated_pkgs = []

    # update packages in the django database
    for db in finder.sync_db.get_syncdbs():
        if only_repos and db.name not in only_repos:
            continue
        logger.info("Updating packages from repository '{}'...".format(db.name))
        for pkg in db.pkgcache:
            try:
                db_package = Package.objects.get(repo=db.name, name=pkg.name)
            except Package.DoesNotExist:
                db_package = Package()
                db_package.repo = db.name
                db_package.name = pkg.name
                db_package.arch = pkg.arch
            else:
                # only the sign of the vercmp result is meaningful
                is_newer = pyalpm.vercmp(db_package.version, pkg.version) < 0
                if not is_newer and force is not True:
                    # skip void update of db_package
                    continue

            updated_pkgs.append(pkg)

            # update volatile fields (this is run iff the pkg was added to updated_pkgs)
            db_package.version = pkg.version
            db_package.description = pkg.desc
            db_package.url = pkg.url
            db_package.build_date = datetime.datetime.fromtimestamp(pkg.builddate, tz=datetime.timezone.utc)
            db_package.licenses = pkg.licenses
            db_package.save()

    # delete old packages from the django database
    # (a single order_by call: chaining a second call would discard the first ordering)
    for db_package in Package.objects.order_by("repo", "name"):
        if not finder.pkg_exists(db_package.repo, db_package.name):
            Package.objects.filter(repo=db_package.repo, name=db_package.name).delete()

    return updated_pkgs
def _split_lang_encoding(lang):
    """Split a language like ``ru.KOI8-R`` into ``("ru", "KOI8-R")``; the encoding is ``None`` if absent."""
    if "." in lang:
        lang, encoding = lang.split(".", maxsplit=1)
        return lang, encoding
    return lang, None


def _import_man_file(db_pkg, keys, path, content):
    """Store one regular man page file; return ``True`` if a page was inserted/updated."""
    # extract info from path, check if it makes sense
    try:
        man_name, man_section, man_lang = parse_man_path(path)
    except UnknownManPath:
        logger.warning("Skipping path with unrecognized structure: {}".format(path))
        return False

    # extract the encoding hint (see e.g. evim.1.ru.KOI8-R)
    man_lang, encoding_hint = _split_lang_encoding(man_lang)

    # decode the content
    content = decode(content, encoding_hint=encoding_hint)
    # django complains, the DBMS would drop it anyway
    content = content.replace("\0", "")
    if not content:
        logger.warning("Skipping empty man page: {}".format(path))
        return False

    key = (man_name, man_section, man_lang)
    if key in keys:
        logger.debug("Skipping duplicate man page (maybe duplicate encoding): {}".format(path))
        return False
    keys.add(key)

    # find or create Content instance
    try:
        db_man = ManPage.objects.get(package_id=db_pkg.id, name=man_name, section=man_section, lang=man_lang)
        db_content = db_man.content
    except ManPage.DoesNotExist:
        db_man = None
        db_content = Content()

    # update content; the converted forms are reset together with the raw text
    db_content.raw = content
    db_content.html = None
    db_content.txt = None
    db_content.save()

    # update newly-created ManPage instance
    if db_man is None:
        db_man = ManPage()
        db_man.package_id = db_pkg.id
        db_man.name = man_name
        db_man.section = man_section
        db_man.lang = man_lang
        db_man.content = db_content
        # db_man has to be saved after db_content, because django's
        # validation is not deferrable (and db_content.id is not
        # known until the content is saved)
        db_man.full_clean()
        # TODO: this might still fail if there are multiple foo.1 in different directories and same language
        db_man.save()
    return True


def _import_hardlink(db_pkg, keys, source, target):
    """Store a hardlink as a ``ManPage`` sharing the target's content; return ``True`` if stored."""
    # hardlinks can't point to non-existent files, so they can be stored in the ManPage table

    # extract info from source, check if it makes sense
    try:
        source_name, source_section, source_lang = parse_man_path(source)
    except UnknownManPath:
        logger.warning("Skipping hardlink with unrecognized source path: {}".format(source))
        return False
    # extract info from target, check if it makes sense
    try:
        target_name, target_section, target_lang = parse_man_path(target)
    except UnknownManPath:
        logger.warning("Skipping hardlink with unrecognized target path: {}".format(target))
        return False

    # drop encoding from the lang (ru.KOI8-R)
    source_lang, _ = _split_lang_encoding(source_lang)
    target_lang, _ = _split_lang_encoding(target_lang)

    source_key = (source_name, source_section, source_lang)
    # drop useless redirects
    if source_key == (target_name, target_section, target_lang):
        logger.warning("Skipping hardlink from {} to {} (the base name is the same).".format(source, target))
        return False
    if source_key in keys:
        logger.debug("Skipping duplicate hardlink: {}".format(source))
        return False

    # The target page may be missing from the database (e.g. because the
    # target file itself was skipped above); skip the link like any other
    # unusable entry instead of aborting the whole import.
    try:
        man_target = ManPage.objects.get(package_id=db_pkg.id, name=target_name, section=target_section, lang=target_lang)
    except ManPage.DoesNotExist:
        logger.warning("Skipping hardlink from {} to {} (the target page is not in the database).".format(source, target))
        return False
    # the key is recorded only now, so that a stale row of a skipped link
    # is not protected from the cleanup in update_man_pages
    keys.add(source_key)

    # save into database
    try:
        man_source = ManPage.objects.get(package_id=db_pkg.id, name=source_name, section=source_section, lang=source_lang)
    except ManPage.DoesNotExist:
        man_source = ManPage(
            package_id=db_pkg.id,
            name=source_name,
            section=source_section,
            lang=source_lang,
        )
    man_source.content_id = man_target.content_id
    # validate and save
    man_source.full_clean()
    man_source.save()
    return True


def _import_symlink(db_pkg, source, target):
    """Store a symlink between two man pages of the same language in the ``SymbolicLink`` table."""
    # extract info from source, check if it makes sense
    try:
        source_name, source_section, source_lang = parse_man_path(source)
    except UnknownManPath:
        logger.warning("Skipping symlink with unrecognized structure: {}".format(source))
        return

    if target.startswith("/"):
        # make target relative to "/"
        target = target[1:]
    else:
        # make target full path
        ppt = PurePath(source).parent / target
        # normalize to remove any '..'
        target = os.path.normpath(ppt)

    # extract info from target, check if it makes sense
    try:
        target_name, target_section, target_lang = parse_man_path(target)
    except UnknownManPath:
        logger.warning("Skipping symlink with unknown target: {}".format(target))
        return

    # drop encoding from the lang (ru.KOI8-R)
    source_lang, _ = _split_lang_encoding(source_lang)
    target_lang, _ = _split_lang_encoding(target_lang)

    # drop cross-language symlinks
    if target_lang != source_lang:
        logger.warning("Skipping cross-language symlink from {} to {}".format(source, target))
        return
    # drop useless redirects
    if target_section == source_section and target_name == source_name:
        logger.warning("Skipping symlink from {} to {} (the base name is the same).".format(source, target))
        return

    # save into database
    try:
        db_link = SymbolicLink.objects.get(package_id=db_pkg.id, lang=source_lang, from_section=source_section, from_name=source_name)
    except SymbolicLink.DoesNotExist:
        db_link = SymbolicLink(
            package_id=db_pkg.id,
            lang=source_lang,
            from_section=source_section,
            from_name=source_name,
        )
    db_link.to_section = target_section
    db_link.to_name = target_name
    # validate and save
    db_link.full_clean()
    db_link.save()


def update_man_pages(finder, updated_pkgs):
    """Import the man pages of the given packages into the django database.

    For each package, the entries reported by ``finder.get_man_contents``
    are processed by type: regular files and hardlinks are stored in the
    ``ManPage`` table, symlinks in the ``SymbolicLink`` table. Man pages of
    the package which were not encountered are deleted afterwards, and
    finally all ``Content`` rows not referenced by any man page are removed.

    :param finder: object providing ``get_man_files(pkg)`` and ``get_man_contents(pkg)``
    :param updated_pkgs: iterable of pyalpm package objects to process
    :returns: number of man pages (files and hardlinks) inserted or updated
    :raises NotImplementedError: if the finder reports an unknown entry type
    """
    logger.info("Updating man pages from {} packages...".format(len(updated_pkgs)))

    updated_pages = 0

    for pkg in updated_pkgs:
        db_pkg = Package.objects.filter(repo=pkg.db.name, name=pkg.name)[0]
        files = set(finder.get_man_files(pkg))
        # NOTE(review): pages stored for a package which no longer ships any
        # man files are not cleaned up, because the cleanup below is skipped
        # as well - confirm whether this is intended.
        if not files:
            continue

        # set of unique keys (tuples) of pages present in the package,
        # the rest will be deleted from the database
        keys = set()

        # insert/update man pages
        for t, v1, v2 in finder.get_man_contents(pkg):
            if t == "file":
                stored = _import_man_file(db_pkg, keys, v1, v2)
            elif t == "hardlink":
                stored = _import_hardlink(db_pkg, keys, v1, v2)
            elif t == "symlink":
                # symlinks are not counted as updated pages
                _import_symlink(db_pkg, v1, v2)
                stored = False
            else:
                raise NotImplementedError("Unknown tarball entry type: {}".format(t))
            if stored:
                updated_pages += 1

        # delete man pages whose files no longer exist
        # NOTE(review): there is no equivalent cleanup for SymbolicLink rows.
        for db_man in ManPage.objects.filter(package_id=db_pkg.id):
            if (db_man.name, db_man.section, db_man.lang) not in keys:
                ManPage.objects.filter(package_id=db_pkg.id, name=db_man.name, section=db_man.section, lang=db_man.lang).delete()

    # delete unreferenced rows from Content
    Content.objects.filter(manpage_content__isnull=True).delete()

    return updated_pages
# Script entry point: refresh the sync databases, import packages and man
# pages in one transaction, convert the pages to plain text, vacuum the
# tables and record the run in the UpdateLog table.
if __name__ == "__main__":
    # init logging (rebinds the module-level logger to the root logger)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    formatter = logging.Formatter("{levelname:8} {message}", style="{")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    parser = argparse.ArgumentParser(description="update man pages in the django database")
    parser.add_argument("--force", action="store_true",
                        help="force an import of man pages from all packages, even if they were not updated recently")
    parser.add_argument("--only-repos", action="store", nargs="+", metavar="NAME",
                        help="import packages (and man pages) only from these repositories")
    parser.add_argument("--only-packages", action="store", nargs="+", metavar="NAME",
                        help="import man pages only from these packages")
    parser.add_argument("--cache-dir", action="store", default="./.cache/",
                        help="path to the cache directory (default: %(default)s)")
    parser.add_argument("--keep-tarballs", action="store_true",
                        help="keep downloaded package tarballs in the cache directory")
    args = parser.parse_args()

    start = datetime.datetime.now(tz=datetime.timezone.utc)

    finder = ManPagesFinder(args.cache_dir)
    finder.refresh()

    # everything in a single transaction
    with transaction.atomic():
        updated_pkgs = update_packages(finder, force=args.force, only_repos=args.only_repos)
        if args.only_packages is None:
            count_updated_pages = update_man_pages(finder, updated_pkgs)
        else:
            selected_names = set(args.only_packages)
            count_updated_pages = update_man_pages(finder, [p for p in updated_pkgs if p.name in selected_names])

    # this is called outside of the transaction, so that the cache can be reused on errors
    if not args.keep_tarballs:
        finder.clear_pkgcache()

    # update plain-text (convert_txt is fast, but without preprocessor)
    convert_txt_returncode = None
    if os.path.isfile("./convert_txt"):
        _dbs = django.conf.settings.DATABASES["default"]
        # Pass the arguments as a list without a shell, so that credentials
        # containing spaces or shell metacharacters are handed over verbatim.
        # NOTE(review): the password is still visible in the process list.
        cmd = [
            "./convert_txt",
            "--target", "{}@{}".format(_dbs["NAME"], _dbs["HOST"] or "localhost"),
            "--user", str(_dbs["USER"]),
            "--password", str(_dbs["PASSWORD"]),
        ]
        p = subprocess.run(cmd)
        convert_txt_returncode = p.returncode

    # update remaining plain-text which convert_txt could not handle
    # (one transaction per update, otherwise we might hit memory allocation error)
    def worker(man):
        """Convert one man page to plain text; conversion errors are logged, not raised."""
        try:
            man.get_converted("txt")
        except SoelimError:
            logger.error("SoelimError while converting {}.{}.{} to txt".format(man.name, man.section, man.lang))
        except subprocess.CalledProcessError as e:
            logger.error("CalledProcessError while converting {}.{}.{} to txt:\nreturncode = {}\nstderr = {}"
                         .format(man.name, man.section, man.lang, e.returncode, e.stderr))

    queryset = ManPage.objects.only("package", "lang", "content_id", "converted_content_id").filter(content__txt=None).iterator()
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # the results are intentionally not collected; leaving the with
        # block waits for all submitted conversions to finish
        executor.map(worker, queryset)

    # VACUUM cannot run inside a transaction block
    if updated_pkgs or args.only_packages is not None:
        logger.info("Running VACUUM FULL ANALYZE on our tables...")
        for Model in [Package, Content, ManPage, SymbolicLink]:
            # the table name comes from the model metadata, not from user input
            table = Model.objects.model._meta.db_table
            logger.info("--> {}".format(table))
            with connection.cursor() as cursor:
                cursor.execute("VACUUM FULL ANALYZE {};".format(table))

    end = datetime.datetime.now(tz=datetime.timezone.utc)

    # log update
    log = UpdateLog()
    log.timestamp = start
    log.duration = end - start
    log.updated_pkgs = len(updated_pkgs)
    log.updated_pages = count_updated_pages
    log.stats_count_man_pages = ManPage.objects.count()
    log.stats_count_symlinks = SymbolicLink.objects.count()
    log.stats_count_all_pkgs = Package.objects.count()
    log.stats_count_pkgs_with_mans = ManPage.objects.aggregate(Count("package_id", distinct=True))["package_id__count"]
    log.convert_txt_returncode = convert_txt_returncode
    log.save()