This repository was archived by the owner on Nov 2, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerate_stats.py
71 lines (49 loc) · 2.23 KB
/
generate_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
from __future__ import division
import sys
from math import ceil
from collections import defaultdict
import codecs
import json
from multiprocessing import Pool as ThreadPool
def stats(page, source, target):
    """Count how the links of one article overlap between two languages.

    Four JSON files are read from ``data/<source>.<page>/``:
    ``<source>.json`` and ``<target>.json`` (the link collections of each
    side) and ``<source>.<target>.json`` / ``<target>.<source>.json``
    (mappings, used here through their keys and values; presumably
    title -> translated title -- confirm against the data producer).

    Returns ``None`` when ``target`` is listed in the module-level
    ``ignore_langs``; otherwise a dict with these counts:

    * ``left`` / ``right``: size of each link collection.
    * ``left_untranslated``: left links that are not keys of the left mapping.
    * ``left_absent``: left-mapping values missing from the right links.
    * ``intersection``: left-mapping keys that are also right-mapping values.
    * ``right_absent``: right-mapping values missing from the left links.
    * ``right_untranslated``: right links that are not keys of the right
      mapping.
    """
    if target in ignore_langs:
        return None

    def read_json(template):
        # Every template is filled from the same (page, source, target) triple.
        path = template.format(page, source, target)
        with codecs.open(path, "r", "utf-8-sig") as handle:
            return json.load(handle)

    left_links = read_json(u"data/{1}.{0}/{1}.json")
    left_map = read_json(u"data/{1}.{0}/{1}.{2}.json")
    right_links = read_json(u"data/{1}.{0}/{2}.json")
    right_map = read_json(u"data/{1}.{0}/{2}.{1}.json")

    # Build each set once; the counts below are plain set arithmetic.
    left_set = set(left_links)
    left_keys = set(left_map.keys())
    left_values = set(left_map.values())
    right_set = set(right_links)
    right_keys = set(right_map.keys())
    right_values = set(right_map.values())

    counts = {
        "left": len(left_links),
        "right": len(right_links),
        "left_untranslated": len(left_set - left_keys),
        "left_absent": len(left_values - right_set),
        "intersection": len(left_keys & right_values),
        "right_absent": len(right_values - left_set),
        "right_untranslated": len(right_set - right_keys),
    }
    return counts
def compute_stats(source):
    """Compute link statistics for one article against every listed language.

    ``source`` is an article title, optionally followed by ``#<lang>`` to
    choose the source language (``"en"`` when no ``#`` is present).

    The index file ``data/<lang>.<title>.json`` is loaded, ``stats`` is
    called once for each key in it, and the collected results are written
    to ``data/<lang>.<title>.stats.json``.  Nothing is returned.
    """
    lang = "en"
    if "#" in source:
        # Split once; text after a second "#" is ignored, as before.
        pieces = source.split("#")
        source, lang = pieces[0], pieces[1]

    index_path = u"data/{1}.{0}.json".format(source, lang)
    # Context manager so the index file is closed deterministically; the
    # original passed the open handle straight to json.load and leaked it.
    with codecs.open(index_path, "r", "utf-8-sig") as f:
        langlinks = json.load(f)

    result = {target: stats(source, lang, target) for target in langlinks}

    stats_path = u"data/{1}.{0}.stats.json".format(source, lang)
    with codecs.open(stats_path, "w", "utf-8-sig") as f:
        json.dump(result, f, ensure_ascii=False, indent=2, separators=(',', ': '))
# Target language codes that stats() skips (it returns None for them).
# Currently empty; an earlier, immediately-overwritten value was ["th"].
ignore_langs = []

if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Default article titles when none are given on the command line.
        # A previous, overwritten default list was: Love, Revolution, Wisdom,
        # Ethics, Morality, Surveillance.
        sources = ["Russia", "Crimea", "Ukraine"]
    else:
        sources = sys.argv[1:]

    # A parenthesised single argument prints identically whether print is a
    # statement (Python 2) or a function (Python 3).
    print(sources)

    # Explicit loop rather than map(): the calls are made only for their side
    # effects, and a lazy map() on Python 3 would never invoke compute_stats.
    for source in sources:
        compute_stats(source)