-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathbuild_lang_table.py
executable file
·49 lines (34 loc) · 1.02 KB
/
build_lang_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env pythonw
import sys
import gzip
import string
import re
import numpy as np
# Matches a single ASCII letter; IGNORECASE is kept from the original pattern.
letter_regex = re.compile(r'[a-z]', re.I)

# The 26 lowercase ASCII letters in sorted order, and each letter's slot in
# the counts array.
characters = sorted(set(string.ascii_letters.lower()))
pos = {letter: index for index, letter in enumerate(characters)}

# A progress line is printed whenever the running line count reaches a
# multiple of this value.
PROGRESS_INTERVAL = 100000


def tally_lines(lines, counts, line_count=0, label=""):
    """Add the letter counts found in *lines* to *counts* in place.

    Each line is lowercased and split on whitespace. The first field is
    taken as the word and the third field as its integer count. Words that
    contain an underscore, or any character not matched by ``letter_regex``,
    are ignored. For every remaining word, the count is added once per
    letter occurrence.

    Args:
        lines: Iterable of text lines.
        counts: uint64 NumPy array with one slot per entry in ``characters``;
            modified in place.
        line_count: Number of lines already consumed before this call (the
            counter is shared across files).
        label: Text printed in front of the progress count (the file name
            when called from ``main``).

    Returns:
        The updated running line count.

    Raises:
        ValueError: If the third field of a line is not an integer.
    """
    for line in lines:
        fields = line.lower().strip().split()
        line_count += 1
        if line_count % PROGRESS_INTERVAL == 0:
            print(label, line_count)
            # NOTE(review): this stops reading the current input as soon as
            # the running count hits a multiple of PROGRESS_INTERVAL, and the
            # line that triggered it is not tallied. Kept as in the original;
            # confirm whether this sampling is intended or a debug leftover.
            break
        if len(fields) < 3:
            # Blank or truncated line: there is no count field to read.
            continue
        # Casting to uint64 keeps the addition below in integer arithmetic on
        # NumPy versions that promote uint64 + Python int to float64.
        count = np.uint64(int(fields[2]))
        word = fields[0]
        if "_" in word:
            continue
        letters = letter_regex.findall(word)
        if len(letters) != len(word):
            continue
        for letter in letters:
            # With IGNORECASE the pattern can match a few non-ASCII
            # characters, which have no slot in the table.
            index = pos.get(letter)
            if index is not None:
                counts[index] += count
    return line_count


def letter_frequencies(counts):
    """Return ``(letter, relative_frequency)`` pairs in alphabetical order.

    Args:
        counts: Array of per-letter totals aligned with ``characters``.

    Returns:
        A list with one pair per letter. If nothing was counted, every
        frequency is ``0.0`` instead of the result of a division by zero.
    """
    total = np.sum(counts)
    if total == 0:
        return [(letter, 0.0) for letter in characters]
    return [(letter, count / total) for letter, count in zip(characters, counts)]


def main(argv=None):
    """Tally letters from the gzip files named in *argv* and print the table.

    Args:
        argv: List of gzip-compressed text file paths; defaults to
            ``sys.argv[1:]``.
    """
    filenames = sys.argv[1:] if argv is None else argv
    counts = np.zeros(len(characters), dtype='uint64')
    line_count = 0
    for filename in filenames:
        # The context manager closes the file even when reading stops early.
        with gzip.open(filename, "rt") as handle:
            line_count = tally_lines(handle, counts, line_count, filename)
    for letter, frequency in letter_frequencies(counts):
        print(letter, frequency)


if __name__ == "__main__":
    main()