-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlang_not_wikiq_codes.py
84 lines (67 loc) · 2.33 KB
/
lang_not_wikiq_codes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 19:04:01 2015
@author: misha
"""
import urllib2, csv
from bs4 import BeautifulSoup
#from unicodedata import normalize
def get_wiki_langs():
page = urllib2.urlopen("https://meta.wikimedia.org/wiki/Special:SiteMatrix").read()
soup = BeautifulSoup(page)
lang_table = soup.find('table', id="mw-sitematrix-table")
#print lang_table
wiki = []
for row in lang_table.findAll('tr'):
try:
r = row.contents
lang = r[0].text
code = r[1].text
link = r[1].find_all('a')[0]
print code, link['href'][2:]
wiki.append((lang, code, link['href'][2:]))
#writer.writerow({'language': lang, 'link': link[2:]})
except IndexError:
print 'not a language row:', row
return wiki
def get_wiktio_langs():
page = urllib2.urlopen("http://stats.wikimedia.org/wiktionary/EN/Sitemap.htm").read()
soup = BeautifulSoup(page)
wiktionaries = soup.find('table', id="table1")
#print str(wiktionaries)
wiktio_langs = []
for row in wiktionaries.findAll('tr')[1:-1]:
try:
lang = row.findAll('td')[2].text
wiktio_langs.append(lang)
except IndexError:
print 'not a language row:', row
wiktio_langs = wiktio_langs[1:-16]
print wiktio_langs
return wiktio_langs
def compare_langs(wiki, wiktio_langs):
out = open('wiki_wikti_lang_diff.txt', 'wb')
writer = csv.DictWriter(out, fieldnames = ['language', 'code', 'link'], dialect='excel')
writer.writeheader()
#wiki_codes = set(wiki)
wiktio_codes = set(wiktio_langs)
diff = []
for l in wiki[1:-1]:
if l[1] not in wiktio_codes:
diff.append(l[1])
try:
writer.writerow({'language': l[0].encode('utf-8'), 'code': l[1].encode('utf-8'), 'link': l[2].encode('utf-8')})
except UnicodeEncodeError:
print l
print diff
out.close()
return diff
def find_diff():
    """Scrape both language lists and return what compare_langs reports.

    Calls get_wiki_langs(), then get_wiktio_langs(), and passes both results
    to compare_langs(), returning its result unchanged.
    """
    wikipedia_entries = get_wiki_langs()
    wiktionary_codes = get_wiktio_langs()
    return compare_langs(wikipedia_entries, wiktionary_codes)
if __name__ == "__main__":
    # Script entry point: scrape both sources (Wikipedia first), then compare
    # them; the three results stay bound as module-level names.
    wiki, wiktio = get_wiki_langs(), get_wiktio_langs()
    result = compare_langs(wiki, wiktio)