
Commit e63c790

Initial commit
1 parent 89b67fc commit e63c790

5 files changed: +420 -0 lines changed

lang_codes.py

+31
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 19:04:01 2015

@author: misha
"""

import urllib2, csv
from bs4 import BeautifulSoup
#from unicodedata import normalize

page = urllib2.urlopen("https://meta.wikimedia.org/wiki/Special:SiteMatrix").read()
soup = BeautifulSoup(page)
lang_table = soup.find('table', id="mw-sitematrix-table")
#print lang_table

out = open('wiki_lang_codes.csv', 'wb')
writer = csv.DictWriter(out, fieldnames = ['language', 'link'], dialect='excel')
writer.writeheader()

for row in lang_table.findAll('tr'):
    try:
        lang = row.findAll('td')[1].text
        link = row.findAll('td')[1].a.get('href')
        print lang, link
        writer.writerow({'language': lang, 'link': link[2:]})
    except IndexError:
        print 'not a language row:', row

out.close()
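The script above is Python 2 (urllib2, the print statement). As a rough illustration only, a minimal Python 3 sketch of the same SiteMatrix scrape might look like the following; the table id "mw-sitematrix-table" and the protocol-relative hrefs ("//xx.wikipedia.org/...") are assumptions carried over from the code above.

# hypothetical Python 3 sketch of the same scrape; table id and href format are assumptions
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

page = urlopen("https://meta.wikimedia.org/wiki/Special:SiteMatrix").read()
soup = BeautifulSoup(page, "html.parser")
lang_table = soup.find('table', id="mw-sitematrix-table")

with open('wiki_lang_codes.csv', 'w', newline='', encoding='utf-8') as out:
    writer = csv.DictWriter(out, fieldnames=['language', 'link'], dialect='excel')
    writer.writeheader()
    for row in lang_table.findAll('tr'):
        try:
            cell = row.findAll('td')[1]
            # drop the leading '//' of the protocol-relative link, as the original does
            writer.writerow({'language': cell.text, 'link': cell.a.get('href')[2:]})
        except (IndexError, AttributeError):
            print('not a language row:', row)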

lang_not_wikiq_codes.py

+68
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 19:04:01 2015

@author: misha
"""

import urllib2, csv
from bs4 import BeautifulSoup
#from unicodedata import normalize

page = urllib2.urlopen("https://meta.wikimedia.org/wiki/Special:SiteMatrix").read()
soup = BeautifulSoup(page)
lang_table = soup.find('table', id="mw-sitematrix-table")
#print lang_table

wiki = []
for row in lang_table.findAll('tr'):
    try:
        r = row.contents

        lang = r[0].text
        code = r[1].text
        link = r[1].find_all('a')[0]
        print code, link['href'][2:]
        wiki.append((lang, code, link['href'][2:]))
        #writer.writerow({'language': lang, 'link': link[2:]})
    except IndexError:
        print 'not a language row:', row

page = urllib2.urlopen("http://stats.wikimedia.org/wiktionary/EN/Sitemap.htm").read()
soup = BeautifulSoup(page)
wiktionaries = soup.find('table', id="table1")
#print str(wiktionaries)
wiktio_langs = []

for row in wiktionaries.findAll('tr'):
    try:
        lang = row.findAll('td')[2].text
        wiktio_langs.append(lang)
    except IndexError:
        print 'not a language row:', row

wiktio_langs = wiktio_langs[2:128]

print wiktio_langs

out = open('wiki_wikti_lang_diff.txt', 'wb')
writer = csv.DictWriter(out, fieldnames = ['language', 'code', 'link'], dialect='excel')
writer.writeheader()

#wiki_codes = set(wiki)
wiktio_codes = set(wiktio_langs)

diff = []

for l in wiki[1:-1]:
    if l[1] not in wiktio_codes:
        diff.append(l[1])
        try:
            writer.writerow({'language': l[0].encode('utf-8'), 'code': l[1].encode('utf-8'), 'link': l[2].encode('utf-8')})
        except UnicodeEncodeError:
            print l

print diff

out.close()
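Once both tables are scraped, the comparison itself is a set-membership test on the second field of each SiteMatrix entry. A compact sketch of that step with tiny stand-in values (the real lists come from the scraping above, and the first and last rows are sliced off as in the script):

# sketch of the comparison step; the sample tuples below are stand-ins, not scraped data
wiki = [(u'Afar', u'aa', u'aa.wikipedia.org/wiki/'), (u'Abkhazian', u'ab', u'ab.wikipedia.org/wiki/')]
wiktio_langs = [u'ab']

wiktio_codes = set(wiktio_langs)
diff = [entry[1] for entry in wiki if entry[1] not in wiktio_codes]
print(diff)  # entries present in the SiteMatrix but absent from the Wiktionary list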
+116
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 21:29:14 2015

@author: Tao Steel
"""

import urllib2, csv
from bs4 import BeautifulSoup
#from unicodedata import normalize


wiki_langs = [u'aa', u'ab', u'ace', u'ak', u'als', u'an', u'arc', u'arz', u'as',
              u'av', u'ba', u'bar', u'bat-smg', u'bcl', u'be-x-old', u'bh', u'bi',
              u'bjn', u'bm', u'bo', u'bpy', u'bug', u'bxr', u'cbk-zam', u'cdo',
              u'ce', u'ceb', u'ch', u'cho', u'chy', u'ckb', u'cr', u'crh', u'cu',
              u'cv', u'diq', u'dsb', u'dz', u'ee', u'eml', u'ext', u'ff', u'fiu-vro',
              u'frp', u'frr', u'fur', u'gag', u'gan', u'glk', u'got', u'hak',
              u'haw', u'hif', u'ho', u'hsb', u'ht', u'hz', u'ia', u'ie', u'ig',
              u'ii', u'ik', u'ilo', u'iu', u'kaa', u'kab', u'kbd', u'kg', u'ki',
              u'kj', u'koi', u'kr', u'krc', u'ksh', u'kv', u'kw', u'lad', u'lbe',
              u'lez', u'lg', u'lij', u'lmo', u'ltg', u'mai', u'map-bms', u'mdf',
              u'mh', u'mhr', u'min', u'mo', u'mr', u'mrj', u'mt', u'mus', u'mwl',
              u'myv', u'mzn', u'nap', u'nds-nl', u'ne', u'new', u'ng', u'nov',
              u'nrm', u'nso', u'nv', u'ny', u'om', u'os', u'pa', u'pag', u'pam',
              u'pap', u'pcd', u'pdc', u'pfl', u'pi', u'pih', u'pms', u'pnt',
              u'qu', u'rm', u'rmy', u'rn', u'roa-tara', u'rue', u'rw', u'sah',
              u'sc', u'sco', u'sd', u'se', u'si', u'sn', u'srn', u'ss', u'st',
              u'stq', u'szl', u'tet', u'to', u'ts', u'tum', u'tw', u'ty', u'tyv',
              u'udm', u've', u'vec', u'vep', u'vls', u'war', u'wuu', u'xal', u'xh',
              u'xmf', u'yo', u'za', u'zea', u'zh-classical', u'zh-yue', u'zu']


baselink = ['http://stats.wikimedia.org/EN/TablesWikipedia', '.htm']

wiki_links = []
for w in wiki_langs:
    if '-' in w:
        w = w.replace('-','_')
    link = baselink[0] + w.upper() + baselink[1]
    wiki_links.append([link, w])

print wiki_links

good_links = []
# check if links are valid (they still could be bad, though)
for l in wiki_links:
    link = l[0]
    try:
        urllib2.urlopen(link)
        good_links.append(l)
    except urllib2.HTTPError, e:
        print e.code, link
    except urllib2.URLError, e:
        print e.args, link

wiki_links = good_links

#wiki_links = [['http://stats.wikimedia.org/EN/TablesWikipediaEO.htm', 'en', '0']]

out = open('wiki_contributors.txt', 'wb')
writer = csv.DictWriter(out, fieldnames = ['username', 'edits, articles, 30 dy', 'edits, other, 30 dy', 'creates, articles, 30 dy', 'creates, other, 30 dy', 'link', 'lang'], dialect='excel')
writer.writeheader()

errors = []

for l in wiki_links:
    lang_link = l[0]
    page = urllib2.urlopen(lang_link).read()
    soup = BeautifulSoup(page, "html.parser")
    user_table = soup.find('table', id="table2")

    try:
        rows = user_table.findAll('tr')[3:]
    except AttributeError, e:
        print l[1], e
        continue
    for r in rows:
        name = r.a.text
        name = name.encode('utf-8')
        link = r.a.get('href')
        link = link.encode('utf-8')
        user_data = r.findAll('td', { "class" : "rbg" })
        user_data = [x.text for x in user_data]
        try:
            writer.writerow({'username': name, 'edits, articles, 30 dy': user_data[0], 'edits, other, 30 dy': user_data[1], 'creates, articles, 30 dy': user_data[2], 'creates, other, 30 dy': user_data[3], 'link': link, 'lang': l[1]})
        except UnicodeEncodeError:
            errors.append([name, user_data[0], user_data[1], user_data[2], user_data[3], link, l[1]])
        except IndexError:
            print name, user_data, link

out.close()

#for e in errors:
#    print e[0]
#print '*******************************'
#for e in errors:
#    print e[1]
#print '*******************************'
#for e in errors:
#    print e[2]
#print '*******************************'
#for e in errors:
#    print e[3]
#print '*******************************'
#for e in errors:
#    print e[4]
#print '*******************************'
#for e in errors:
#    print e[5]
#print '*******************************'
#for e in errors:
#    print e[6]
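The link filter above simply lets urllib2.urlopen raise for URLs that do not resolve and keeps the rest. Under Python 3, where urllib2 is split into urllib.request and urllib.error, roughly the same filter could be written as a small helper; this is only a sketch, assuming the same [link, lang] pair layout used above.

# hypothetical Python 3 version of the link-validity filter
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

def keep_reachable(links):
    """Keep only the [link, lang] pairs whose URL opens without raising."""
    good = []
    for link, lang in links:
        try:
            urlopen(link)
            good.append([link, lang])
        except HTTPError as e:
            print(e.code, link)
        except URLError as e:
            print(e.args, link)
    return good

# usage: wiki_links = keep_reachable(wiki_links)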

wikipedia_contributors.py

+108
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 21:29:14 2015

@author: Tao Steel
"""

import urllib2, csv
from bs4 import BeautifulSoup
#from unicodedata import normalize

page = urllib2.urlopen("http://stats.wikimedia.org/EN/Sitemap.htm").read()
soup = BeautifulSoup(page)
wikies = soup.find('table', id="table2")
#print str(wikies)
wiki_langs = []
step = 0
for row in wikies.findAll('tr'):
    try:
        lang = row.findAll('td')[4].text
        wiki_langs.append(lang)
    except IndexError:
        print 'not a language row:', row

wiki_langs = wiki_langs[3:]

print wiki_langs, len(set(wiki_langs))

baselink = ['http://stats.wikimedia.org/EN/TablesWikipedia', '.htm']

wiki_links = []
for w in wiki_langs:
    if '-' in w:
        w = w.replace('-','_')
    link = baselink[0] + w.upper() + baselink[1]
    wiki_links.append([link, w])

print wiki_links

# check if links are valid (they still could be bad, though)
for l in wiki_links:
    link = l[0]
    try:
        urllib2.urlopen(link)
    except urllib2.HTTPError, e:
        print e.code, link
    except urllib2.URLError, e:
        print e.args, link

#wiki_links = [['http://stats.wikimedia.org/EN/TablesWikipediaEO.htm', 'en', '0']]

out = open('wiki_contributors.csv', 'wb')
writer = csv.DictWriter(out, fieldnames = ['username', 'edits, articles, 30 dy', 'edits, other, 30 dy', 'creates, articles, 30 dy', 'creates, other, 30 dy', 'link', 'lang'], dialect='excel')
writer.writeheader()

errors = []

for l in wiki_links:
    lang_link = l[0]
    page = urllib2.urlopen(lang_link).read()
    soup = BeautifulSoup(page, "html.parser")
    user_table = soup.find('table', id="table2")

    try:
        rows = user_table.findAll('tr')[3:]
    except AttributeError, e:
        print l[1], e
        continue
    for r in rows:
        name = r.a.text
        name = name.encode('utf-8')
        link = r.a.get('href')
        link = link.encode('utf-8')
        user_data = r.findAll('td', { "class" : "rbg" })
        user_data = [x.text for x in user_data]
        try:
            writer.writerow({'username': name, 'edits, articles, 30 dy': user_data[0], 'edits, other, 30 dy': user_data[1], 'creates, articles, 30 dy': user_data[2], 'creates, other, 30 dy': user_data[3], 'link': link, 'lang': l[1]})
        except UnicodeEncodeError:
            errors.append([name, user_data[0], user_data[1], user_data[2], user_data[3], link, l[1]])
        except IndexError:
            print name, user_data, link

out.close()

#for e in errors:
#    print e[0]
#print '*******************************'
#for e in errors:
#    print e[1]
#print '*******************************'
#for e in errors:
#    print e[2]
#print '*******************************'
#for e in errors:
#    print e[3]
#print '*******************************'
#for e in errors:
#    print e[4]
#print '*******************************'
#for e in errors:
#    print e[5]
#print '*******************************'
#for e in errors:
#    print e[6]
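Most of the byte-string handling here (the .encode('utf-8') calls and the UnicodeEncodeError branch) is needed because Python 2's csv module writes byte strings. In Python 3 the same rows can be written as text by opening the file with an explicit encoding, which removes the need for that branch. Below is a sketch of just the write step, under the assumption that the parsed values are collected as (name, user_data, link, lang) tuples.

# hypothetical Python 3 write step; column names are taken from the script above
import csv

FIELDNAMES = ['username', 'edits, articles, 30 dy', 'edits, other, 30 dy',
              'creates, articles, 30 dy', 'creates, other, 30 dy', 'link', 'lang']

def write_contributors(rows, path='wiki_contributors.csv'):
    # rows: (name, user_data, link, lang) tuples; user_data holds the four "rbg" cell texts
    with open(path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=FIELDNAMES, dialect='excel')
        writer.writeheader()
        for name, user_data, link, lang in rows:
            writer.writerow({'username': name,
                             'edits, articles, 30 dy': user_data[0],
                             'edits, other, 30 dy': user_data[1],
                             'creates, articles, 30 dy': user_data[2],
                             'creates, other, 30 dy': user_data[3],
                             'link': link, 'lang': lang})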
