# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 21:29:14 2015

@author: Tao Steel
"""

import urllib2, csv
from bs4 import BeautifulSoup
#from unicodedata import normalize

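# Scrapes the per-language contributor tables from stats.wikimedia.org and
# writes one CSV row per listed editor. Python 2 script (urllib2, old-style
# `except X, e` syntax); requires BeautifulSoup 4.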

# language codes of the Wikipedias to scrape
wiki_langs = [u'aa', u'ab', u'ace', u'ak', u'als', u'an', u'arc', u'arz', u'as',
              u'av', u'ba', u'bar', u'bat-smg', u'bcl', u'be-x-old', u'bh', u'bi',
              u'bjn', u'bm', u'bo', u'bpy', u'bug', u'bxr', u'cbk-zam', u'cdo',
              u'ce', u'ceb', u'ch', u'cho', u'chy', u'ckb', u'cr', u'crh', u'cu',
              u'cv', u'diq', u'dsb', u'dz', u'ee', u'eml', u'ext', u'ff', u'fiu-vro',
              u'frp', u'frr', u'fur', u'gag', u'gan', u'glk', u'got', u'hak',
              u'haw', u'hif', u'ho', u'hsb', u'ht', u'hz', u'ia', u'ie', u'ig',
              u'ii', u'ik', u'ilo', u'iu', u'kaa', u'kab', u'kbd', u'kg', u'ki',
              u'kj', u'koi', u'kr', u'krc', u'ksh', u'kv', u'kw', u'lad', u'lbe',
              u'lez', u'lg', u'lij', u'lmo', u'ltg', u'mai', u'map-bms', u'mdf',
              u'mh', u'mhr', u'min', u'mo', u'mr', u'mrj', u'mt', u'mus', u'mwl',
              u'myv', u'mzn', u'nap', u'nds-nl', u'ne', u'new', u'ng', u'nov',
              u'nrm', u'nso', u'nv', u'ny', u'om', u'os', u'pa', u'pag', u'pam',
              u'pap', u'pcd', u'pdc', u'pfl', u'pi', u'pih', u'pms', u'pnt',
              u'qu', u'rm', u'rmy', u'rn', u'roa-tara', u'rue', u'rw', u'sah',
              u'sc', u'sco', u'sd', u'se', u'si', u'sn', u'srn', u'ss', u'st',
              u'stq', u'szl', u'tet', u'to', u'ts', u'tum', u'tw', u'ty', u'tyv',
              u'udm', u've', u'vec', u'vep', u'vls', u'war', u'wuu', u'xal', u'xh',
              u'xmf', u'yo', u'za', u'zea', u'zh-classical', u'zh-yue', u'zu']


# URL template: prefix + upper-cased language code + '.htm'
baselink = ['http://stats.wikimedia.org/EN/TablesWikipedia', '.htm']

wiki_links = []
for w in wiki_langs:
    # the stats site uses underscores, not hyphens, in its page names
    if '-' in w:
        w = w.replace('-', '_')
    link = baselink[0] + w.upper() + baselink[1]
    wiki_links.append([link, w])

print wiki_links
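
# e.g. the entry u'bat-smg' yields
# ['http://stats.wikimedia.org/EN/TablesWikipediaBAT_SMG.htm', 'bat_smg']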

good_links = []
# check if links are valid (they still could be bad, though)
for l in wiki_links:
    link = l[0]
    try:
        urllib2.urlopen(link)
        good_links.append(l)
    except urllib2.HTTPError, e:
        print e.code, link
    except urllib2.URLError, e:
        print e.args, link

wiki_links = good_links
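
# Note: by default urlopen() can block for a long time on an unresponsive
# host; urllib2 accepts a timeout argument (Python >= 2.6), e.g.
# urllib2.urlopen(link, timeout=10), which would bound each check.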

#wiki_links = [['http://stats.wikimedia.org/EN/TablesWikipediaEO.htm', 'en', '0']]

out = open('wiki_contributors.txt', 'wb')
writer = csv.DictWriter(out,
                        fieldnames=['username', 'edits, articles, 30 dy',
                                    'edits, other, 30 dy',
                                    'creates, articles, 30 dy',
                                    'creates, other, 30 dy', 'link', 'lang'],
                        dialect='excel')
writer.writeheader()
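
# One row per listed editor: username, four 30-day activity counts taken
# from the stats table (edits vs. page creations, articles vs. other
# namespaces, judging by the column names), the user-page link, and the
# language code.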

errors = []

for l in wiki_links:
    lang_link = l[0]
    page = urllib2.urlopen(lang_link).read()
    soup = BeautifulSoup(page, "html.parser")
    user_table = soup.find('table', id="table2")
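    # Assumption: on these stats pages the element with id "table2" is the
    # per-editor table; if a page lacks it, find() returns None and the
    # AttributeError branch below skips that language.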

    try:
        # the first three rows are header rows
        rows = user_table.findAll('tr')[3:]
    except AttributeError, e:
        # user_table is None: no table with id "table2" on this page
        print l[1], e
        continue
    for r in rows:
        name = r.a.text
        name = name.encode('utf-8')
        link = r.a.get('href')
        link = link.encode('utf-8')
        user_data = r.findAll('td', {"class": "rbg"})
        user_data = [x.text for x in user_data]
        try:
            writer.writerow({'username': name,
                             'edits, articles, 30 dy': user_data[0],
                             'edits, other, 30 dy': user_data[1],
                             'creates, articles, 30 dy': user_data[2],
                             'creates, other, 30 dy': user_data[3],
                             'link': link,
                             'lang': l[1]})
        except UnicodeEncodeError:
            # a field refused to encode; stash the row for later inspection
            errors.append([name, user_data[0], user_data[1], user_data[2],
                           user_data[3], link, l[1]])
        except IndexError:
            # row had fewer than four "rbg" cells
            print name, user_data, link

out.close()
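
# Names and links were utf-8-encoded before writing, so read the CSV back
# with a utf-8-aware reader.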

# dump the rows that failed to write, one column at a time:
#for i in range(7):
#    for e in errors:
#        print e[i]
#    print '*******************************'