-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfreebase2wikipedia.py
97 lines (86 loc) · 2.93 KB
/
freebase2wikipedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Author: Pedro Saleiro ([email protected])
"""
import sys, os, gzip
from datetime import datetime
from parser import WikipediaParser
def _timestamp():
    """Return the current local time as 'dd/mm - HH:MM:SS' for log lines."""
    return datetime.today().strftime("%d/%m - %H:%M:%S")


def ReadFreebase(handler, filename, auxfile, block_len, languages):
    """Read a dump in fixed-size blocks and hand its lines to a WikipediaParser.

    The input is consumed with ``handler.read(block_len)``; each block is
    split on ``'\\n'`` and the complete lines are passed to
    ``parser.find_and_write``. The trailing partial line of a block is
    carried over and prepended to the next block. Progress is printed after
    the first block and then each time another 10,000,000 lines are passed.

    Args:
        handler: Open file-like object whose ``read(n)`` returns text and an
            empty value at end of input.
        filename: Forwarded to ``WikipediaParser`` as its first argument.
        auxfile: Forwarded to ``WikipediaParser`` as its third argument
            (presumably the path the parser writes to -- confirm in parser.py).
        block_len: Number of characters requested per ``read`` call.
        languages: Forwarded to ``WikipediaParser`` as its second argument.

    Returns:
        None. ``parser.handler`` is closed before returning, also on error.
    """
    report_every = 10000000
    parser = WikipediaParser(filename, languages, auxfile)
    line_num = 0
    next_mark = 0  # index of the next report_every-sized bucket to announce
    leftover = ''  # partial last line of the previous block
    print("%s : Reader is starting" % _timestamp())
    try:
        while True:
            buff = handler.read(block_len)
            if buff:
                lines = (leftover + buff).split('\n')
                # The last piece has no terminating newline yet; keep it
                # until the next block (or end of input) completes it.
                leftover = lines.pop()
            else:
                # End of input: a final line without a trailing newline is
                # still a line and must not be dropped.
                lines = [leftover] if leftover else []
                leftover = ''
            parser.find_and_write(lines)
            line_num += len(lines)
            # Floor division keeps this correct under Python 3 as well, and
            # '>=' keeps reporting alive even if a bucket is skipped.
            bucket = line_num // report_every
            if bucket >= next_mark:
                print("%s : Line %d" % (_timestamp(), line_num))
                next_mark = bucket + 1
            if not buff:
                break
    finally:
        parser.handler.close()
    print("%s : Reader is done : %d lines" % (_timestamp(), line_num))
def CreatingTSV(auxfile):
    """Group the attribute lines of a gzip file by mid and flush them as TSV.

    Each line of ``auxfile`` is expected to hold at least three tab-separated
    fields: ``mid``, ``attribute``, ``value``. Consecutive lines sharing the
    same mid form one group. A group is kept only when it has more than one
    attribute; its key is the mid with '.' replaced by '/' and a leading '/'
    added. The attribute ``en_wikipedia_link`` is stored as ``curid`` when the
    value starts with 'index.html' (with the 'index.html?curid=' part
    removed) and as ``en_wikipedia`` otherwise. Every other attribute is
    stored under its own name. The result is passed to ``Flush``.

    Args:
        auxfile: Path of the gzip-compressed, tab-separated input file.

    Returns:
        None.
    """
    print("freebase to wikipedia")
    maps = {}

    def store(group_mid, group):
        # Groups with a single attribute (or none) are not exported.
        if len(group) > 1:
            maps['/' + group_mid.replace('.', '/')] = group

    mid = ''
    dic = {}
    with gzip.open(auxfile, 'rt') as dump:
        for count, line in enumerate(dump, 1):
            if count % 1000000 == 0:
                print(count)
            attr = line.strip('\n').split('\t')
            if len(attr) < 3:
                # Blank or truncated line: nothing usable, skip it instead
                # of failing with IndexError.
                continue
            if attr[0] != mid:
                # A new mid starts: commit the group collected so far.
                store(mid, dic)
                dic = {}
                mid = attr[0]
            if attr[1] == 'en_wikipedia_link':
                if attr[2].startswith('index.html'):
                    dic['curid'] = attr[2].replace('index.html?curid=', '')
                else:
                    dic['en_wikipedia'] = attr[2]
            else:
                dic[attr[1]] = attr[2]
    # The last group has no following mid to trigger its commit.
    store(mid, dic)
    Flush(auxfile, maps)
    return None
def Flush(auxfile, maps, out_path='mid2wikipedia.tsv'):
    """Write the mid -> attributes mapping as a tab-separated file.

    The file starts with the header ``mid, en_name, en_wikipedia, curid``
    followed by one row per key of ``maps``. An attribute missing from an
    entry is written as '-'.

    Args:
        auxfile: Unused; kept so existing callers keep working.
        maps: Dict mapping a mid string to a dict of string attributes.
        out_path: Destination file; defaults to 'mid2wikipedia.tsv' in the
            current working directory, as before.

    Returns:
        None.
    """
    # Single %-formatted argument so the line prints identically under
    # Python 2 and Python 3 (including the original double space).
    print('Flushing tsv file of length  %d' % len(maps))
    columns = ('en_name', 'en_wikipedia', 'curid')
    with open(out_path, 'w') as out:
        out.write('mid\ten_name\ten_wikipedia\tcurid\n')
        for key in maps:
            attrs = maps[key]
            row = [key] + [attrs.get(column, '-') for column in columns]
            out.write('\t'.join(row) + '\n')
if __name__ == '__main__':
    # Usage: freebase2wikipedia.py <freebase_dump.gz>
    # Step 1 streams the dump through ReadFreebase with 'freebase_wikipedia_dump.gz'
    # as the auxiliary file; step 2 turns that auxiliary file into a TSV.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <freebase_dump.gz>' % sys.argv[0])
    freebase_path = sys.argv[1]
    auxfile = 'freebase_wikipedia_dump.gz'
    languages = ['en']
    block_len = 1024 * 1024  # characters requested per read call
    # The context manager closes the dump even if reading fails.
    with gzip.open(freebase_path, 'rt') as dump:
        # The original passed the undefined name 'aux_file' here (NameError).
        ReadFreebase(dump, dump.name, auxfile, block_len, languages)
    CreatingTSV(auxfile)