-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdsplit.py
67 lines (49 loc) · 1.61 KB
/
dsplit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import os
import bz2
import sys
# In[6]:
def split_xml(filename, pages_per_chunk, num_chunks_to_process=False):
    '''Split a bz2-compressed XML dump into smaller bz2-compressed chunks.

    The source is streamed line by line; a line containing ``</page>``
    marks the end of one page. Chunks are written to the directory
    ``chunksof1lpages`` (created if missing, relative to the current
    working directory) and named ``<stem>-<n>.xml.bz2``, where ``<stem>``
    is the source file name without its ``.xml.bz2`` suffix and ``<n>``
    counts up from 1.

    Args:
        filename: Path to the source ``.xml.bz2`` file (for example a
            Wiktionary dump).
        pages_per_chunk: Number of pages written to a chunk before the
            next chunk is started.
        num_chunks_to_process: If truthy, stop after this many chunks have
            been completed. The default (``False``) processes the whole
            source.

    Returns:
        None. If the source cannot be opened, a message is printed and the
        function returns without creating anything.

    Note:
        Lines that follow the last completed page (such as a closing root
        tag) are written to a final chunk of their own when the page count
        is an exact multiple of ``pages_per_chunk``.
    '''
    try:
        bzfile = bz2.BZ2File(filename)
    except OSError:
        print('Couldnt open source file')
        return

    out_dir = "chunksof1lpages"
    suffix = ".xml.bz2"
    # Derive the chunk stem from the file name itself so that the result
    # does not depend on how many directories the path contains.
    stem = os.path.basename(filename)
    if stem.endswith(suffix):
        stem = stem[:-len(suffix)]

    def chunkname(index):
        return os.path.join(out_dir, stem + '-' + str(index) + suffix)

    pagecount = 0
    filecount = 1
    chunkfile = None
    try:
        os.makedirs(out_dir, exist_ok=True)
        chunkfile = bz2.BZ2File(chunkname(filecount), 'w')
        for line in bzfile:
            chunkfile.write(line)
            # Only a closing page tag can complete a chunk.
            if b'</page>' not in line:
                continue
            pagecount += 1
            if pagecount < pages_per_chunk:
                continue
            chunkfile.close()
            if num_chunks_to_process and filecount >= num_chunks_to_process:
                return
            pagecount = 0
            filecount += 1
            chunkfile = bz2.BZ2File(chunkname(filecount), 'w')
    finally:
        # BZ2File.close() may be called more than once, so this is safe
        # even when the current chunk was already closed above.
        if chunkfile is not None:
            chunkfile.close()
        bzfile.close()
# In[11]:
if __name__ == '__main__':
    # The only command-line argument is the path to the source .xml.bz2
    # file; exit with a usage message instead of an IndexError traceback
    # when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <source.xml.bz2>".format(sys.argv[0]))
    # Split the source into chunks of 100000 pages each.
    split_xml(sys.argv[1], 100000)
# In[ ]: