-
Notifications
You must be signed in to change notification settings - Fork 0
/
Processtext.py
97 lines (85 loc) · 2.95 KB
/
Processtext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import re
"""
Processes the text converted form of each pdf, converting each of them into a list, so that the classification task becomes easy
"""
portionPath = "Papers/Text/aieee-syllabus-2012.pdf.txt"
def WriteDictToFile(fileDict):
for fileName,content in fileDict.iteritems():
file = open(fileName, "w")
if type(content)==list:
file.write("\n".join(content))
elif type(content)==None:
pass
elif type(content)==dict:
file.write(str(content))
print len(content)
else:
file.write(content)
file.close()
#In the PDf read, there is no space between unit name and first topic.
#Replace the same using this function
def SeparateWhiteSpace(str,toReplace=None):
noOfCharToReplace=1
if not toReplace:
toReplace=re.findall("[A-Z][a-z]+[A-Z]",str)
else: #Physics paper
toReplace=re.findall("[A-Z][\s]*[A-Z]+[a-z]",str)
noOfCharToReplace = 2
#toReplace= [text[:len(text)] for text in toReplace ]
#print toReplace
for text in toReplace:
#Replacement for the word without space
replacedText = text[:len(text)-noOfCharToReplace]+":::"+text[len(text)-noOfCharToReplace]
#print replacedText
str = re.sub(text,replacedText,str)
#print str
return str
#Returns portions for each subject as a list, split based on units
def getPortionDictList(portionDict):
splitPortDict={}
for subject,portion in portionDict.iteritems():
port = re.split("UNIT[\s]+[0-9]+:",portion)
if port:
splitPortDict[subject+"_split"]=port
#Just for testing, To be removed
WriteDictToFile(splitPortDict)
return splitPortDict
#Does basic removal of whitespaces and separation of portion for each subject
def findPortion():
string = open(portionPath,"r").read()
string = re.sub(r'\([^\)]+\)',' ',string)
unformattedList = re.split("MATHEMATICS",string)
#Remove unwanted stuff before Math
unformattedList = re.split("PHYSICS",unformattedList[1])
#First part of remaining is math
math = unformattedList[0]
unformattedList = re.split("CHEMISTRY",unformattedList[1])
physics = unformattedList[0]
chemistry = unformattedList[1].split("SYLLABUS")[0]
portions={}
portions["math"]=SeparateWhiteSpace(math)
portions["physics"]=SeparateWhiteSpace(physics,"replace")#To denote replacement has to be done
portions["chemistry"]=SeparateWhiteSpace(chemistry)
#Just for testing, To be removed
WriteDictToFile(portions)
return portions
if __name__=="__main__":
finalPortions={}
portionDict = findPortion()
portionDict = getPortionDictList(portionDict)
for subject,portionList in portionDict.iteritems():
unit={}
for unitPortions in portionList:
if not unitPortions:
continue
tempList = unitPortions.split(":::")
unitName = tempList[0]
if len(tempList)<2:#TEsting. Won't work for physics
continue
topicList = re.sub('[\.;]',',',tempList[1])
splitTopic = topicList.split(',') #The topics in the unit are split
for topic in splitTopic:
if topic:
unit[topic] = unitName
finalPortions[subject+"_test"] = unit
WriteDictToFile(finalPortions)