Skip to content

Commit

Permalink
added unicode filter hack
Browse files Browse the repository at this point in the history
  • Loading branch information
madmaze committed May 10, 2012
1 parent 485afed commit 0c07000
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
21 changes: 16 additions & 5 deletions data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self,inputDir,langs,processed):
self.langDirs=langs
self.procDir=processed
self.enVecs = fVectors.fVectors("en")
self.esVecs = fVectors.fVectors("es")
self.esVecs = fVectors.fVectors("de")

def runMCCA(self):
print "in MCCA..."
Expand Down Expand Up @@ -149,7 +149,16 @@ def cleanText(self,inText):
# strip out whatever we need here
p = re.compile('[,]')
outText = p.sub('',inText)
return outText
out=""
print "Sanitizing input data..."
for w in outText.split(" "):
containsUnknown=0
for l in w:
if ord(l) > 128:
containsUnknown=1
if containsUnknown == 0:
out += " "+w
return out

def saveProcessed(self,data,fname):
fout = open(fname+".processed", "w")
Expand All @@ -163,9 +172,10 @@ def processFile(self,fname):
f = open(fname, "r")
lines=""
for line in f.readlines():
#print line.strip()
lines+=line.strip()+" "

lines = self.cleanText(lines)
lines = unicode(self.cleanText(lines))
tmp=[]

wordpunct_tokenize(lines)
Expand All @@ -183,11 +193,12 @@ def processFile(self,fname):
tmp = lines
lines = tmp[:]
tmp = []


print "stemming... "
stemmer = SnowballStemmer("english")
for s in lines:
for w in s:
print w
#print w
tmp.append(stemmer.stem(w))
# print tmp
return tmp[:]
2 changes: 1 addition & 1 deletion mcca.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
optparser = optparse.OptionParser()
optparser.add_option("-i", "--dataDir", dest="datadir", default="DATA", type="string", help="Dir for lang input files (default='DATA')")
optparser.add_option("-o", "--procDir", dest="procDir", default="PROCESSED", type="string", help="Dir for storing Processing files (default='PROCESSED')")
optparser.add_option("-l", "--langs", dest="langs", default="en,es", type="string", help="Languages (default='en,es')")
# Keep the help text in sync with the actual default value ("en,de").
optparser.add_option("-l", "--langs", dest="langs", default="en,de", type="string", help="Languages (default='en,de')")
optparser.add_option("-p", "--preprocess", action="store_true", dest="preproc_flag", default=False, help="Run preprocessing")
optparser.add_option("-r", "--run", action="store_true", dest="run_flag", default=False, help="Run MCCA")
optparser.add_option("-g", "--genVectors", action="store_true", dest="genV_flag", default=False, help="Generate Vectors")
Expand Down

0 comments on commit 0c07000

Please sign in to comment.