Skip to content

Commit

Permalink
final evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
Long-Huei Chen committed Jun 11, 2017
1 parent 8df41be commit 0afdae6
Show file tree
Hide file tree
Showing 8 changed files with 624 additions and 146 deletions.
401 changes: 255 additions & 146 deletions Evaluate.ipynb

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions grammar/chunker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
def strip_non_ascii(string):
    """Return *string* with every character outside printable ASCII removed.

    Keeps only characters whose code point is strictly between 0 and 127,
    so NUL (0) and DEL (127) are dropped along with all non-ASCII text.
    """
    kept = [ch for ch in string if 0 < ord(ch) < 127]
    return "".join(kept)

import os


# Which dataset split to chunk.
mode = "dev"

# Split every speech transcription into 50-word chunks and write them all
# to one file: a header line "<source filename> <running chunk index>"
# followed by the chunk text.
src_dir = "../data/speech_transcriptions/" + mode + "/original"
count = 0
with open("joinedfiledev.txt", "w+") as h:
    for f in os.listdir(src_dir):
        with open(src_dir + "/" + f, "r") as g:
            doc = g.read()
        words = doc.split(" ")
        # // (not /) keeps the bound an int under Python 3; the original
        # float result of `/` would make range() raise a TypeError there.
        for i in range(0, len(words) // 50 + 1):
            chunk = " ".join(words[50 * i:50 * i + 50])
            h.write(f + " " + str(count) + "\n" + chunk + "\n")
            count += 1
110 changes: 110 additions & 0 deletions grammar/feature_count_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Build per-chunk grammar-tag count features for one dataset split, write
# normalized and unnormalized feature files, and (for the training split)
# aggregate raw counts per native language.
mode = "train"
# Width of each count vector: slot 0 = every tag except "26",
# slot 1 = tag "26" (the one tag singled out below).
maxtot = 2

# Map each tag id "1".."38" to its slot in the count vector; tag "26"
# gets its own slot, everything else shares slot 0.
d = {str(i): 0 for i in range(1, 39)}
d["26"] = 1

# Native-language labels -> class index (kept for parity with the other
# scripts in this commit; not consulted by the counting loop below unless
# the tag file contains these codes).
for idx, lang in enumerate(["ARA", "CHI", "FRE", "GER", "HIN", "ITA",
                            "JPN", "KOR", "SPA", "TEL", "TUR"]):
    d[lang] = idx

with open(mode + '.tagged_data.txt', "r") as f:
    lines = f.readlines()

unvectors = []   # raw counts per chunk, as strings
nvectors = []    # counts normalized by the chunk's tag total, as strings
totals = []      # tag total per chunk (0.0 for blank lines)
maxlen = 0       # largest tag total seen; scales the length feature
# Initialized up front so a blank FIRST line cannot raise NameError —
# the original crashed there because ncount was only assigned inside the
# non-blank branch.
ncount = [0.0] * maxtot
for line in lines:
    count = [0] * maxtot
    if line == "\n":
        unvectors.append([str(v) for v in count])
        # NOTE(review): this re-appends the PREVIOUS chunk's normalized
        # vector rather than zeros — behavior preserved from the original,
        # but it looks unintended; confirm with downstream consumers.
        nvectors.append([str(v) for v in ncount])
        totals.append(0.0)
        continue
    tags = [x.strip() for x in line.split(',')]
    for tag in tags:
        count[d[tag]] += 1
    total = len(tags)
    if total > maxlen:
        maxlen = total
    ncount = [float(v) / total for v in count]
    totals.append(float(total))
    unvectors.append([str(v) for v in count])
    nvectors.append([str(v) for v in ncount])

# Append a length feature to every normalized vector: the chunk's tag
# total scaled by the maximum total observed.
for i in range(len(totals)):
    nvectors[i].append(str(totals[i] / maxlen))

with open(mode + '_normalized.txt', "w+") as g:
    for nvector in nvectors:
        g.write(",".join(nvector) + "\n")

with open(mode + '_unnormalized.txt', "w+") as h:
    for vector in unvectors:
        h.write(",".join(vector) + "\n")

# For the training split, sum the raw count vectors per native language
# (labels_train.txt is assumed to have one label per chunk, in the same
# order as the tag file — TODO confirm against the label generator).
res = {}
if mode == "train":
    with open("labels_train.txt", "r") as labels_file:
        label_lines = labels_file.readlines()
    for i in range(len(label_lines)):
        label = label_lines[i].strip()
        if label in res:
            res[label] = [int(a) + int(b)
                          for (a, b) in zip(res[label], unvectors[i])]
        else:
            res[label] = unvectors[i]
    with open("counts_train.txt", "w+") as out:
        for k in res:
            out.write(k + "\n" + ",".join(str(v) for v in res[k]) + "\n")
44 changes: 44 additions & 0 deletions grammar/final_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
def strip_non_ascii(string):
    """Drop every character of *string* that is not printable ASCII.

    A character survives only if its ordinal lies strictly between 0 and
    127, so NUL and DEL are removed too.
    """
    return "".join(ch for ch in string if 0 < ord(ch) < 127)

from operator import add
import os

# Dataset split to assemble grammar features for.
mode = "dev"

# outputfiledev.txt alternates (key line, comma-separated count line);
# load it into a dict keyed by the raw key line, newline included.
od = {}
with open("outputfiledev.txt", "r") as o:
    olines = o.readlines()
i = 0
while i < len(olines):
    key_line = olines[i]
    vals = olines[i + 1].split(",")
    od[key_line] = [int(val) for val in vals]
    i += 2
print(len(od.keys()))

# "Feature Pair Dev.txt" maps a transcription filename to a chunk key;
# sum the per-chunk count vectors for each file.
hd = {}
with open("Feature Pair Dev.txt", "r") as h:
    hlines = h.readlines()
for line in hlines:
    pair = line.split(" ")
    p1 = pair[0]
    p2 = pair[1]
    if p1 in hd:
        # list(...) materializes the sum: under Python 3 a bare map()
        # is a one-shot iterator, so the original's `hd[p1] = map(...)`
        # would be silently exhausted the next time it was read.
        hd[p1] = list(map(add, hd[p1], od[p2]))
    else:
        hd[p1] = od[p2]

files = os.listdir("../data/speech_transcriptions/" + mode + "/original")
# One feature row per transcription file; files with no parsed chunks get
# a row of 37 zeros (one per dependency relation in the tree_gen label
# list — presumably; confirm if that list grows at runtime).
zero_row = ",".join(["0"] * 37) + "\n"
with open("dev_gram_features.txt", "w+") as last:
    for f in files:
        if f in hd:
            last.write(",".join(str(v) for v in hd[f]) + "\n")
        else:
            last.write(zero_row)
20 changes: 20 additions & 0 deletions grammar/mod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Filter temprac.txt: whenever a line starts a new chunk (its first token
# ends in ".txt"), write out the line that PRECEDED it.  Net effect:
# keep only the line immediately before each chunk header.
with open("temprac.txt", "r") as f:
    lines = f.readlines()

with open("tempdevf.txt", "w+") as h:
    print(len(lines))
    prev_line = ""
    for line in lines:
        words = line.split(" ")
        # A header line marks the start of the next chunk, so the line
        # before it closed the previous chunk.  (The original toggled a
        # `prev` flag here, but both branches performed the identical
        # write, so the flag — and the unused prev2/prev2Line — were
        # dead code and have been removed.)
        if words[0].endswith(".txt"):
            h.write(prev_line)
        prev_line = line
12 changes: 12 additions & 0 deletions grammar/tag_count.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Tally how often each of the 38 grammar-error tags occurs in the
# training tag file (one comma-separated list of tag ids per line; blank
# lines separate documents and are skipped).
with open('train.tagged_data.txt', "r") as f:
    lines = f.readlines()

count = [0] * 38
for line in lines:
    if line == "\n":
        continue
    # Tag ids are 1-based, so shift down to index the count list.
    for tag in (x.strip() for x in line.split(',')):
        count[int(tag) - 1] += 1
# print(...) works on both Python 2 and 3; the original `print count`
# statement is a syntax error under Python 3, while sibling scripts in
# this commit already use the print() call form.
print(count)
81 changes: 81 additions & 0 deletions grammar/tree_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
def strip_non_ascii(string):
    """Return a copy of *string* containing only printable ASCII characters.

    Characters with ordinals outside the open interval (0, 127) — i.e.
    NUL, DEL, and everything non-ASCII — are filtered out.
    """
    pieces = []
    for ch in string:
        if 0 < ord(ch) < 127:
            pieces.append(ch)
    return "".join(pieces)

import os
from nltk.parse import stanford

# Dataset split to parse.
mode = "train"

# Stanford parser configuration (paths are machine-specific).
os.environ['STANFORD_PARSER'] = "stanford-parser-full-2016-10-31/stanford-parser.jar"
os.environ['STANFORD_MODELS'] = "stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
os.environ['JAVA_HOME'] = "C:/Program Files/Java/jre1.8.0_102/bin/"
os.environ['STANFORD_CORENLP'] = "stanford-parser-full-2016-10-31/stanford-english-corenlp-2016-10-31-models.jar"
parser = stanford.StanfordDependencyParser("stanford-parser-full-2016-10-31/englishPCFG.ser.gz")

# Break every transcription into 50-word chunks.  dmap: chunk index ->
# document index; fmap: chunk index -> source filename.
docs = []
dmap = {}
fmap = {}
count = 0    # running chunk index
dcount = 0   # running document index
for f in os.listdir("../data/speech_transcriptions/" + mode + "/original"):
    with open("../data/speech_transcriptions/" + mode + "/original/" + f, "r") as g:
        doc = g.read()
    words = doc.split(" ")
    # // keeps the bound an int under Python 3 (the original's `/` would
    # hand range() a float there).
    for i in range(0, len(words) // 50 + 1):
        chunk = " ".join(words[50 * i:50 * i + 50])
        docs.append(strip_non_ascii(chunk))
        dmap[count] = dcount
        fmap[count] = f
        count += 1
    dcount += 1
    if dcount % 1000 == 999:
        # Progress indicator; print() replaces the Python 2-only
        # `print count` statement used elsewhere in this script.
        print(count)

# Parse all chunks; raw_parse_sents yields one parse iterator per input.
sentences = parser.raw_parse_sents(docs)
x = [y for y in sentences]
print(len(x))

# Known dependency relation labels; unseen labels are appended as they
# are encountered, so later count vectors may be longer than 37.
d = [u'nsubj', u'det', u'acl:relcl', u'dep', u'advmod', u'cc', u'conj', u'cop', u'compound', u'appos', u'nmod', u'case', u'dobj', u'mark', u'aux', u'amod', u'nmod:npmod', u'nummod', u'xcomp', u'discourse', u'advcl', u'nmod:poss', u'acl', u'nsubjpass', u'auxpass', u'ccomp', u'mwe', u'parataxis', u'neg', u'csubj', u'det:predet', u'expl', u'compound:prt', u'iobj', u'nmod:tmod', u'cc:preconj', u'csubjpass']
g = {}      # document index -> {relation label: occurrence count}
w = {}      # document index -> concatenated parsed words
nmap = {}   # document index -> source filename
dgs = []    # every chunk's parse list, kept for inspection
for i in range(len(x)):
    z = [a for a in x[i]]
    dgs.append(z)
    # Use only the first parse of each chunk.
    c = list(z[0].triples())
    words = [a['word'] for a in z[0].nodes.values()]
    # Keep only real word entries (the graph's root node carries None,
    # which fails the unicode type check below).
    fwords = []
    for word in words:
        if type(u'I') == type(word):
            fwords.append(word)
    finstring = " ".join(fwords)
    if dmap[i] in g:
        l = g[dmap[i]]
    else:
        g[dmap[i]] = {}
        w[dmap[i]] = ""
        nmap[dmap[i]] = fmap[i]
        l = g[dmap[i]]
    w[dmap[i]] += " " + finstring
    # Tally relation labels for this document, extending d with any label
    # not already known.
    for j in c:
        if j[1] not in d:
            d.append(j[1])
        if j[1] not in l:
            l[j[1]] = 1
        else:
            l[j[1]] += 1

# Emit, per document: its index, its parsed text, and a count vector
# aligned with the final ordering of d.
with open(mode + "_dep.txt", "w+") as h:
    k = sorted(g.keys())
    print(len(k))
    for key in k:
        out = []
        for val in d:
            if val in g[key]:
                out.append(str(g[key][val]))
            else:
                out.append("0")
        h.write(str(key) + "\n" + w[key] + "\n" + ",".join(out) + "\n")
81 changes: 81 additions & 0 deletions grammar/tree_gen_dev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
def strip_non_ascii(string):
    """Strip *string* down to its printable-ASCII characters only.

    Anything whose ordinal is not strictly between 0 and 127 (NUL, DEL,
    and all non-ASCII characters) is discarded.
    """
    return "".join(filter(lambda ch: 0 < ord(ch) < 127, string))

import os
from nltk.parse import stanford

# Dataset split to parse (dev-split twin of tree_gen.py).
mode = "dev"

# Stanford parser configuration (paths are machine-specific).
os.environ['STANFORD_PARSER'] = "stanford-parser-full-2016-10-31/stanford-parser.jar"
os.environ['STANFORD_MODELS'] = "stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
os.environ['JAVA_HOME'] = "C:/Program Files/Java/jre1.8.0_102/bin/"
os.environ['STANFORD_CORENLP'] = "stanford-parser-full-2016-10-31/stanford-english-corenlp-2016-10-31-models.jar"
parser = stanford.StanfordDependencyParser("stanford-parser-full-2016-10-31/englishPCFG.ser.gz")

# Break every transcription into 50-word chunks.  dmap: chunk index ->
# document index; fmap: chunk index -> source filename.
docs = []
dmap = {}
fmap = {}
count = 0    # running chunk index
dcount = 0   # running document index
for f in os.listdir("../data/speech_transcriptions/" + mode + "/original"):
    with open("../data/speech_transcriptions/" + mode + "/original/" + f, "r") as g:
        doc = g.read()
    words = doc.split(" ")
    # // keeps the bound an int under Python 3 (the original's `/` would
    # hand range() a float there).
    for i in range(0, len(words) // 50 + 1):
        chunk = " ".join(words[50 * i:50 * i + 50])
        docs.append(strip_non_ascii(chunk))
        dmap[count] = dcount
        fmap[count] = f
        count += 1
    dcount += 1
    if dcount % 1000 == 999:
        # Progress indicator; print() replaces the Python 2-only
        # `print count` statement used elsewhere in this script.
        print(count)

# Parse all chunks; raw_parse_sents yields one parse iterator per input.
sentences = parser.raw_parse_sents(docs)
x = [y for y in sentences]
print(len(x))

# Known dependency relation labels; unseen labels are appended as they
# are encountered, so later count vectors may be longer than 37.
d = [u'nsubj', u'det', u'acl:relcl', u'dep', u'advmod', u'cc', u'conj', u'cop', u'compound', u'appos', u'nmod', u'case', u'dobj', u'mark', u'aux', u'amod', u'nmod:npmod', u'nummod', u'xcomp', u'discourse', u'advcl', u'nmod:poss', u'acl', u'nsubjpass', u'auxpass', u'ccomp', u'mwe', u'parataxis', u'neg', u'csubj', u'det:predet', u'expl', u'compound:prt', u'iobj', u'nmod:tmod', u'cc:preconj', u'csubjpass']
g = {}      # document index -> {relation label: occurrence count}
w = {}      # document index -> concatenated parsed words
nmap = {}   # document index -> source filename
dgs = []    # every chunk's parse list, kept for inspection
for i in range(len(x)):
    z = [a for a in x[i]]
    dgs.append(z)
    # Use only the first parse of each chunk.
    c = list(z[0].triples())
    words = [a['word'] for a in z[0].nodes.values()]
    # Keep only real word entries (the graph's root node carries None,
    # which fails the unicode type check below).
    fwords = []
    for word in words:
        if type(u'I') == type(word):
            fwords.append(word)
    finstring = " ".join(fwords)
    if dmap[i] in g:
        l = g[dmap[i]]
    else:
        g[dmap[i]] = {}
        w[dmap[i]] = ""
        nmap[dmap[i]] = fmap[i]
        l = g[dmap[i]]
    w[dmap[i]] += " " + finstring
    # Tally relation labels for this document, extending d with any label
    # not already known.
    for j in c:
        if j[1] not in d:
            d.append(j[1])
        if j[1] not in l:
            l[j[1]] = 1
        else:
            l[j[1]] += 1

# Emit, per document: its index, its parsed text, and a count vector
# aligned with the final ordering of d.
with open(mode + "_dep.txt", "w+") as h:
    k = sorted(g.keys())
    print(len(k))
    for key in k:
        out = []
        for val in d:
            if val in g[key]:
                out.append(str(g[key][val]))
            else:
                out.append("0")
        h.write(str(key) + "\n" + w[key] + "\n" + ",".join(out) + "\n")

0 comments on commit 0afdae6

Please sign in to comment.