-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Long-Huei Chen
committed
Jun 11, 2017
1 parent
8df41be
commit 0afdae6
Showing
8 changed files
with
624 additions
and
146 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
def strip_non_ascii(string):
    """Return *string* with every non-ASCII character removed.

    Keeps only characters whose code point lies strictly between 0 and
    127, so NUL and DEL are dropped along with all non-ASCII text.
    """
    return ''.join([c for c in string if 0 < ord(c) < 127])
|
||
import os | ||
|
||
|
||
# Split every transcription of the chosen split into 50-word chunks and
# concatenate them into one file: a header line "<filename> <chunk id>"
# followed by the chunk text, for each chunk.
mode = "dev"

count = 0
with open("joinedfiledev.txt", "w+") as h:
    for f in os.listdir("../data/speech_transcriptions/"+ mode + "/original"):
        with open("../data/speech_transcriptions/" + mode + "/original/"+f, "r") as g:
            doc = g.read()
        words = doc.split(" ")
        # Floor division (`//`) keeps the chunk count an int under both
        # Python 2 and 3; the original `/` breaks range() on Python 3.
        for i in range(0, len(words) // 50 + 1):
            chunk = " ".join(words[50*i:50*i+50])
            h.write(f + " " + str(count) + "\n" + chunk + "\n")
            count += 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# Build per-response grammar-tag count vectors from <mode>.tagged_data.txt
# and write normalized / unnormalized feature files; for the training
# split, additionally aggregate the raw counts per gold label.
mode = "train"
maxtot = 2

# Tag-index lookup: numeric tags "1".."38" all collapse to slot 0 except
# tag "26", which maps to slot 1 (maxtot = 2 slots in total).
d = {str(i): 0 for i in range(1, 39)}
d["26"] = 1

# Language-code indices (only relevant if tags are language codes).
for idx, lang in enumerate(["ARA", "CHI", "FRE", "GER", "HIN", "ITA",
                            "JPN", "KOR", "SPA", "TEL", "TUR"]):
    d[lang] = idx

unvectors = []  # raw counts per line, as strings
nvectors = []   # counts normalized by the line's tag total, as strings
y = 0           # number of non-blank lines processed
maxlen = 0      # largest tag total seen (scales the extra length feature)
totals = []     # tag total per line (0.0 for blank lines)
# Guard: ncount must exist even if the very first line is blank; the
# original raised NameError in that case.
ncount = [0.0] * maxtot

with open(mode + '.tagged_data.txt', "r") as f:
    lines = f.readlines()

for line in lines:
    count = [0] * maxtot
    if line == "\n":
        unvectors.append([str(x) for x in count])
        # NOTE(review): a blank line reuses the PREVIOUS line's normalized
        # vector (ncount) — looks unintentional, but preserved as-is.
        nvectors.append([str(x) for x in ncount])
        totals.append(0.0)
        continue
    tags = [x.strip() for x in line.split(',')]
    for tag in tags:
        count[d[tag]] += 1
    total = len(tags)
    if total > maxlen:
        maxlen = total
    ncount = [float(x) / total for x in count]
    totals.append(float(total))
    unvectors.append([str(x) for x in count])
    nvectors.append([str(x) for x in ncount])
    y += 1

# Append the tag total, scaled by the maximum, as one more normalized feature.
for i in range(len(totals)):
    nvectors[i].append(str(totals[i] / maxlen))

with open(mode + '_normalized.txt', "w+") as g:
    for nvector in nvectors:
        g.write(",".join(nvector) + "\n")

with open(mode + '_unnormalized.txt', "w+") as h:
    for vector in unvectors:
        h.write(",".join(vector) + "\n")

# For the training split, sum the raw count vectors per gold label.
res = {}
if mode == "train":
    with open("labels_train.txt", "r") as x:
        label_lines = x.readlines()
    for i in range(len(label_lines)):
        label = label_lines[i].strip()
        if label in res:
            res[label] = [int(a) + int(b)
                          for (a, b) in zip(res[label], unvectors[i])]
        else:
            res[label] = unvectors[i]
    # The original rebound the counter `y` as this file handle; a
    # distinct name avoids the shadowing.
    with open("counts_train.txt", "w+") as outfile:
        for k in res:
            outfile.write(k + "\n" + ",".join(str(v) for v in res[k]) + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
def strip_non_ascii(string):
    """Drop every character outside the ASCII range (code points 1-126)."""
    kept = []
    for ch in string:
        if 0 < ord(ch) < 127:
            kept.append(ch)
    return ''.join(kept)
|
||
from operator import add | ||
import os | ||
# Join chunk-level feature vectors back onto transcription files:
# "outputfiledev.txt" holds (id line, comma-separated vector) pairs and
# "Feature Pair Dev.txt" maps file names to chunk ids.  Vectors sharing
# a file are summed element-wise; files with no chunks get a zero vector.
mode = "dev"

files = os.listdir("../data/speech_transcriptions/" + mode + "/original")

# od: raw id line (trailing newline included) -> integer feature vector.
od = {}
with open("outputfiledev.txt", "r") as o:
    lines = o.readlines()
# Records come in pairs of lines: identifier, then the vector.
i = 0
while i < len(lines):
    line1 = lines[i]
    line2 = lines[i + 1]
    vals = line2.split(",")
    od[line1] = [int(val) for val in vals]
    i += 2
print(len(od.keys()))

# hd: file name -> element-wise sum of all of its chunks' vectors.
hd = {}
with open("Feature Pair Dev.txt", "r") as h:
    hlines = h.readlines()
for line in hlines:
    pair = line.split(" ")
    p1 = pair[0]
    p2 = pair[1]
    if p1 in hd:
        # zip-comprehension gives a concrete list on both Python 2 and 3;
        # the original map(add, ...) is a one-shot lazy iterator on Python 3.
        hd[p1] = [a + b for a, b in zip(hd[p1], od[p2])]
    else:
        hd[p1] = od[p2]

with open("dev_gram_features.txt", "w+") as last:
    for f in files:
        if f in hd:
            n = [str(val) for val in hd[f]]
            last.write(",".join(n) + "\n")
        else:
            last.write("0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Copy into tempdevf.txt every line of temprac.txt that immediately
# precedes a "<name>.txt ..." header line.  (As in the original, the
# final trailing line is never flushed, and the first header writes the
# initial empty prevLine, which is a no-op.)
with open("temprac.txt", "r") as f, open("tempdevf.txt", "w+") as h:
    lines = f.readlines()
    prevLine = ""
    print(len(lines))
    for line in lines:
        words = line.split(" ")
        # The original toggled a `prev` flag here, but both branches
        # performed the same write — the toggle (and the unused prev2 /
        # prev2Line variables) were dead code and are removed.
        if words[0].endswith(".txt"):
            h.write(prevLine)
        prevLine = line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Tally how often each of the 38 numeric grammar tags appears in the
# training tag file and print the resulting histogram.
count = [0] * 38
with open('train.tagged_data.txt', "r") as f:
    for line in f:
        if line == "\n":
            continue
        tags = [x.strip() for x in line.split(',')]
        for tag in tags:
            # Tags are 1-based ("1".."38"); shift to 0-based slots.
            count[int(tag) - 1] += 1
# Call form works on both Python 2 and 3; the original `print count`
# statement is a syntax error on Python 3.
print(count)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
def strip_non_ascii(string):
    """Return the input with all non-ASCII characters filtered out."""
    return ''.join(filter(lambda ch: 0 < ord(ch) < 127, string))
|
||
import os | ||
from nltk.parse import stanford | ||
mode = "train"

# Stanford dependency parser configuration (jar paths relative to this
# script; JAVA_HOME is a hard-coded Windows JRE location).
os.environ['STANFORD_PARSER'] = "stanford-parser-full-2016-10-31/stanford-parser.jar"
os.environ['STANFORD_MODELS'] = "stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
os.environ['JAVA_HOME'] = "C:/Program Files/Java/jre1.8.0_102/bin/"
os.environ['STANFORD_CORENLP'] = "stanford-parser-full-2016-10-31/stanford-english-corenlp-2016-10-31-models.jar"
parser = stanford.StanfordDependencyParser("stanford-parser-full-2016-10-31/englishPCFG.ser.gz")

# Chop every transcription into 50-word chunks.  For chunk index
# `count`, dmap[count] is the source-document index and fmap[count] the
# original file name.
docs = []
dmap = {}
fmap = {}
count = 0
dcount = 0
for f in os.listdir("../data/speech_transcriptions/"+ mode + "/original"):
    with open("../data/speech_transcriptions/" + mode + "/original/"+f, "r") as g:
        doc = g.read()
    words = doc.split(" ")
    # `//` keeps the chunk count an int on Python 3 as well; the
    # original `/` relied on Python 2 integer division.
    for i in range(0, len(words) // 50 + 1):
        chunk = " ".join(words[50*i:50*i+50])
        docs.append(strip_non_ascii(chunk))
        dmap[count] = dcount
        fmap[count] = f
        count += 1
    dcount += 1
    if dcount % 1000 == 999:
        print(count)  # progress indicator; print() form is Py2/3 compatible

# Parse all chunks at once; materialize the generator so it is indexable.
sentences = parser.raw_parse_sents(docs)
x = [y for y in sentences]
print(len(x))

# Known dependency-relation labels; any unseen label encountered while
# tallying is appended so the feature ordering stays consistent.
d = [u'nsubj', u'det', u'acl:relcl', u'dep', u'advmod', u'cc', u'conj', u'cop', u'compound', u'appos', u'nmod', u'case', u'dobj', u'mark', u'aux', u'amod', u'nmod:npmod', u'nummod', u'xcomp', u'discourse', u'advcl', u'nmod:poss', u'acl', u'nsubjpass', u'auxpass', u'ccomp', u'mwe', u'parataxis', u'neg', u'csubj', u'det:predet', u'expl', u'compound:prt', u'iobj', u'nmod:tmod', u'cc:preconj', u'csubjpass']

g = {}     # document index -> {relation label: occurrence count}
w = {}     # document index -> concatenated (ASCII) words of its chunks
nmap = {}  # document index -> original file name
dgs = []   # raw parse lists (kept, as in the original)
for i in range(len(x)):
    z = [a for a in x[i]]
    dgs.append(z)
    c = list(z[0].triples())
    words = [a['word'] for a in z[0].nodes.values()]
    # Keep only unicode-typed words; this drops the artificial root
    # node, whose 'word' entry is None.
    fwords = []
    for word in words:
        if type(u'I') == type(word):
            fwords.append(word)
    finstring = " ".join(fwords)
    # First chunk of a document initializes its accumulators (the
    # original duplicated the lookups in both branches and bound an
    # unused `q`; both removed).
    if dmap[i] not in g:
        g[dmap[i]] = {}
        w[dmap[i]] = ""
        nmap[dmap[i]] = fmap[i]
    l = g[dmap[i]]
    w[dmap[i]] += " " + finstring
    for j in c:
        # j is a (head, relation, dependent) triple; tally the relation.
        if j[1] not in d:
            d.append(j[1])
        if j[1] not in l:
            l[j[1]] = 1
        else:
            l[j[1]] += 1

# Emit one record per document: key, reconstructed text, and the
# relation-count vector in the order of `d`.
with open(mode + "_dep.txt", "w+") as h:
    k = sorted(g.keys())
    print(len(k))
    for key in k:
        out = []
        for val in d:
            if val in g[key]:
                out.append(str(g[key][val]))
            else:
                out.append("0")
        h.write(str(key) + "\n" + w[key] + "\n" + ",".join(out) + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
def strip_non_ascii(string):
    """Return *string* restricted to code points 1..126 (ASCII sans NUL/DEL)."""
    keep = lambda ch: 0 < ord(ch) < 127
    return ''.join(ch for ch in string if keep(ch))
|
||
import os | ||
from nltk.parse import stanford | ||
mode = "dev"

# Stanford dependency parser configuration (jar paths relative to this
# script; JAVA_HOME is a hard-coded Windows JRE location).
os.environ['STANFORD_PARSER'] = "stanford-parser-full-2016-10-31/stanford-parser.jar"
os.environ['STANFORD_MODELS'] = "stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
os.environ['JAVA_HOME'] = "C:/Program Files/Java/jre1.8.0_102/bin/"
os.environ['STANFORD_CORENLP'] = "stanford-parser-full-2016-10-31/stanford-english-corenlp-2016-10-31-models.jar"
parser = stanford.StanfordDependencyParser("stanford-parser-full-2016-10-31/englishPCFG.ser.gz")

# Chop every transcription into 50-word chunks.  For chunk index
# `count`, dmap[count] is the source-document index and fmap[count] the
# original file name.
docs = []
dmap = {}
fmap = {}
count = 0
dcount = 0
for f in os.listdir("../data/speech_transcriptions/"+ mode + "/original"):
    with open("../data/speech_transcriptions/" + mode + "/original/"+f, "r") as g:
        doc = g.read()
    words = doc.split(" ")
    # `//` keeps the chunk count an int on Python 3 as well; the
    # original `/` relied on Python 2 integer division.
    for i in range(0, len(words) // 50 + 1):
        chunk = " ".join(words[50*i:50*i+50])
        docs.append(strip_non_ascii(chunk))
        dmap[count] = dcount
        fmap[count] = f
        count += 1
    dcount += 1
    if dcount % 1000 == 999:
        print(count)  # progress indicator; print() form is Py2/3 compatible

# Parse all chunks at once; materialize the generator so it is indexable.
sentences = parser.raw_parse_sents(docs)
x = [y for y in sentences]
print(len(x))

# Known dependency-relation labels; any unseen label encountered while
# tallying is appended so the feature ordering stays consistent.
d = [u'nsubj', u'det', u'acl:relcl', u'dep', u'advmod', u'cc', u'conj', u'cop', u'compound', u'appos', u'nmod', u'case', u'dobj', u'mark', u'aux', u'amod', u'nmod:npmod', u'nummod', u'xcomp', u'discourse', u'advcl', u'nmod:poss', u'acl', u'nsubjpass', u'auxpass', u'ccomp', u'mwe', u'parataxis', u'neg', u'csubj', u'det:predet', u'expl', u'compound:prt', u'iobj', u'nmod:tmod', u'cc:preconj', u'csubjpass']

g = {}     # document index -> {relation label: occurrence count}
w = {}     # document index -> concatenated (ASCII) words of its chunks
nmap = {}  # document index -> original file name
dgs = []   # raw parse lists (kept, as in the original)
for i in range(len(x)):
    z = [a for a in x[i]]
    dgs.append(z)
    c = list(z[0].triples())
    words = [a['word'] for a in z[0].nodes.values()]
    # Keep only unicode-typed words; this drops the artificial root
    # node, whose 'word' entry is None.
    fwords = []
    for word in words:
        if type(u'I') == type(word):
            fwords.append(word)
    finstring = " ".join(fwords)
    # First chunk of a document initializes its accumulators (the
    # original duplicated the lookups in both branches and bound an
    # unused `q`; both removed).
    if dmap[i] not in g:
        g[dmap[i]] = {}
        w[dmap[i]] = ""
        nmap[dmap[i]] = fmap[i]
    l = g[dmap[i]]
    w[dmap[i]] += " " + finstring
    for j in c:
        # j is a (head, relation, dependent) triple; tally the relation.
        if j[1] not in d:
            d.append(j[1])
        if j[1] not in l:
            l[j[1]] = 1
        else:
            l[j[1]] += 1

# Emit one record per document: key, reconstructed text, and the
# relation-count vector in the order of `d`.
with open(mode + "_dep.txt", "w+") as h:
    k = sorted(g.keys())
    print(len(k))
    for key in k:
        out = []
        for val in d:
            if val in g[key]:
                out.append(str(g[key][val]))
            else:
                out.append("0")
        h.write(str(key) + "\n" + w[key] + "\n" + ",".join(out) + "\n")