-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Long-Huei Chen
committed
Jun 11, 2017
1 parent
8df41be
commit 0afdae6
Showing
8 changed files
with
624 additions
and
146 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
def strip_non_ascii(string):
    """Return *string* with every non-ASCII character removed.

    Keeps only characters whose code point lies strictly between 0 and
    127, so NUL and DEL are dropped along with all non-ASCII text.
    """
    return ''.join([c for c in string if 0 < ord(c) < 127])
|
||
import os | ||
|
||
|
||
# Split every transcription of the chosen split into 50-word chunks and
# concatenate them into one file: a header line "<filename> <chunk id>"
# followed by the chunk text, for each chunk.
mode = "dev"

count = 0
with open("joinedfiledev.txt", "w+") as h:
    for f in os.listdir("../data/speech_transcriptions/"+ mode + "/original"):
        with open("../data/speech_transcriptions/" + mode + "/original/"+f, "r") as g:
            doc = g.read()
        words = doc.split(" ")
        # Floor division (`//`) keeps the chunk count an int under both
        # Python 2 and 3; the original `/` breaks range() on Python 3.
        for i in range(0, len(words) // 50 + 1):
            chunk = " ".join(words[50*i:50*i+50])
            h.write(f + " " + str(count) + "\n" + chunk + "\n")
            count += 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# Build per-response grammar-tag count vectors from <mode>.tagged_data.txt
# and write normalized / unnormalized feature files; for the training
# split, additionally aggregate the raw counts per gold label.
mode = "train"
maxtot = 2

# Tag-index lookup: numeric tags "1".."38" all collapse to slot 0 except
# tag "26", which maps to slot 1 (maxtot = 2 slots in total).
d = {str(i): 0 for i in range(1, 39)}
d["26"] = 1

# Language-code indices (only relevant if tags are language codes).
for idx, lang in enumerate(["ARA", "CHI", "FRE", "GER", "HIN", "ITA",
                            "JPN", "KOR", "SPA", "TEL", "TUR"]):
    d[lang] = idx

unvectors = []  # raw counts per line, as strings
nvectors = []   # counts normalized by the line's tag total, as strings
y = 0           # number of non-blank lines processed
maxlen = 0      # largest tag total seen (scales the extra length feature)
totals = []     # tag total per line (0.0 for blank lines)
# Guard: ncount must exist even if the very first line is blank; the
# original raised NameError in that case.
ncount = [0.0] * maxtot

with open(mode + '.tagged_data.txt', "r") as f:
    lines = f.readlines()

for line in lines:
    count = [0] * maxtot
    if line == "\n":
        unvectors.append([str(x) for x in count])
        # NOTE(review): a blank line reuses the PREVIOUS line's normalized
        # vector (ncount) — looks unintentional, but preserved as-is.
        nvectors.append([str(x) for x in ncount])
        totals.append(0.0)
        continue
    tags = [x.strip() for x in line.split(',')]
    for tag in tags:
        count[d[tag]] += 1
    total = len(tags)
    if total > maxlen:
        maxlen = total
    ncount = [float(x) / total for x in count]
    totals.append(float(total))
    unvectors.append([str(x) for x in count])
    nvectors.append([str(x) for x in ncount])
    y += 1

# Append the tag total, scaled by the maximum, as one more normalized feature.
for i in range(len(totals)):
    nvectors[i].append(str(totals[i] / maxlen))

with open(mode + '_normalized.txt', "w+") as g:
    for nvector in nvectors:
        g.write(",".join(nvector) + "\n")

with open(mode + '_unnormalized.txt', "w+") as h:
    for vector in unvectors:
        h.write(",".join(vector) + "\n")

# For the training split, sum the raw count vectors per gold label.
res = {}
if mode == "train":
    with open("labels_train.txt", "r") as x:
        label_lines = x.readlines()
    for i in range(len(label_lines)):
        label = label_lines[i].strip()
        if label in res:
            res[label] = [int(a) + int(b)
                          for (a, b) in zip(res[label], unvectors[i])]
        else:
            res[label] = unvectors[i]
    # The original rebound the counter `y` as this file handle; a
    # distinct name avoids the shadowing.
    with open("counts_train.txt", "w+") as outfile:
        for k in res:
            outfile.write(k + "\n" + ",".join(str(v) for v in res[k]) + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
def strip_non_ascii(string):
    """Drop every character outside the ASCII range (code points 1-126)."""
    kept = []
    for ch in string:
        if 0 < ord(ch) < 127:
            kept.append(ch)
    return ''.join(kept)
|
||
from operator import add | ||
import os | ||
# Join chunk-level feature vectors back onto transcription files:
# "outputfiledev.txt" holds (id line, comma-separated vector) pairs and
# "Feature Pair Dev.txt" maps file names to chunk ids.  Vectors sharing
# a file are summed element-wise; files with no chunks get a zero vector.
mode = "dev"

files = os.listdir("../data/speech_transcriptions/" + mode + "/original")

# od: raw id line (trailing newline included) -> integer feature vector.
od = {}
with open("outputfiledev.txt", "r") as o:
    lines = o.readlines()
# Records come in pairs of lines: identifier, then the vector.
i = 0
while i < len(lines):
    line1 = lines[i]
    line2 = lines[i + 1]
    vals = line2.split(",")
    od[line1] = [int(val) for val in vals]
    i += 2
print(len(od.keys()))

# hd: file name -> element-wise sum of all of its chunks' vectors.
hd = {}
with open("Feature Pair Dev.txt", "r") as h:
    hlines = h.readlines()
for line in hlines:
    pair = line.split(" ")
    p1 = pair[0]
    p2 = pair[1]
    if p1 in hd:
        # zip-comprehension gives a concrete list on both Python 2 and 3;
        # the original map(add, ...) is a one-shot lazy iterator on Python 3.
        hd[p1] = [a + b for a, b in zip(hd[p1], od[p2])]
    else:
        hd[p1] = od[p2]

with open("dev_gram_features.txt", "w+") as last:
    for f in files:
        if f in hd:
            n = [str(val) for val in hd[f]]
            last.write(",".join(n) + "\n")
        else:
            last.write("0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Copy into tempdevf.txt every line of temprac.txt that immediately
# precedes a "<name>.txt ..." header line.  (As in the original, the
# final trailing line is never flushed, and the first header writes the
# initial empty prevLine, which is a no-op.)
with open("temprac.txt", "r") as f, open("tempdevf.txt", "w+") as h:
    lines = f.readlines()
    prevLine = ""
    print(len(lines))
    for line in lines:
        words = line.split(" ")
        # The original toggled a `prev` flag here, but both branches
        # performed the same write — the toggle (and the unused prev2 /
        # prev2Line variables) were dead code and are removed.
        if words[0].endswith(".txt"):
            h.write(prevLine)
        prevLine = line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Tally how often each of the 38 numeric grammar tags appears in the
# training tag file and print the resulting histogram.
count = [0] * 38
with open('train.tagged_data.txt', "r") as f:
    for line in f:
        if line == "\n":
            continue
        tags = [x.strip() for x in line.split(',')]
        for tag in tags:
            # Tags are 1-based ("1".."38"); shift to 0-based slots.
            count[int(tag) - 1] += 1
# Call form works on both Python 2 and 3; the original `print count`
# statement is a syntax error on Python 3.
print(count)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
def strip_non_ascii(string):
    """Return the input with all non-ASCII characters filtered out."""
    return ''.join(filter(lambda ch: 0 < ord(ch) < 127, string))
|
||
import os | ||
from nltk.parse import stanford | ||
mode = "train"

# Stanford dependency parser configuration (jar paths relative to this
# script; JAVA_HOME is a hard-coded Windows JRE location).
os.environ['STANFORD_PARSER'] = "stanford-parser-full-2016-10-31/stanford-parser.jar"
os.environ['STANFORD_MODELS'] = "stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
os.environ['JAVA_HOME'] = "C:/Program Files/Java/jre1.8.0_102/bin/"
os.environ['STANFORD_CORENLP'] = "stanford-parser-full-2016-10-31/stanford-english-corenlp-2016-10-31-models.jar"
parser = stanford.StanfordDependencyParser("stanford-parser-full-2016-10-31/englishPCFG.ser.gz")

# Chop every transcription into 50-word chunks.  For chunk index
# `count`, dmap[count] is the source-document index and fmap[count] the
# original file name.
docs = []
dmap = {}
fmap = {}
count = 0
dcount = 0
for f in os.listdir("../data/speech_transcriptions/"+ mode + "/original"):
    with open("../data/speech_transcriptions/" + mode + "/original/"+f, "r") as g:
        doc = g.read()
    words = doc.split(" ")
    # `//` keeps the chunk count an int on Python 3 as well; the
    # original `/` relied on Python 2 integer division.
    for i in range(0, len(words) // 50 + 1):
        chunk = " ".join(words[50*i:50*i+50])
        docs.append(strip_non_ascii(chunk))
        dmap[count] = dcount
        fmap[count] = f
        count += 1
    dcount += 1
    if dcount % 1000 == 999:
        print(count)  # progress indicator; print() form is Py2/3 compatible

# Parse all chunks at once; materialize the generator so it is indexable.
sentences = parser.raw_parse_sents(docs)
x = [y for y in sentences]
print(len(x))

# Known dependency-relation labels; any unseen label encountered while
# tallying is appended so the feature ordering stays consistent.
d = [u'nsubj', u'det', u'acl:relcl', u'dep', u'advmod', u'cc', u'conj', u'cop', u'compound', u'appos', u'nmod', u'case', u'dobj', u'mark', u'aux', u'amod', u'nmod:npmod', u'nummod', u'xcomp', u'discourse', u'advcl', u'nmod:poss', u'acl', u'nsubjpass', u'auxpass', u'ccomp', u'mwe', u'parataxis', u'neg', u'csubj', u'det:predet', u'expl', u'compound:prt', u'iobj', u'nmod:tmod', u'cc:preconj', u'csubjpass']

g = {}     # document index -> {relation label: occurrence count}
w = {}     # document index -> concatenated (ASCII) words of its chunks
nmap = {}  # document index -> original file name
dgs = []   # raw parse lists (kept, as in the original)
for i in range(len(x)):
    z = [a for a in x[i]]
    dgs.append(z)
    c = list(z[0].triples())
    words = [a['word'] for a in z[0].nodes.values()]
    # Keep only unicode-typed words; this drops the artificial root
    # node, whose 'word' entry is None.
    fwords = []
    for word in words:
        if type(u'I') == type(word):
            fwords.append(word)
    finstring = " ".join(fwords)
    # First chunk of a document initializes its accumulators (the
    # original duplicated the lookups in both branches and bound an
    # unused `q`; both removed).
    if dmap[i] not in g:
        g[dmap[i]] = {}
        w[dmap[i]] = ""
        nmap[dmap[i]] = fmap[i]
    l = g[dmap[i]]
    w[dmap[i]] += " " + finstring
    for j in c:
        # j is a (head, relation, dependent) triple; tally the relation.
        if j[1] not in d:
            d.append(j[1])
        if j[1] not in l:
            l[j[1]] = 1
        else:
            l[j[1]] += 1

# Emit one record per document: key, reconstructed text, and the
# relation-count vector in the order of `d`.
with open(mode + "_dep.txt", "w+") as h:
    k = sorted(g.keys())
    print(len(k))
    for key in k:
        out = []
        for val in d:
            if val in g[key]:
                out.append(str(g[key][val]))
            else:
                out.append("0")
        h.write(str(key) + "\n" + w[key] + "\n" + ",".join(out) + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
def strip_non_ascii(string):
    """Return *string* restricted to code points 1..126 (ASCII sans NUL/DEL)."""
    keep = lambda ch: 0 < ord(ch) < 127
    return ''.join(ch for ch in string if keep(ch))
|
||
import os | ||
from nltk.parse import stanford | ||
mode = "dev"

# Stanford dependency parser configuration (jar paths relative to this
# script; JAVA_HOME is a hard-coded Windows JRE location).
os.environ['STANFORD_PARSER'] = "stanford-parser-full-2016-10-31/stanford-parser.jar"
os.environ['STANFORD_MODELS'] = "stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
os.environ['JAVA_HOME'] = "C:/Program Files/Java/jre1.8.0_102/bin/"
os.environ['STANFORD_CORENLP'] = "stanford-parser-full-2016-10-31/stanford-english-corenlp-2016-10-31-models.jar"
parser = stanford.StanfordDependencyParser("stanford-parser-full-2016-10-31/englishPCFG.ser.gz")

# Chop every transcription into 50-word chunks.  For chunk index
# `count`, dmap[count] is the source-document index and fmap[count] the
# original file name.
docs = []
dmap = {}
fmap = {}
count = 0
dcount = 0
for f in os.listdir("../data/speech_transcriptions/"+ mode + "/original"):
    with open("../data/speech_transcriptions/" + mode + "/original/"+f, "r") as g:
        doc = g.read()
    words = doc.split(" ")
    # `//` keeps the chunk count an int on Python 3 as well; the
    # original `/` relied on Python 2 integer division.
    for i in range(0, len(words) // 50 + 1):
        chunk = " ".join(words[50*i:50*i+50])
        docs.append(strip_non_ascii(chunk))
        dmap[count] = dcount
        fmap[count] = f
        count += 1
    dcount += 1
    if dcount % 1000 == 999:
        print(count)  # progress indicator; print() form is Py2/3 compatible

# Parse all chunks at once; materialize the generator so it is indexable.
sentences = parser.raw_parse_sents(docs)
x = [y for y in sentences]
print(len(x))

# Known dependency-relation labels; any unseen label encountered while
# tallying is appended so the feature ordering stays consistent.
d = [u'nsubj', u'det', u'acl:relcl', u'dep', u'advmod', u'cc', u'conj', u'cop', u'compound', u'appos', u'nmod', u'case', u'dobj', u'mark', u'aux', u'amod', u'nmod:npmod', u'nummod', u'xcomp', u'discourse', u'advcl', u'nmod:poss', u'acl', u'nsubjpass', u'auxpass', u'ccomp', u'mwe', u'parataxis', u'neg', u'csubj', u'det:predet', u'expl', u'compound:prt', u'iobj', u'nmod:tmod', u'cc:preconj', u'csubjpass']

g = {}     # document index -> {relation label: occurrence count}
w = {}     # document index -> concatenated (ASCII) words of its chunks
nmap = {}  # document index -> original file name
dgs = []   # raw parse lists (kept, as in the original)
for i in range(len(x)):
    z = [a for a in x[i]]
    dgs.append(z)
    c = list(z[0].triples())
    words = [a['word'] for a in z[0].nodes.values()]
    # Keep only unicode-typed words; this drops the artificial root
    # node, whose 'word' entry is None.
    fwords = []
    for word in words:
        if type(u'I') == type(word):
            fwords.append(word)
    finstring = " ".join(fwords)
    # First chunk of a document initializes its accumulators (the
    # original duplicated the lookups in both branches and bound an
    # unused `q`; both removed).
    if dmap[i] not in g:
        g[dmap[i]] = {}
        w[dmap[i]] = ""
        nmap[dmap[i]] = fmap[i]
    l = g[dmap[i]]
    w[dmap[i]] += " " + finstring
    for j in c:
        # j is a (head, relation, dependent) triple; tally the relation.
        if j[1] not in d:
            d.append(j[1])
        if j[1] not in l:
            l[j[1]] = 1
        else:
            l[j[1]] += 1

# Emit one record per document: key, reconstructed text, and the
# relation-count vector in the order of `d`.
with open(mode + "_dep.txt", "w+") as h:
    k = sorted(g.keys())
    print(len(k))
    for key in k:
        out = []
        for val in d:
            if val in g[key]:
                out.append(str(g[key][val]))
            else:
                out.append("0")
        h.write(str(key) + "\n" + w[key] + "\n" + ",".join(out) + "\n")