Skip to content

Commit ab09081

Browse files
author
Henry Chen
committed
Update
1 parent 3606a44 commit ab09081

12 files changed

+1036
-1
lines changed

pom.xml

+36-1
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,45 @@
4848
<version>4.10</version>
4949
<scope>test</scope>
5050
</dependency>
51-
<dependency>
51+
<!-- <dependency>
5252
<groupId>edu.emory.mathcs.nlp</groupId>
5353
<artifactId>common</artifactId>
5454
<version>1.0.0-SNAPSHOT</version>
55+
</dependency> -->
56+
<dependency>
57+
<groupId>edu.emory.clir</groupId>
58+
<artifactId>clearnlp</artifactId>
59+
<version>3.1.2</version>
60+
</dependency>
61+
<dependency>
62+
<groupId>edu.emory.clir</groupId>
63+
<artifactId>clearnlp-dictionary</artifactId>
64+
<version>3.1</version>
65+
</dependency>
66+
<dependency>
67+
<groupId>edu.emory.clir</groupId>
68+
<artifactId>clearnlp-global-lexica</artifactId>
69+
<version>3.1</version>
70+
</dependency>
71+
<dependency>
72+
<groupId>edu.emory.clir</groupId>
73+
<artifactId>clearnlp-general-en-pos</artifactId>
74+
<version>3.2</version>
75+
</dependency>
76+
<dependency>
77+
<groupId>edu.emory.clir</groupId>
78+
<artifactId>clearnlp-general-en-dep</artifactId>
79+
<version>3.2</version>
80+
</dependency>
81+
<dependency>
82+
<groupId>edu.emory.clir</groupId>
83+
<artifactId>clearnlp-general-en-ner</artifactId>
84+
<version>3.1</version>
85+
</dependency>
86+
<dependency>
87+
<groupId>edu.emory.clir</groupId>
88+
<artifactId>clearnlp-general-en-ner-gazetteer</artifactId>
89+
<version>3.0</version>
5590
</dependency>
5691
<dependency>
5792
<groupId>com.google.code.gson</groupId>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
/**
2+
* Copyright 2015, Emory University
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package edu.emory.clir.clearnlp.extraction.attribute.corpus;
17+
18+
import java.io.File;
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
import java.util.Set;
22+
23+
import edu.emory.clir.clearnlp.util.FileUtils;
24+
import edu.emory.clir.clearnlp.util.constant.CharConst;
25+
26+
/**
27+
* @author Yu-Hsin(Henry) Chen ({@code [email protected]})
28+
* @version 1.0
29+
* @since Sep 21, 2015
30+
*/
31+
public abstract class AbstractCorpusRecontructor implements Runnable{
32+
protected CorpusType type;
33+
protected List<String> l_filePaths;
34+
protected Set<String> l_extensions;
35+
protected String inputDirPath, outputDirPath;
36+
37+
public AbstractCorpusRecontructor(CorpusType type, String in_dir, String out_dir){
38+
this.type = type;
39+
setIODirecotries(in_dir, out_dir);
40+
}
41+
42+
public AbstractCorpusRecontructor(CorpusType type, List<String> filePaths, String in_dir, String out_dir){
43+
this.type = type;
44+
inputDirPath = in_dir;
45+
outputDirPath = out_dir;
46+
l_filePaths = filePaths;
47+
}
48+
49+
public AbstractCorpusRecontructor(CorpusType type, String in_dir, String out_dir, Set<String> extension){
50+
this.type = type;
51+
l_extensions = extension;
52+
setIODirecotries(in_dir, out_dir);
53+
}
54+
55+
public void setFilePaths(List<String> filePaths){
56+
l_filePaths = filePaths;
57+
}
58+
59+
protected void setIODirecotries(String in_dir, String out_dir){
60+
inputDirPath = in_dir; outputDirPath = out_dir;
61+
l_filePaths = getFilePaths();
62+
63+
File dir; String dir_path;
64+
for(String filePath : l_filePaths){
65+
dir_path = filePath.substring(0, filePath.lastIndexOf(CharConst.FW_SLASH));
66+
dir_path = getOutputPath(dir_path);
67+
if(!(dir = new File(dir_path)).exists()) dir.mkdirs();
68+
}
69+
}
70+
71+
public String getInputDir(){
72+
return inputDirPath;
73+
}
74+
75+
public String getOutputDir(){
76+
return outputDirPath;
77+
}
78+
79+
public CorpusType getCorpusType(){
80+
return type;
81+
}
82+
83+
protected List<String> getFilePaths(){
84+
if(l_extensions == null || l_extensions.size() == 0)
85+
return FileUtils.getFileList(inputDirPath, "", true);
86+
87+
List<String> list = new ArrayList<>();
88+
for(String ext : l_extensions)
89+
list.addAll(FileUtils.getFileList(inputDirPath, ext, true));
90+
return list;
91+
}
92+
93+
protected String getOutputPath(String inputPath){
94+
int index = inputPath.indexOf(inputDirPath);
95+
if(index >= 0)
96+
return outputDirPath + inputPath.substring(index + inputDirPath.length());
97+
return outputDirPath + FileUtils.getBaseName(inputPath);
98+
}
99+
100+
abstract public void reconstruct(List<String> filePaths);
101+
public void reconstruct(){
102+
reconstruct(l_filePaths);
103+
};
104+
105+
public abstract AbstractCorpusRecontructor clone();
106+
107+
@Override
108+
public void run() {
109+
reconstruct(l_filePaths);
110+
}
111+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/**
2+
* Copyright 2015, Emory University
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package edu.emory.clir.clearnlp.extraction.attribute.corpus;
17+
18+
/**
 * Input formats understood by the corpus reconstructors in this package.
 *
 * @author Yu-Hsin(Henry) Chen ({@code [email protected]})
 * @version 1.0
 * @since Sep 23, 2015
 */
public enum CorpusType {

	/** Input that {@code EntityTokenCorpusReconstructor} runs through an {@code NLPDecoder}. */
	RAW,

	/** Input that {@code EntityTokenCorpusReconstructor} reads with a {@code TSVReader}. */
	TSV;
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
/**
2+
* Copyright 2015, Emory University
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package edu.emory.clir.clearnlp.extraction.attribute.corpus;
17+
18+
import java.io.BufferedInputStream;
19+
import java.io.BufferedReader;
20+
import java.io.InputStream;
21+
import java.io.InputStreamReader;
22+
import java.io.PrintWriter;
23+
import java.util.ArrayList;
24+
import java.util.HashSet;
25+
import java.util.List;
26+
import java.util.Set;
27+
import java.util.stream.Collectors;
28+
29+
import edu.emory.clir.clearnlp.collection.pair.IntIntPair;
30+
import edu.emory.clir.clearnlp.dependency.DEPTree;
31+
import edu.emory.clir.clearnlp.extraction.attribute.utils.chunk.AbstractChucker;
32+
import edu.emory.clir.clearnlp.extraction.attribute.utils.chunk.Chunk;
33+
import edu.emory.clir.clearnlp.extraction.attribute.utils.chunk.EnglishProperNounChunker;
34+
import edu.emory.clir.clearnlp.reader.AbstractReader;
35+
import edu.emory.clir.clearnlp.reader.TSVReader;
36+
import edu.emory.clir.clearnlp.util.IOUtils;
37+
import edu.emory.clir.clearnlp.util.Joiner;
38+
import edu.emory.clir.clearnlp.util.Splitter;
39+
import edu.emory.clir.clearnlp.util.constant.StringConst;
40+
import edu.emory.clir.clearnlp.util.lang.TLanguage;
41+
42+
/**
43+
* @author Yu-Hsin(Henry) Chen ({@code [email protected]})
44+
* @version 1.0
45+
* @since Sep 22, 2015
46+
*/
47+
public class EntityTokenCorpusReconstructor extends AbstractCorpusRecontructor{
48+
49+
private NLPDecoder decoder;
50+
private Set<String> s_NERLables;
51+
private AbstractReader<DEPTree> reader;
52+
53+
public EntityTokenCorpusReconstructor(CorpusType type, String in_dir, String out_dir) {
54+
super(type, in_dir, out_dir); init();
55+
}
56+
57+
public EntityTokenCorpusReconstructor(CorpusType type, String in_dir, String out_dir, Set<String> extensions) {
58+
super(type, in_dir, out_dir, extensions); init();
59+
}
60+
61+
public EntityTokenCorpusReconstructor(CorpusType type, String in_dir, String out_dir, Set<String> extensions, Set<String> NERLabels) {
62+
super(type, in_dir, out_dir, extensions);
63+
s_NERLables = NERLabels; init();
64+
}
65+
66+
public EntityTokenCorpusReconstructor(CorpusType type, List<String> filePaths, String in_dir, String out_dir, Set<String> NERLabels){
67+
super(type, filePaths, in_dir, out_dir);
68+
s_NERLables = NERLabels; init();
69+
}
70+
71+
private void init(){
72+
switch (type) {
73+
case RAW: decoder = new NLPDecoder(TLanguage.ENGLISH); break;
74+
case TSV: reader = new TSVReader(0, 1, 2, 3, 7, 4, 5, 6, -1, -1); break;
75+
}
76+
}
77+
78+
@Override
79+
public void reconstruct(List<String> filePaths) {
80+
81+
String line;
82+
DEPTree tree;
83+
List<Chunk> chunks;
84+
List<DEPTree> trees;
85+
List<String> sentence;
86+
AbstractChucker chunker = new EnglishProperNounChunker(s_NERLables);
87+
88+
InputStream input;
89+
PrintWriter writer;
90+
BufferedReader raw_reader;
91+
92+
for(String filePath : filePaths){
93+
try {
94+
input = IOUtils.createFileInputStream(filePath);
95+
writer = new PrintWriter(IOUtils.createBufferedPrintStream(getOutputPath(filePath)));
96+
97+
switch(type){
98+
case RAW:
99+
raw_reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(input)));
100+
raw_reader.readLine();
101+
102+
while( (line = raw_reader.readLine()) != null){
103+
trees = decoder.toDEPTrees(Splitter.splitTabs(line)[1]);
104+
105+
for(DEPTree t : trees){
106+
chunks = chunker.getChunk(t);
107+
sentence = reconstructAux(t, chunks);
108+
writer.println(Joiner.join(sentence, StringConst.SPACE));
109+
}
110+
}
111+
112+
break;
113+
case TSV:
114+
reader.open(new BufferedInputStream(input));
115+
116+
while( (tree = reader.next()) != null){
117+
chunks = chunker.getChunk(tree);
118+
sentence = reconstructAux(tree, chunks);
119+
writer.println(Joiner.join(sentence, StringConst.SPACE));
120+
}
121+
break;
122+
}
123+
124+
input.close();
125+
writer.close();
126+
} catch (Exception e) { e.printStackTrace(); }
127+
}
128+
}
129+
130+
private List<String> reconstructAux(DEPTree tree, List<Chunk> chunks){
131+
IntIntPair span;
132+
List<String> sentence = new ArrayList<>();
133+
int i, j = 0, t_size = tree.size(), c_size = chunks.size();
134+
135+
List<String> entityToken;
136+
span = (j < c_size)? chunks.get(j).getChunkSpan() : null;
137+
for(i = 1; i < t_size; i++){
138+
if(span != null && span.i1 == i){
139+
entityToken = chunks.get(j).getStrippedChunkNodes().stream()
140+
.map(n -> n.getWordForm()).collect(Collectors.toList());
141+
142+
sentence.add(Joiner.join(entityToken, StringConst.EMPTY));
143+
span = (j < c_size)? chunks.get(j).getChunkSpan() : null;
144+
i = span.i2; continue;
145+
}
146+
sentence.add(tree.get(i).getLemma());
147+
}
148+
return sentence;
149+
}
150+
151+
@Override
152+
public AbstractCorpusRecontructor clone() {
153+
if(l_extensions != null && s_NERLables != null)
154+
return new EntityTokenCorpusReconstructor(type, inputDirPath, outputDirPath, new HashSet<String>(l_extensions), new HashSet<String>(s_NERLables));
155+
else if(l_extensions == null)
156+
return new EntityTokenCorpusReconstructor(type, inputDirPath, outputDirPath, null, new HashSet<String>(s_NERLables));
157+
else if(s_NERLables == null)
158+
return new EntityTokenCorpusReconstructor(type, inputDirPath, outputDirPath, new HashSet<String>(l_extensions), null);
159+
else
160+
return new EntityTokenCorpusReconstructor(type, inputDirPath, outputDirPath, null, null);
161+
}
162+
}

0 commit comments

Comments
 (0)