Skip to content

Commit d8c60c5

Browse files
db
1 parent 7fe80d8 commit d8c60c5

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,107 @@
1+
package edu.emory.clir.clearnlp.extraction.attribute.dbpedia;
2+
3+
import java.io.BufferedReader;
4+
import java.io.ObjectInput;
5+
import java.io.ObjectInputStream;
6+
import java.io.ObjectOutput;
7+
import java.io.ObjectOutputStream;
8+
import java.util.Collections;
9+
import java.util.HashMap;
10+
import java.util.HashSet;
11+
import java.util.Map;
12+
import java.util.Set;
13+
14+
import edu.emory.clir.clearnlp.ner.NERTag;
15+
import edu.emory.clir.clearnlp.util.IOUtils;
16+
import edu.emory.clir.clearnlp.util.Splitter;
17+
18+
public class DBPediaMap implements NERTags {
19+
20+
21+
protected Map<String,Set<String>> db_map;
22+
public DBPediaMap(){
23+
db_map = new HashMap<>();
24+
}
25+
26+
27+
public void constructMap(String filePath, boolean rawTags, boolean trimSpace) throws Exception{
28+
BufferedReader br = new BufferedReader(IOUtils.createBufferedReader(filePath));
29+
String NERTags, word = "", line;
30+
String[] tags;
31+
int tabIndex;
32+
StringBuilder sb = new StringBuilder();
33+
while((line = br.readLine())!= null){
34+
sb.setLength(0);
35+
tabIndex = line.lastIndexOf('\t');
36+
NERTags = line.substring(tabIndex+1, line.length());
37+
word = line.substring(0,tabIndex);
38+
if(trimSpace){
39+
tags = Splitter.splitSpace(word);
40+
for(int i = 0; i<tags.length; i++){
41+
sb.append(tags[i]);
42+
}
43+
word = sb.toString();
44+
}
45+
tags = Splitter.splitSpace(NERTags);
46+
put(word,tags,rawTags);
47+
}
48+
}
49+
50+
public void put(String word, String[] tags, boolean rawTags){
51+
Set<String> set = new HashSet<>();
52+
if(rawTags) Collections.addAll(set, tags);
53+
else for(String tag : tags) set.add(getNERTag(tag));
54+
db_map.put(word, set);
55+
}
56+
57+
private String getNERTag(String tag) {
58+
if(tag.equals("0")) return NERTags.PERSON;
59+
if(tag.equals("3") || tag.equals("1")) return NERTags.ORGANIZATION;
60+
if(tag.equals("2") || tag.equals("5")) return NERTags.LOCATION;
61+
return NERTags.MISC;
62+
}
63+
64+
public void readMap(String filePath){
65+
66+
try {
67+
ObjectInput in = new ObjectInputStream(IOUtils.createObjectXZBufferedInputStream(filePath));
68+
try {
69+
db_map = (Map<String,Set<String>>) in.readObject();
70+
} finally {
71+
in.close();
72+
}
73+
} catch (Exception e) { e.printStackTrace(); }
74+
}
75+
76+
public void serializeMap(String filePath){
77+
try {
78+
ObjectOutput out = new ObjectOutputStream(IOUtils.createObjectXZBufferedOutputStream(filePath));
79+
try {
80+
out.writeObject(db_map);
81+
} finally {
82+
out.close();
83+
}
84+
} catch (Exception e) { e.printStackTrace(); }
85+
}
86+
87+
88+
89+
public Map<String,Set<String>> getDBMap(){
90+
return this.db_map;
91+
}
92+
93+
94+
95+
static public void main(String[] args) throws Exception{
96+
String inputFile = args[0], outputFile = args[1];
97+
boolean rawTags = args[2].equals("t") ? true : false;
98+
boolean trimSpace = args[3].equals("t") ? true: false;
99+
DBPediaMap db = new DBPediaMap();
100+
db.constructMap(inputFile, rawTags, trimSpace);
101+
db.serializeMap(outputFile);
102+
}
103+
104+
105+
106+
107+
}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
1+
package edu.emory.clir.clearnlp.extraction.attribute.dbpedia;
2+
3+
/**
 * Tag strings for the named-entity categories used in this package.
 *
 * <p>Fields declared in an interface are implicitly {@code public static final},
 * so the modifiers are not spelled out.
 */
public interface NERTags {

    /** Tag string for the PERSON category. */
    String PERSON = "PER";

    /** Tag string for the ORGANIZATION category. */
    String ORGANIZATION = "ORG";

    /** Tag string for the LOCATION category. */
    String LOCATION = "LOC";

    /** Tag string for the MISC category. */
    String MISC = "MISC";
}

0 commit comments

Comments
 (0)