|
| 1 | +package edu.emory.clir.clearnlp.extraction.attribute.dbpedia; |
| 2 | + |
| 3 | +import java.io.BufferedReader; |
| 4 | +import java.io.ObjectInput; |
| 5 | +import java.io.ObjectInputStream; |
| 6 | +import java.io.ObjectOutput; |
| 7 | +import java.io.ObjectOutputStream; |
| 8 | +import java.util.Collections; |
| 9 | +import java.util.HashMap; |
| 10 | +import java.util.HashSet; |
| 11 | +import java.util.Map; |
| 12 | +import java.util.Set; |
| 13 | + |
| 14 | +import edu.emory.clir.clearnlp.ner.NERTag; |
| 15 | +import edu.emory.clir.clearnlp.util.IOUtils; |
| 16 | +import edu.emory.clir.clearnlp.util.Splitter; |
| 17 | + |
| 18 | +public class DBPediaMap implements NERTags { |
| 19 | + |
| 20 | + |
| 21 | + protected Map<String,Set<String>> db_map; |
| 22 | + public DBPediaMap(){ |
| 23 | + db_map = new HashMap<>(); |
| 24 | + } |
| 25 | + |
| 26 | + |
| 27 | + public void constructMap(String filePath, boolean rawTags, boolean trimSpace) throws Exception{ |
| 28 | + BufferedReader br = new BufferedReader(IOUtils.createBufferedReader(filePath)); |
| 29 | + String NERTags, word = "", line; |
| 30 | + String[] tags; |
| 31 | + int tabIndex; |
| 32 | + StringBuilder sb = new StringBuilder(); |
| 33 | + while((line = br.readLine())!= null){ |
| 34 | + sb.setLength(0); |
| 35 | + tabIndex = line.lastIndexOf('\t'); |
| 36 | + NERTags = line.substring(tabIndex+1, line.length()); |
| 37 | + word = line.substring(0,tabIndex); |
| 38 | + if(trimSpace){ |
| 39 | + tags = Splitter.splitSpace(word); |
| 40 | + for(int i = 0; i<tags.length; i++){ |
| 41 | + sb.append(tags[i]); |
| 42 | + } |
| 43 | + word = sb.toString(); |
| 44 | + } |
| 45 | + tags = Splitter.splitSpace(NERTags); |
| 46 | + put(word,tags,rawTags); |
| 47 | + } |
| 48 | + } |
| 49 | + |
| 50 | + public void put(String word, String[] tags, boolean rawTags){ |
| 51 | + Set<String> set = new HashSet<>(); |
| 52 | + if(rawTags) Collections.addAll(set, tags); |
| 53 | + else for(String tag : tags) set.add(getNERTag(tag)); |
| 54 | + db_map.put(word, set); |
| 55 | + } |
| 56 | + |
| 57 | + private String getNERTag(String tag) { |
| 58 | + if(tag.equals("0")) return NERTags.PERSON; |
| 59 | + if(tag.equals("3") || tag.equals("1")) return NERTags.ORGANIZATION; |
| 60 | + if(tag.equals("2") || tag.equals("5")) return NERTags.LOCATION; |
| 61 | + return NERTags.MISC; |
| 62 | + } |
| 63 | + |
| 64 | + public void readMap(String filePath){ |
| 65 | + |
| 66 | + try { |
| 67 | + ObjectInput in = new ObjectInputStream(IOUtils.createObjectXZBufferedInputStream(filePath)); |
| 68 | + try { |
| 69 | + db_map = (Map<String,Set<String>>) in.readObject(); |
| 70 | + } finally { |
| 71 | + in.close(); |
| 72 | + } |
| 73 | + } catch (Exception e) { e.printStackTrace(); } |
| 74 | + } |
| 75 | + |
| 76 | + public void serializeMap(String filePath){ |
| 77 | + try { |
| 78 | + ObjectOutput out = new ObjectOutputStream(IOUtils.createObjectXZBufferedOutputStream(filePath)); |
| 79 | + try { |
| 80 | + out.writeObject(db_map); |
| 81 | + } finally { |
| 82 | + out.close(); |
| 83 | + } |
| 84 | + } catch (Exception e) { e.printStackTrace(); } |
| 85 | + } |
| 86 | + |
| 87 | + |
| 88 | + |
| 89 | + public Map<String,Set<String>> getDBMap(){ |
| 90 | + return this.db_map; |
| 91 | + } |
| 92 | + |
| 93 | + |
| 94 | + |
| 95 | + static public void main(String[] args) throws Exception{ |
| 96 | + String inputFile = args[0], outputFile = args[1]; |
| 97 | + boolean rawTags = args[2].equals("t") ? true : false; |
| 98 | + boolean trimSpace = args[3].equals("t") ? true: false; |
| 99 | + DBPediaMap db = new DBPediaMap(); |
| 100 | + db.constructMap(inputFile, rawTags, trimSpace); |
| 101 | + db.serializeMap(outputFile); |
| 102 | + } |
| 103 | + |
| 104 | + |
| 105 | + |
| 106 | + |
| 107 | +} |
0 commit comments