System logger to Slf4J conversion for wordseg package
Taylor Raack committed Nov 17, 2015
1 parent 01af1b9 commit 5d55419
Showing 11 changed files with 113 additions and 63 deletions.
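
Every file in this commit follows the same pattern: add the org.slf4j imports, give the class a static logger, and route former System.err calls through it — info for progress messages, error for failures (now passing the caught exception, so the stack trace is no longer lost). A minimal sketch of the pattern, using a hypothetical Example class that is not part of the commit:

  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;

  public class Example { // hypothetical class, for illustration only

    private static Logger logger = LoggerFactory.getLogger(Example.class);

    private void load(String path) {
      // was: System.err.println("Loading " + path);
      logger.info("Loading " + path);
      try {
        // ... load the resource ...
      } catch (Exception e) {
        // was: System.err.println("Failed"); -- which discarded the exception
        logger.error("Failed to load " + path, e); // logs the stack trace too
      }
    }
  }

Note that the commit keeps the original string-concatenation style; SLF4J would also accept parameterized messages such as logger.info("Loading {}", path), which skip the concatenation entirely when the level is disabled.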
30 changes: 17 additions & 13 deletions src/edu/stanford/nlp/wordseg/ChineseDictionary.java
@@ -4,6 +4,9 @@
 import java.util.*;
 import java.util.regex.Pattern;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.EncodingPrintWriter;
 import edu.stanford.nlp.io.RuntimeIOException;
@@ -23,13 +26,15 @@ public class ChineseDictionary {
 private static final boolean DEBUG = false;
 
 public static final int MAX_LEXICON_LENGTH = 6;
+
+private static Logger logger = LoggerFactory.getLogger(ChineseDictionary.class);
 @SuppressWarnings({"unchecked"})
 Set<String>[] words_ = new HashSet[MAX_LEXICON_LENGTH+1];
 
 private ChineseDocumentToSentenceProcessor cdtos_; // = null;
 
 private void serializeDictionary(String serializePath) {
-System.err.print("Serializing dictionaries to " + serializePath + " ... ");
+logger.info("Serializing dictionaries to " + serializePath + " ... ");
 
 try {
 ObjectOutputStream oos = IOUtils.writeStreamFromString(serializePath);
@@ -38,9 +43,9 @@ private void serializeDictionary(String serializePath) {
 oos.writeObject(words_);
 //oos.writeObject(cdtos_);
 oos.close();
-System.err.println("done.");
+logger.info("done.");
 } catch (Exception e) {
-System.err.println("Failed");
+logger.error("Failed", e);
 throw new RuntimeIOException(e);
 }
 }
@@ -52,15 +57,15 @@ private static Set<String>[] loadDictionary(String serializePath) {
 dict[i] = Generics.newHashSet();
 }
 
-// System.err.print("loading dictionaries from " + serializePath + "...");
+// logger.info("loading dictionaries from " + serializePath + "...");
 
 try {
 // once we read MAX_LEXICON_LENGTH and cdtos as well
 // now these files only store one object we care about
 //ChineseDictionary.MAX_LEXICON_LENGTH = (int) ois.readObject();
 dict = IOUtils.readObjectFromURLOrClasspathOrFileSystem(serializePath);
 } catch (Exception e) {
-System.err.println("Failed to load Chinese dictionary " + serializePath);
+logger.error("Failed to load Chinese dictionary " + serializePath, e);
 throw new RuntimeException(e);
 }
 return dict;
@@ -93,10 +98,9 @@ public ChineseDictionary(String serDicts,
 public ChineseDictionary(String[] dicts,
 ChineseDocumentToSentenceProcessor cdtos,
 boolean expandMidDot) {
-System.err.printf("Loading Chinese dictionaries from %d file%s:%n",
-dicts.length, (dicts.length == 1) ? "" : "s");
+logger.info(String.format("Loading Chinese dictionaries from %d file%s:%n", dicts.length, (dicts.length == 1) ? "" : "s"));
 for (String dict : dicts) {
-System.err.println(" " + dict);
+logger.info(" " + dict);
 }
 
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
@@ -123,15 +127,15 @@ public ChineseDictionary(String[] dicts,
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
 total += words_[i].size();
 }
-System.err.printf("Done. Unique words in ChineseDictionary is: %d.%n", total);
+logger.info(String.format("Done. Unique words in ChineseDictionary is: %d.%n", total));
 }
 
 private static final Pattern midDot = Pattern.compile(ChineseUtils.MID_DOT_REGEX_STR);
 
 private void addDict(String dict, boolean expandMidDot) {
 String content = IOUtils.slurpFileNoExceptions(dict,"utf-8");
 String[] lines = content.split("\n");
-System.err.println(" " + dict + ": " + lines.length + " entries");
+logger.info(" " + dict + ": " + lines.length + " entries");
 for (String line : lines) {
 line = line.trim();
 // normalize any midDot
@@ -211,18 +215,18 @@ public static void main(String[] args) {
 /*
 //ChineseDictionary dict = new ChineseDictionary(args[0]);
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
-System.err.println("Length: " + i+": "+dict.words[i].size());
+logger.info("Length: " + i+": "+dict.words[i].size());
 }
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
-System.err.println("Length: " + i+": "+dict.words[i].size());
+logger.info("Length: " + i+": "+dict.words[i].size());
 if (dict.words[i].size() < 1000) {
 for (String word : dict.words[i]) {
 EncodingPrintWriter.err.println(word, "UTF-8");
 }
 }
 }
 for (int i = 1; i < args.length; i++) {
-System.err.println(args[i] + " " + Boolean.valueOf(dict.contains(args[i])).toString());
+logger.info(args[i] + " " + Boolean.valueOf(dict.contains(args[i])).toString());
 }
 */
 }
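
One wrinkle visible in the file above: SLF4J's Logger has no printf-style overload, so the System.err.printf calls are wrapped in String.format. Because those format strings end in %n and the logging backend appends its own line terminator, these messages will likely carry a redundant trailing newline; a parameterized call such as logger.info("Done. Unique words in ChineseDictionary is: {}.", total) would avoid both the explicit formatting and the extra newline. This is an observation about the pattern, not a change made by the commit.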
src/edu/stanford/nlp/wordseg/ChineseSegmenterFeatureFactory.java
@@ -5,6 +5,9 @@
 import java.util.regex.Pattern;
 import java.io.Serializable;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.sequences.FeatureFactory;
@@ -49,6 +52,8 @@ public class ChineseSegmenterFeatureFactory<IN extends CoreLabel> extends Featur
 private static final long serialVersionUID = 3387166382968763350L;
 private static TagAffixDetector taDetector = null;
 
+private static Logger logger = LoggerFactory.getLogger(ChineseSegmenterFeatureFactory.class);
+
 public void init(SeqClassifierFlags flags) {
 super.init(flags);
 }
@@ -262,7 +267,7 @@ public Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
 
 if (flags.useOutDict2){
 if (outDict == null) {
-System.err.println("reading "+flags.outDict2+" as a seen lexicon");
+logger.info("reading "+flags.outDict2+" as a seen lexicon");
 outDict = new CorpusDictionary(flags.outDict2, true);
 }
 features.add(outDict.getW(charp+charc)+"outdict"); // -1 0
@@ -297,7 +302,7 @@ public Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
 throw new RuntimeException("only support settings for CTB and PK now.");
 }
 } else {
-//System.err.println("Using Derived features");
+//logger.info("Using Derived features");
 tagsets = new String[]{"2","3","4"};
 }
 
16 changes: 8 additions & 8 deletions src/edu/stanford/nlp/wordseg/ChineseStringUtils.java
@@ -176,19 +176,19 @@ public static String combineSegmentedSentence(List<CoreLabel> doc,
 */
 private static String postProcessingAnswer(String ans, SeqClassifierFlags flags) {
 if (flags.useHk) {
-//System.err.println("Using HK post processing.");
+//logger.info("Using HK post processing.");
 return postProcessingAnswerHK(ans);
 } else if (flags.useAs) {
-//System.err.println("Using AS post processing.");
+//logger.info("Using AS post processing.");
 return postProcessingAnswerAS(ans);
 } else if (flags.usePk) {
-//System.err.println("Using PK post processing.");
+//logger.info("Using PK post processing.");
 return postProcessingAnswerPK(ans,flags.keepAllWhitespaces);
 } else if (flags.useMsr) {
-//System.err.println("Using MSR post processing.");
+//logger.info("Using MSR post processing.");
 return postProcessingAnswerMSR(ans);
 } else {
-//System.err.println("Using CTB post processing.");
+//logger.info("Using CTB post processing.");
 return postProcessingAnswerCTB(ans, flags.keepAllWhitespaces, flags.suppressMidDotPostprocessing);
 }
 }
@@ -205,7 +205,7 @@ private static String separatePuncs(String ans) {
 '\u3015'};
 }
 if (puncsPat == null) {
-//System.err.println("Compile Puncs");
+//logger.info("Compile Puncs");
 puncsPat = new Pattern[puncs.length];
 for(int i = 0; i < puncs.length; i++) {
 Character punc = puncs[i];
@@ -227,7 +227,7 @@ private static String separatePuncs(Character[] puncs_in, String ans) {
 /* These punctuations are derived directly from the training set. */
 if (puncs == null) { puncs = puncs_in; }
 if (puncsPat == null) {
-//System.err.println("Compile Puncs");
+//logger.info("Compile Puncs");
 puncsPat = new Pattern[puncs.length];
 for(int i = 0; i < puncs.length; i++) {
 Character punc = puncs[i];
@@ -329,7 +329,7 @@ private static String processColons(String ans, String numPat) {
 private static String processPercents(String ans, String numPat) {
 // 1. if "6%" then put together
 // 2. if others, separate '%' and others
-// System.err.println("Process percents called!");
+// logger.info("Process percents called!");
 // first , just separate all '%'
 Matcher m = percentsPat.matcher(ans);
 ans = m.replaceAll(" $1 ");
11 changes: 8 additions & 3 deletions src/edu/stanford/nlp/wordseg/CorpusChar.java
@@ -3,6 +3,9 @@
 import java.util.*;
 import java.io.*;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.RuntimeIOException;
 import edu.stanford.nlp.util.Generics;
@@ -16,6 +19,8 @@
 
 
 public class CorpusChar {
+private static Logger logger = LoggerFactory.getLogger(CorpusChar.class);
+
 private Map <String, Set <String>> charMap;
 
 public CorpusChar(String charlistFilename) {
@@ -31,15 +36,15 @@ Map<String, Set<String>> getCharMap() {
 
 private Map<String, Set<String>> readDict(String filename) {
 
-System.err.println("Loading character dictionary file from " + filename);
+logger.info("Loading character dictionary file from " + filename);
 
 try {
 InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
 BufferedReader DetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
 String DetectorLine;
 
 char_dict = Generics.newHashMap();
-//System.err.println("DEBUG: in CorpusChar readDict");
+//logger.debug("DEBUG: in CorpusChar readDict");
 while ((DetectorLine = DetectorReader.readLine()) != null) {
 
 String[] fields = DetectorLine.split(" ");
@@ -51,7 +56,7 @@ private Map<String, Set<String>> readDict(String filename) {
 chars = Generics.newHashSet();
 char_dict.put(tag,chars);
 }
-//System.err.println("DEBUG: CorpusChar: "+filename+" "+fields[1]);
+//logger.debug("DEBUG: CorpusChar: "+filename+" "+fields[1]);
 chars.add(fields[1]);
 
 
9 changes: 7 additions & 2 deletions src/edu/stanford/nlp/wordseg/CorpusDictionary.java
@@ -3,6 +3,9 @@
 import java.util.*;
 import java.io.*;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.EncodingPrintWriter;
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.RuntimeIOException;
@@ -20,6 +23,8 @@
 
 public class CorpusDictionary {
 
+private static Logger logger = LoggerFactory.getLogger(CorpusDictionary.class);
+
 private Set<String> oneWord; // = null;
 
 /** Load a dictionary of words.
@@ -44,7 +49,7 @@ public Set<String> getTable() {
 private static Set<String> readDict(String filename, boolean normalize) {
 Set<String> word = Generics.newHashSet();
 
-System.err.println("Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);
+logger.info("Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);
 
 try {
 InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
@@ -53,7 +58,7 @@ private static Set<String> readDict(String filename, boolean normalize) {
 for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) {
 i++;
 //String[] fields = wordDetectorLine.split(" ");
-//System.err.println("DEBUG: "+filename+" "+wordDetectorLine);
+//logger.debug("DEBUG: "+filename+" "+wordDetectorLine);
 int origLeng = wordDetectorLine.length();
 wordDetectorLine = wordDetectorLine.trim();
 int newLeng = wordDetectorLine.length();
src/edu/stanford/nlp/wordseg/Gale2007ChineseSegmenterFeatureFactory.java
@@ -5,6 +5,9 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.EncodingPrintWriter;
 import edu.stanford.nlp.ling.CoreAnnotation;
 import edu.stanford.nlp.ling.CoreLabel;
@@ -45,6 +48,8 @@ public class Gale2007ChineseSegmenterFeatureFactory<IN extends CoreLabel> extend
 
 private static final int DEBUG = 0;
 
+private static Logger logger = LoggerFactory.getLogger(Gale2007ChineseSegmenterFeatureFactory.class);
+
 private transient TagAffixDetector taDetector; // = null;
 private transient CorpusDictionary outDict; // = null;
 
@@ -61,7 +66,7 @@ private synchronized void createTADetector() {
 
 private synchronized void createOutDict() {
 if (outDict == null) {
-System.err.println("reading "+flags.outDict2+" as a seen lexicon");
+logger.info("reading "+flags.outDict2+" as a seen lexicon");
 outDict = new CorpusDictionary(flags.outDict2);
 }
 }
@@ -514,7 +519,7 @@ protected Collection<String> featuresCpC(PaddedList<? extends CoreLabel> cInfo,
 throw new RuntimeException("only support settings for CTB and PK now.");
 }
 } else {
-//System.err.println("Using Derived features");
+//logger.info("Using Derived features");
 tagsets = new String[]{"2","3","4"};
 }
 
(Diffs for the remaining five changed files are not shown.)
