System logger to Slf4J conversion for wordseg package
Taylor Raack committed Nov 17, 2015
1 parent 01af1b9 commit 5d55419
Showing 11 changed files with 113 additions and 63 deletions.
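
Every file in this commit follows the same pattern: add the org.slf4j imports, give the class a static logger, and route former System.err calls through it — info for progress messages, error for failures (now passing the caught exception, so the stack trace is no longer lost). A minimal sketch of the pattern, using a hypothetical Example class that is not part of the commit:

  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;

  public class Example { // hypothetical class, for illustration only

    private static Logger logger = LoggerFactory.getLogger(Example.class);

    private void load(String path) {
      // was: System.err.println("Loading " + path);
      logger.info("Loading " + path);
      try {
        // ... load the resource ...
      } catch (Exception e) {
        // was: System.err.println("Failed"); -- which discarded the exception
        logger.error("Failed to load " + path, e); // logs the stack trace too
      }
    }
  }

Note that the commit keeps the original string-concatenation style; SLF4J would also accept parameterized messages such as logger.info("Loading {}", path), which skip the concatenation entirely when the level is disabled.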
30 changes: 17 additions & 13 deletions src/edu/stanford/nlp/wordseg/ChineseDictionary.java
@@ -4,6 +4,9 @@
 import java.util.*;
 import java.util.regex.Pattern;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.EncodingPrintWriter;
 import edu.stanford.nlp.io.RuntimeIOException;
@@ -23,13 +26,15 @@ public class ChineseDictionary {
 private static final boolean DEBUG = false;
 
 public static final int MAX_LEXICON_LENGTH = 6;
+
+private static Logger logger = LoggerFactory.getLogger(ChineseDictionary.class);
 @SuppressWarnings({"unchecked"})
 Set<String>[] words_ = new HashSet[MAX_LEXICON_LENGTH+1];
 
 private ChineseDocumentToSentenceProcessor cdtos_; // = null;
 
 private void serializeDictionary(String serializePath) {
-System.err.print("Serializing dictionaries to " + serializePath + " ... ");
+logger.info("Serializing dictionaries to " + serializePath + " ... ");
 
 try {
 ObjectOutputStream oos = IOUtils.writeStreamFromString(serializePath);
@@ -38,9 +43,9 @@ private void serializeDictionary(String serializePath) {
 oos.writeObject(words_);
 //oos.writeObject(cdtos_);
 oos.close();
-System.err.println("done.");
+logger.info("done.");
 } catch (Exception e) {
-System.err.println("Failed");
+logger.error("Failed", e);
 throw new RuntimeIOException(e);
 }
 }
@@ -52,15 +57,15 @@ private static Set<String>[] loadDictionary(String serializePath) {
 dict[i] = Generics.newHashSet();
 }
 
-// System.err.print("loading dictionaries from " + serializePath + "...");
+// logger.info("loading dictionaries from " + serializePath + "...");
 
 try {
 // once we read MAX_LEXICON_LENGTH and cdtos as well
 // now these files only store one object we care about
 //ChineseDictionary.MAX_LEXICON_LENGTH = (int) ois.readObject();
 dict = IOUtils.readObjectFromURLOrClasspathOrFileSystem(serializePath);
 } catch (Exception e) {
-System.err.println("Failed to load Chinese dictionary " + serializePath);
+logger.error("Failed to load Chinese dictionary " + serializePath, e);
 throw new RuntimeException(e);
 }
 return dict;
@@ -93,10 +98,9 @@ public ChineseDictionary(String serDicts,
 public ChineseDictionary(String[] dicts,
 ChineseDocumentToSentenceProcessor cdtos,
 boolean expandMidDot) {
-System.err.printf("Loading Chinese dictionaries from %d file%s:%n",
-dicts.length, (dicts.length == 1) ? "" : "s");
+logger.info(String.format("Loading Chinese dictionaries from %d file%s:%n", dicts.length, (dicts.length == 1) ? "" : "s"));
 for (String dict : dicts) {
-System.err.println(" " + dict);
+logger.info(" " + dict);
 }
 
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
@@ -123,15 +127,15 @@ public ChineseDictionary(String[] dicts,
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
 total += words_[i].size();
 }
-System.err.printf("Done. Unique words in ChineseDictionary is: %d.%n", total);
+logger.info(String.format("Done. Unique words in ChineseDictionary is: %d.%n", total));
 }
 
 private static final Pattern midDot = Pattern.compile(ChineseUtils.MID_DOT_REGEX_STR);
 
 private void addDict(String dict, boolean expandMidDot) {
 String content = IOUtils.slurpFileNoExceptions(dict,"utf-8");
 String[] lines = content.split("\n");
-System.err.println(" " + dict + ": " + lines.length + " entries");
+logger.info(" " + dict + ": " + lines.length + " entries");
 for (String line : lines) {
 line = line.trim();
 // normalize any midDot
@@ -211,18 +215,18 @@ public static void main(String[] args) {
 /*
 //ChineseDictionary dict = new ChineseDictionary(args[0]);
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
-System.err.println("Length: " + i+": "+dict.words[i].size());
+logger.info("Length: " + i+": "+dict.words[i].size());
 }
 for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
-System.err.println("Length: " + i+": "+dict.words[i].size());
+logger.info("Length: " + i+": "+dict.words[i].size());
 if (dict.words[i].size() < 1000) {
 for (String word : dict.words[i]) {
 EncodingPrintWriter.err.println(word, "UTF-8");
 }
 }
 }
 for (int i = 1; i < args.length; i++) {
-System.err.println(args[i] + " " + Boolean.valueOf(dict.contains(args[i])).toString());
+logger.info(args[i] + " " + Boolean.valueOf(dict.contains(args[i])).toString());
 }
 */
 }
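
One wrinkle visible in the file above: SLF4J's Logger has no printf-style overload, so the System.err.printf calls are wrapped in String.format. Because those format strings end in %n and the logging backend appends its own line terminator, these messages will likely carry a redundant trailing newline; a parameterized call such as logger.info("Done. Unique words in ChineseDictionary is: {}.", total) would avoid both the explicit formatting and the extra newline. This is an observation about the pattern, not a change made by the commit.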
src/edu/stanford/nlp/wordseg/ChineseSegmenterFeatureFactory.java
@@ -5,6 +5,9 @@
 import java.util.regex.Pattern;
 import java.io.Serializable;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.sequences.FeatureFactory;
@@ -49,6 +52,8 @@ public class ChineseSegmenterFeatureFactory<IN extends CoreLabel> extends Featur
 private static final long serialVersionUID = 3387166382968763350L;
 private static TagAffixDetector taDetector = null;
 
+private static Logger logger = LoggerFactory.getLogger(ChineseSegmenterFeatureFactory.class);
+
 public void init(SeqClassifierFlags flags) {
 super.init(flags);
 }
@@ -262,7 +267,7 @@ public Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
 
 if (flags.useOutDict2){
 if (outDict == null) {
-System.err.println("reading "+flags.outDict2+" as a seen lexicon");
+logger.info("reading "+flags.outDict2+" as a seen lexicon");
 outDict = new CorpusDictionary(flags.outDict2, true);
 }
 features.add(outDict.getW(charp+charc)+"outdict"); // -1 0
@@ -297,7 +302,7 @@ public Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
 throw new RuntimeException("only support settings for CTB and PK now.");
 }
 } else {
-//System.err.println("Using Derived features");
+//logger.info("Using Derived features");
 tagsets = new String[]{"2","3","4"};
 }
 
16 changes: 8 additions & 8 deletions src/edu/stanford/nlp/wordseg/ChineseStringUtils.java
@@ -176,19 +176,19 @@ public static String combineSegmentedSentence(List<CoreLabel> doc,
 */
 private static String postProcessingAnswer(String ans, SeqClassifierFlags flags) {
 if (flags.useHk) {
-//System.err.println("Using HK post processing.");
+//logger.info("Using HK post processing.");
 return postProcessingAnswerHK(ans);
 } else if (flags.useAs) {
-//System.err.println("Using AS post processing.");
+//logger.info("Using AS post processing.");
 return postProcessingAnswerAS(ans);
 } else if (flags.usePk) {
-//System.err.println("Using PK post processing.");
+//logger.info("Using PK post processing.");
 return postProcessingAnswerPK(ans,flags.keepAllWhitespaces);
 } else if (flags.useMsr) {
-//System.err.println("Using MSR post processing.");
+//logger.info("Using MSR post processing.");
 return postProcessingAnswerMSR(ans);
 } else {
-//System.err.println("Using CTB post processing.");
+//logger.info("Using CTB post processing.");
 return postProcessingAnswerCTB(ans, flags.keepAllWhitespaces, flags.suppressMidDotPostprocessing);
 }
 }
@@ -205,7 +205,7 @@ private static String separatePuncs(String ans) {
 '\u3015'};
 }
 if (puncsPat == null) {
-//System.err.println("Compile Puncs");
+//logger.info("Compile Puncs");
 puncsPat = new Pattern[puncs.length];
 for(int i = 0; i < puncs.length; i++) {
 Character punc = puncs[i];
@@ -227,7 +227,7 @@ private static String separatePuncs(Character[] puncs_in, String ans) {
 /* These punctuations are derived directly from the training set. */
 if (puncs == null) { puncs = puncs_in; }
 if (puncsPat == null) {
-//System.err.println("Compile Puncs");
+//logger.info("Compile Puncs");
 puncsPat = new Pattern[puncs.length];
 for(int i = 0; i < puncs.length; i++) {
 Character punc = puncs[i];
@@ -329,7 +329,7 @@ private static String processColons(String ans, String numPat) {
 private static String processPercents(String ans, String numPat) {
 // 1. if "6%" then put together
 // 2. if others, separate '%' and others
-// System.err.println("Process percents called!");
+// logger.info("Process percents called!");
 // first , just separate all '%'
 Matcher m = percentsPat.matcher(ans);
 ans = m.replaceAll(" $1 ");
11 changes: 8 additions & 3 deletions src/edu/stanford/nlp/wordseg/CorpusChar.java
@@ -3,6 +3,9 @@
 import java.util.*;
 import java.io.*;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.RuntimeIOException;
 import edu.stanford.nlp.util.Generics;
@@ -16,6 +19,8 @@
 
 
 public class CorpusChar {
+private static Logger logger = LoggerFactory.getLogger(CorpusChar.class);
+
 private Map <String, Set <String>> charMap;
 
 public CorpusChar(String charlistFilename) {
@@ -31,15 +36,15 @@ Map<String, Set<String>> getCharMap() {
 
 private Map<String, Set<String>> readDict(String filename) {
 
-System.err.println("Loading character dictionary file from " + filename);
+logger.info("Loading character dictionary file from " + filename);
 
 try {
 InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
 BufferedReader DetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
 String DetectorLine;
 
 char_dict = Generics.newHashMap();
-//System.err.println("DEBUG: in CorpusChar readDict");
+//logger.debug("DEBUG: in CorpusChar readDict");
 while ((DetectorLine = DetectorReader.readLine()) != null) {
 
 String[] fields = DetectorLine.split(" ");
@@ -51,7 +56,7 @@ private Map<String, Set<String>> readDict(String filename) {
 chars = Generics.newHashSet();
 char_dict.put(tag,chars);
 }
-//System.err.println("DEBUG: CorpusChar: "+filename+" "+fields[1]);
+//logger.debug("DEBUG: CorpusChar: "+filename+" "+fields[1]);
 chars.add(fields[1]);
 
 
9 changes: 7 additions & 2 deletions src/edu/stanford/nlp/wordseg/CorpusDictionary.java
@@ -3,6 +3,9 @@
 import java.util.*;
 import java.io.*;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.EncodingPrintWriter;
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.RuntimeIOException;
@@ -20,6 +23,8 @@
 
 public class CorpusDictionary {
 
+private static Logger logger = LoggerFactory.getLogger(CorpusDictionary.class);
+
 private Set<String> oneWord; // = null;
 
 /** Load a dictionary of words.
@@ -44,7 +49,7 @@ public Set<String> getTable() {
 private static Set<String> readDict(String filename, boolean normalize) {
 Set<String> word = Generics.newHashSet();
 
-System.err.println("Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);
+logger.info("Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);
 
 try {
 InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
@@ -53,7 +58,7 @@ private static Set<String> readDict(String filename, boolean normalize) {
 for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) {
 i++;
 //String[] fields = wordDetectorLine.split(" ");
-//System.err.println("DEBUG: "+filename+" "+wordDetectorLine);
+//logger.debug("DEBUG: "+filename+" "+wordDetectorLine);
 int origLeng = wordDetectorLine.length();
 wordDetectorLine = wordDetectorLine.trim();
 int newLeng = wordDetectorLine.length();
src/edu/stanford/nlp/wordseg/Gale2007ChineseSegmenterFeatureFactory.java
@@ -5,6 +5,9 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import edu.stanford.nlp.io.EncodingPrintWriter;
 import edu.stanford.nlp.ling.CoreAnnotation;
 import edu.stanford.nlp.ling.CoreLabel;
@@ -45,6 +48,8 @@ public class Gale2007ChineseSegmenterFeatureFactory<IN extends CoreLabel> extend
 
 private static final int DEBUG = 0;
 
+private static Logger logger = LoggerFactory.getLogger(Gale2007ChineseSegmenterFeatureFactory.class);
+
 private transient TagAffixDetector taDetector; // = null;
 private transient CorpusDictionary outDict; // = null;
 
@@ -61,7 +66,7 @@ private synchronized void createTADetector() {
 
 private synchronized void createOutDict() {
 if (outDict == null) {
-System.err.println("reading "+flags.outDict2+" as a seen lexicon");
+logger.info("reading "+flags.outDict2+" as a seen lexicon");
 outDict = new CorpusDictionary(flags.outDict2);
 }
 }
@@ -514,7 +519,7 @@ protected Collection<String> featuresCpC(PaddedList<? extends CoreLabel> cInfo,
 throw new RuntimeException("only support settings for CTB and PK now.");
 }
 } else {
-//System.err.println("Using Derived features");
+//logger.info("Using Derived features");
 tagsets = new String[]{"2","3","4"};
 }
 
(Diffs for the remaining five changed files are not shown.)
