BitFunnel · hausdorff · Oct 28, 2016
diff --git a/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java b/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java
@@ -92,10 +92,23 @@ private void ProcessDocumentHeader() throws Exception {
 
     int documentId = Integer.parseUnsignedInt(matcher.group(1));
     emit(String.format("%016x", documentId));
-    String title = matcher.group(2);
 
-    try (StreamScope scope = new StreamScope(titleStreamId)) {
-      emit(title);
+    // NOTE: Lucene `StandardTokenizer` removes the ':' character from tokens,
+    // except in the case that they appear to be URIs. For simplicity, we
+    // replace every ':' with a space here, so none reach the tokenizer.
+    String title = matcher.group(2).replaceAll(":", " ");
+
+
+    try (StreamScope scope = new StreamScope(titleStreamId);
+         TokenStream tokenStream
+              = analyzer.tokenStream("title", new StringReader(title))) {
+      tokenStream.reset();
+
+      CharTermAttribute term =
+        tokenStream.addAttribute(CharTermAttribute.class);
+      while (tokenStream.incrementToken()) {
+        emit(term.toString());
+      }
     }
   }
 
@@ -115,7 +128,10 @@ private void ProcessAllContentLines() throws Exception {
 
 
   private void ProcessOneContentLine() throws IOException {
-    String line = GetLine();
+    // NOTE: Lucene `StandardTokenizer` removes the ':' character from tokens,
+    // except in the case that they appear to be URIs. For simplicity, we
+    // replace every ':' with a space here, so none reach the tokenizer.
+    String line = GetLine().replaceAll(":", " ");
 
     try (TokenStream tokenStream
              = analyzer.tokenStream("contents", new StringReader(line))) {

diff --git a/src/test/java/org/bitfunnel/workbench/CorpusTest.java b/src/test/java/org/bitfunnel/workbench/CorpusTest.java
@@ -61,15 +61,15 @@ public static Test suite() {
    */
   public void testWikipediaToCorpus() {
     String wikipedia =
-        "<doc id=\"123\" url=\"http://www.bitfunnel.org/123\" title=\"one\">\n" +
-            "This is the body text.\n" +
+        "<doc id=\"123\" url=\"http://www.bitfunnel.org/123\" title=\"w&i|k(i)p-e\\d:ia two\">\n" +
+            "This is the body:text.\n" +
             "</doc>\n" +
             "<doc id=\"456\" url=\"http://www.bitfunnel.org/456\" title=\"two\">\n" +
             "Some more body text.\n" +
             "</doc>\n";
 
     byte[] expected =
-        ("000000000000007b\00000\000one\000\00001\000body\000text\000\000\000" +
+        ("000000000000007b\00000\000w\000i\000k\000i\000p\000e\000d\000ia\000two\000\00001\000body\000text\000\000\000" +
             "00000000000001c8\00000\000two\000\00001\000some\000more\000body\000text\000\000\000" +
             "\000").getBytes(StandardCharsets.UTF_8);