diff --git a/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java b/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java index 171c5b2..28bfc6a 100644 --- a/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java +++ b/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java @@ -92,10 +92,23 @@ private void ProcessDocumentHeader() throws Exception { int documentId = Integer.parseUnsignedInt(matcher.group(1)); emit(String.format("%016x", documentId)); - String title = matcher.group(2); - try (StreamScope scope = new StreamScope(titleStreamId)) { - emit(title); + // NOTE: Lucene `StandardTokenizer` removes the ':' character from tokens, + // except in the case that they appear to be URIs. For simplicity, we + // choose to remove them entirely here. + String title = matcher.group(2).replaceAll(":", " "); + + + try (StreamScope scope = new StreamScope(titleStreamId); + TokenStream tokenStream + = analyzer.tokenStream("title", new StringReader(title))) { + tokenStream.reset(); + + CharTermAttribute term = + tokenStream.addAttribute(CharTermAttribute.class); + while (tokenStream.incrementToken()) { + emit(term.toString()); + } } } @@ -115,7 +128,10 @@ private void ProcessAllContentLines() throws Exception { private void ProcessOneContentLine() throws IOException { - String line = GetLine(); + // NOTE: Lucene `StandardTokenizer` removes the ':' character from tokens, + // except in the case that they appear to be URIs. For simplicity, we + // choose to remove them entirely here. + String line = GetLine().replaceAll(":", " "); try (TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(line))) { diff --git a/src/test/java/org/bitfunnel/workbench/CorpusTest.java b/src/test/java/org/bitfunnel/workbench/CorpusTest.java index 211ab09..c5308cb 100644 --- a/src/test/java/org/bitfunnel/workbench/CorpusTest.java +++ b/src/test/java/org/bitfunnel/workbench/CorpusTest.java @@ -61,15 +61,15 @@ public static Test suite() { */ public void testWikipediaToCorpus() { String wikipedia = - "\n" + - "This is the body text.\n" + + "\n" + + "This is the body:text.\n" + "\n" + "\n" + "Some more body text.\n" + "\n"; byte[] expected = - ("000000000000007b\00000\000one\000\00001\000body\000text\000\000\000" + + ("000000000000007b\00000\000w\000i\000k\000i\000p\000e\000d\000ia\000two\000\00001\000body\000text\000\000\000" + "00000000000001c8\00000\000two\000\00001\000some\000more\000body\000text\000\000\000" + "\000").getBytes(StandardCharsets.UTF_8);