Skip to content

Commit

Permalink
Make the punctuation filter more robust, and split on newlines.
Browse files Browse the repository at this point in the history
  • Loading branch information
Jose committed Jun 22, 2016
1 parent a73384a commit 1debee6
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ public PTEntityProcessor(EntityRecognizer recognizer)
BlockingQueue<List<Token<String>>> startShingleProducer(String content)
{
BlockingQueue<List<Token<String>>> queue = new LinkedBlockingQueue<List<Token<String>>>();
/* Hack: rewrite every line break (\n or \r\n) as ". " so that newlines get treated as sentence
* ends. Unfortunately this has to happen here, before tokenization: we tokenize by whitespace, so
* by the time the tokens reach the punctuation filter the line breaks are gone and it's too late. */
content = content.replaceAll("\\r?\\n", ". ");
Reader r;
try {
r = new InputStreamReader(new ByteArrayInputStream(content.getBytes(ENCODING)), ENCODING);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ public final class PunctuationFilter extends TokenFilter
*/
public static final int PUNCTUATION_FLAG = 0x1;

/**
* The pattern we use to figure out if there's punctuation.
* <p>
* It matches a whole term that ends in one or more punctuation characters. Group 1 (reluctant)
* captures everything before the trailing punctuation run and may be empty; group 2 captures the
* trailing run itself. A term that does not end in punctuation does not match at all.
* <p>
* The pattern is compiled with {@link Pattern#UNICODE_CHARACTER_CLASS}; per the
* {@code java.util.regex.Pattern} documentation, that flag makes POSIX classes such as
* {@code \p{Punct}} follow their Unicode definitions instead of being limited to US-ASCII.
* Compiled once and shared by all instances.
*/
private static final Pattern PATTERN = Pattern.compile("^(.*?)([\\p{Punct}]+)$",
Pattern.UNICODE_CHARACTER_CLASS);

/**
* The current term.
*/
Expand All @@ -48,11 +54,6 @@ public final class PunctuationFilter extends TokenFilter
*/
private final FlagsAttribute flattribute = addAttribute(FlagsAttribute.class);

/**
* The pattern we use to figure out if there's punctuation.
* <p>
* It matches a whole term that ends in one or more characters from a fixed set:
* {@code . ! ? , : ; " ' ( )}. Group 1 (reluctant) captures everything before the trailing run and
* may be empty; group 2 captures the trailing run itself. Any punctuation outside that set is not
* recognized. Being an instance field, the pattern is compiled once per filter instance.
*/
private final Pattern pattern = Pattern.compile("^(.*?)([\\.!\\?,:;\"'\\(\\)]+)$");

/**
* Our string matcher.
*/
Expand All @@ -65,7 +66,7 @@ public final class PunctuationFilter extends TokenFilter
public PunctuationFilter(TokenStream in)
{
super(in);
// NOTE(review): this listing looks like a rendered diff, so the two assignments below are
// presumably the removed and added versions of a single statement - confirm against the real
// file. As written, the first matcher is created and immediately replaced by the second.
m = pattern.matcher(termAtt);
// Create the matcher over termAtt once, here in the constructor, from the shared static PATTERN.
m = PATTERN.matcher(termAtt);
}

/**
Expand Down

0 comments on commit 1debee6

Please sign in to comment.