diff --git a/scigraph-service/war/src/main/java/io/scigraph/annotation/PTEntityProcessor.java b/scigraph-service/war/src/main/java/io/scigraph/annotation/PTEntityProcessor.java index 073ec18..d93aa4a 100644 --- a/scigraph-service/war/src/main/java/io/scigraph/annotation/PTEntityProcessor.java +++ b/scigraph-service/war/src/main/java/io/scigraph/annotation/PTEntityProcessor.java @@ -65,6 +65,10 @@ public PTEntityProcessor(EntityRecognizer recognizer) BlockingQueue>> startShingleProducer(String content) { BlockingQueue>> queue = new LinkedBlockingQueue>>(); + /* This is a bit of a hack to make sure newlines get treated as sentence ends - unfortunately it's + * necessary because we tokenize by whitespace, so by the time we get to the punctuation filter + * it's too late. */ + content = content.replaceAll("\\r?\\n", ". "); Reader r; try { r = new InputStreamReader(new ByteArrayInputStream(content.getBytes(ENCODING)), ENCODING); diff --git a/scigraph-service/war/src/main/java/io/scigraph/annotation/PunctuationFilter.java b/scigraph-service/war/src/main/java/io/scigraph/annotation/PunctuationFilter.java index e8e3e74..a61d12f 100644 --- a/scigraph-service/war/src/main/java/io/scigraph/annotation/PunctuationFilter.java +++ b/scigraph-service/war/src/main/java/io/scigraph/annotation/PunctuationFilter.java @@ -38,6 +38,12 @@ public final class PunctuationFilter extends TokenFilter */ public static final int PUNCTUATION_FLAG = 0x1; + /** + * The pattern we use to figure out if there's punctuation. + */ + private static final Pattern PATTERN = Pattern.compile("^(.*?)([\\p{Punct}]+)$", + Pattern.UNICODE_CHARACTER_CLASS); + /** * The current term. */ @@ -48,11 +54,6 @@ public final class PunctuationFilter extends TokenFilter */ private final FlagsAttribute flattribute = addAttribute(FlagsAttribute.class); - /** - * The pattern we use to figure out if there's punctuation. - */ - private final Pattern pattern = Pattern.compile("^(.*?)([\\.!\\?,:;\"'\\(\\)]+)$"); - /** * Our string matcher. */ @@ -65,7 +66,7 @@ public final class PunctuationFilter extends TokenFilter public PunctuationFilter(TokenStream in) { super(in); - m = pattern.matcher(termAtt); + m = PATTERN.matcher(termAtt); } /**