Skip to content

Commit

Permalink
Update the URL regexes to match URLs starting with a bare www. (#1185)
Browse files Browse the repository at this point in the history
* add a new regex that matches URLs starting with a bare www. + tests
  • Loading branch information
lfoppiano authored Oct 23, 2024
1 parent a6ef0ac commit be44579
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3023,7 +3023,7 @@ private StringBuilder getSectionAsTEI(String xmlType,
StringBuilder output = new StringBuilder();
SortedSet<DocumentPiece> sectionPart = doc.getDocumentPart(taggingLabel);

if (sectionPart != null && sectionPart.size() > 0) {
if (CollectionUtils.isNotEmpty(sectionPart)) {
Pair<String, LayoutTokenization> sectionTokenisation = getBodyTextFeatured(doc, sectionPart);
if (sectionTokenisation != null) {
// if featSeg is null, it usually means that no body segment is found in the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1174,8 +1174,8 @@ public static List<OffsetPosition> tokenPositionsUrlPattern(List<LayoutToken> to
*/
public static List<OffsetPosition> characterPositionsUrlPattern(List<LayoutToken> tokens) {
String text = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> textResult = new ArrayList<OffsetPosition>();
Matcher urlMatcher = TextUtilities.urlPattern.matcher(text);
List<OffsetPosition> textResult = new ArrayList<>();
Matcher urlMatcher = TextUtilities.urlPattern1.matcher(text);
while (urlMatcher.find()) {
textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end()));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ public class TextUtilities {
static public final Pattern urlPattern0 = Pattern
.compile("(?i)(https?|ftp)\\s?:\\s?//\\s?[-A-Z0-9+&@#/%?=~_()|!:,.;]*[-A-Z0-9+&@#/%=~_()|]");
static public final Pattern urlPattern = Pattern
.compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}\\/\\/\\s{0,2}[-A-Z0-9+&@#\\/%?=~_()|!:.;]*[-A-Z0-9+&@#\\/%=~_()]");
.compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}//\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]");
static public final Pattern urlPattern1 = Pattern
.compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}//\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]|www\\s{0,2}\\.\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]");

// a regular expression for identifying email pattern in text
// TODO: maybe find a better regex (better == more robust, not more "standard")
Expand Down
41 changes: 41 additions & 0 deletions grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,47 @@ public void testTokenPositionsUrlPattern_URL_shouldReturnCorrectInterval() throw
assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(FirstURL.start, FirstURL.end + 1)), is("http:// github.com/myUsername/MyProject"));
}

@Test
public void testCharacterPositionsUrlPattern_URLStartingWithWWW_shouldReturnCorrectInterval() throws Exception {
final String input = "This work was distributed on www. github.com/myUsername/MyProject";
List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);

assertThat(offsetPositions, hasSize(1));
OffsetPosition FirstURL = offsetPositions.get(0);
assertThat(input.substring(FirstURL.start, FirstURL.end), is("www. github.com/myUsername/MyProject"));
}

@Test
public void testCharacterPositionsUrlPattern_URLStartingWithHTTPS_shouldReturnCorrectInterval() throws Exception {
final String input = "This work was distributed on https:// www.github.com/myUsername/MyProject";
List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);

assertThat(offsetPositions, hasSize(1));
OffsetPosition FirstURL = offsetPositions.get(0);
assertThat(input.substring(FirstURL.start, FirstURL.end), is("https:// www.github.com/myUsername/MyProject"));
}

/**
* This test is to confirm the limitation of this method using the regex, where we prefer failing on some cases
* rather than have a lot of false positive. This method will be anyway complemented with the annotated links in
* the PDF (if available).
*/
@Test
public void testCharacterPositionsUrlPattern_URLTruncated_shouldReturnCorrectIntervalWithmissingPartOfURL() throws Exception {
final String input = "This work was distributed on https://www. github.com/myUsername/MyProject";
List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);

assertThat(offsetPositions, hasSize(1));
OffsetPosition FirstURL = offsetPositions.get(0);
assertThat(input.substring(FirstURL.start, FirstURL.end), is("https://www"));
}

@Test
@Ignore("This test will fail, it can be used to test a real case when updating the regular exception")
public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
Expand Down

0 comments on commit be44579

Please sign in to comment.