Skip to content

Commit

Permalink
Update the URL regexes to match URLs starting with a bare www. (#1185)
Browse files Browse the repository at this point in the history
* add a new regex that matches URLs starting with a bare www. + tests
  • Loading branch information
lfoppiano authored Oct 23, 2024
1 parent a6ef0ac commit be44579
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3023,7 +3023,7 @@ private StringBuilder getSectionAsTEI(String xmlType,
StringBuilder output = new StringBuilder();
SortedSet<DocumentPiece> sectionPart = doc.getDocumentPart(taggingLabel);

if (sectionPart != null && sectionPart.size() > 0) {
if (CollectionUtils.isNotEmpty(sectionPart)) {
Pair<String, LayoutTokenization> sectionTokenisation = getBodyTextFeatured(doc, sectionPart);
if (sectionTokenisation != null) {
// if featSeg is null, it usually means that no body segment is found in the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1174,8 +1174,8 @@ public static List<OffsetPosition> tokenPositionsUrlPattern(List<LayoutToken> to
*/
public static List<OffsetPosition> characterPositionsUrlPattern(List<LayoutToken> tokens) {
String text = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> textResult = new ArrayList<OffsetPosition>();
Matcher urlMatcher = TextUtilities.urlPattern.matcher(text);
List<OffsetPosition> textResult = new ArrayList<>();
Matcher urlMatcher = TextUtilities.urlPattern1.matcher(text);
while (urlMatcher.find()) {
textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end()));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ public class TextUtilities {
static public final Pattern urlPattern0 = Pattern
.compile("(?i)(https?|ftp)\\s?:\\s?//\\s?[-A-Z0-9+&@#/%?=~_()|!:,.;]*[-A-Z0-9+&@#/%=~_()|]");
static public final Pattern urlPattern = Pattern
.compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}\\/\\/\\s{0,2}[-A-Z0-9+&@#\\/%?=~_()|!:.;]*[-A-Z0-9+&@#\\/%=~_()]");
.compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}//\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]");
static public final Pattern urlPattern1 = Pattern
.compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}//\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]|www\\s{0,2}\\.\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]");

// a regular expression for identifying email pattern in text
// TODO: maybe find a better regex (better == more robust, not more "standard")
Expand Down
41 changes: 41 additions & 0 deletions grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,47 @@ public void testTokenPositionsUrlPattern_URL_shouldReturnCorrectInterval() throw
assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(FirstURL.start, FirstURL.end + 1)), is("http:// github.com/myUsername/MyProject"));
}

@Test
public void testCharacterPositionsUrlPattern_URLStartingWithWWW_shouldReturnCorrectInterval() throws Exception {
final String input = "This work was distributed on www. github.com/myUsername/MyProject";
List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);

assertThat(offsetPositions, hasSize(1));
OffsetPosition FirstURL = offsetPositions.get(0);
assertThat(input.substring(FirstURL.start, FirstURL.end), is("www. github.com/myUsername/MyProject"));
}

@Test
public void testCharacterPositionsUrlPattern_URLStartingWithHTTPS_shouldReturnCorrectInterval() throws Exception {
final String input = "This work was distributed on https:// www.github.com/myUsername/MyProject";
List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);

assertThat(offsetPositions, hasSize(1));
OffsetPosition FirstURL = offsetPositions.get(0);
assertThat(input.substring(FirstURL.start, FirstURL.end), is("https:// www.github.com/myUsername/MyProject"));
}

/**
* This test is to confirm the limitation of this method using the regex, where we prefer failing on some cases
* rather than have a lot of false positive. This method will be anyway complemented with the annotated links in
* the PDF (if available).
*/
@Test
public void testCharacterPositionsUrlPattern_URLTruncated_shouldReturnCorrectIntervalWithmissingPartOfURL() throws Exception {
final String input = "This work was distributed on https://www. github.com/myUsername/MyProject";
List<LayoutToken> tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

List<OffsetPosition> offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput);

assertThat(offsetPositions, hasSize(1));
OffsetPosition FirstURL = offsetPositions.get(0);
assertThat(input.substring(FirstURL.start, FirstURL.end), is("https://www"));
}

@Test
@Ignore("This test will fail, it can be used to test a real case when updating the regular exception")
public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception {
Expand Down

0 comments on commit be44579

Please sign in to comment.