// Patch summary (free text placed ahead of the first "diff --git" header).
// - CheckUtils.getJaroWinklerScore: both inputs now go through removeHtml(...) instead of
//   stripTags(...); the two local declarations also swap order. Only the start of the method
//   is visible in the hunk.
// - CheckUtilsTest: adds a `percentage` field (0.8) and jaroWinklerScoreTest, which checks
//   three cases against a 0.6 threshold:
//     1. an unrelated target text            -> score expected below 0.6
//     2. the same unrelated target again     -> score expected below 0.6 and equal to case 1
//        (per the inline comment, because HTML tags are stripped)
//     3. a lightly edited, truncated target  -> score expected above 0.6
// NOTE(review): in this copy several string literals in cases 1 and 2 are split across
// physical lines; presumably markup or escape sequences were lost in transit. Compare with
// the repository version before applying this patch.
diff --git a/src/main/java/bugs/stackoverflow/belisarius/utils/CheckUtils.java b/src/main/java/bugs/stackoverflow/belisarius/utils/CheckUtils.java index 948b673..35aed6f 100644 --- a/src/main/java/bugs/stackoverflow/belisarius/utils/CheckUtils.java +++ b/src/main/java/bugs/stackoverflow/belisarius/utils/CheckUtils.java @@ -100,8 +100,8 @@ public static boolean checkIfNoCodeBlock(String target) { } public static double getJaroWinklerScore(String original, String target, double percentage) { - String originalBody = stripTags(original); - String targetBody = stripTags(target); + String targetBody = removeHtml(target); + String originalBody = removeHtml(original); double score = 1.0; if (targetBody.length() < originalBody.length() * percentage) { diff --git a/src/test/java/bugs/stackoverflow/belisarius/utils/CheckUtilsTest.java b/src/test/java/bugs/stackoverflow/belisarius/utils/CheckUtilsTest.java index ee0db56..c3d8380 100644 --- a/src/test/java/bugs/stackoverflow/belisarius/utils/CheckUtilsTest.java +++ b/src/test/java/bugs/stackoverflow/belisarius/utils/CheckUtilsTest.java @@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test; public class CheckUtilsTest { + private final double percentage = 0.8; // Blacklisted and offensive words strings private final String blacklistedQuestionWords = "problem now solved problem has been now fixed found my solution " @@ -67,4 +68,58 @@ public void repeatedWordTest() { assertEquals(CheckUtils.checkRepeatedWords(repeatedWords).size(), 1); assertEquals(CheckUtils.checkRepeatedWords(notRepeatedWords).size(), 6); } + + @Test + public void jaroWinklerScoreTest() { + double score1 = CheckUtils.getJaroWinklerScore( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, " + + "sed do eiusmod tempor incididunt ut labore et dolore magna " + + "aliqua. Ut enim ad minim veniam, quis nostrud exercitation " + + "ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis " + + "aute irure dolor in reprehenderit in voluptate velit esse cillum " + + "dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat " + + "non proident, sunt in culpa qui officia deserunt mollit anim id " + + "est laborum.", + "This text has nothing to do with the above one.", + percentage + ); + assertTrue(score1 < 0.6); // this should be caught + + double score2 = CheckUtils.getJaroWinklerScore( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, " + + "sed do eiusmod tempor incididunt ut labore et dolore magna " + + "aliqua.
Ut enim
ad minim veniam, quis nostrud exercitation " + + "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis " + + "
aute irure dolor in reprehenderit in voluptate velit esse cillum
" + + "dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat " + + "non proident, sunt in culpa qui officia deserunt mollit anim id " + + "est laborum.", + "This text has nothing to do with the above one.", + percentage + ); + assertTrue(score2 < 0.6); // this should also be caught + + // and have the same score, as HTML tags are stripped + assertEquals(score1, score2); + + double score3 = CheckUtils.getJaroWinklerScore( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, " + + "sed do eiusmod tempor incididunt ut labore et dolore magna " + + "aliqua. Ut enim ad minim veniam, quis nostrud exercitation " + + "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis " + + "aute irure dolor in reprehenderit in voluptate velit esse cillum " + + "dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat " + + "non proident, sunt in culpa qui officia deserunt mollit anim id " + + "est laborum.", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, " + + "sed do eiusmoda tempor incididunt ut labore et dolore magna " + + "aliqua. Ut enim ad minim veniamd, quis nostrud exercitation " + + "ullamco laboris nisi ut aliquixp ex ea commodo consequat. Duis " + + "aute irure dolor in reprehendxerit in voluptate velit esse cillum ", + percentage + ); + // edit removes text, but does not change body completely + // => should not be reported + assertTrue(score3 > 0.6); + } } \ No newline at end of file