From 7b07aa842e3cae09d0f6fb70672352c21663032d Mon Sep 17 00:00:00 2001 From: Aryan Vanaik Date: Fri, 27 Oct 2023 13:09:36 +1100 Subject: [PATCH] Added Boyer-Moore string deletion algorithm as a fix for issue #286 --- CHANGELOG.md | 5 +- .../jabref/logic/importer/fetcher/IEEE.java | 4 +- .../logic/importer/fetcher/IEEEcleanup.java | 54 +++++++++++++++++++ .../logic/importer/fetcher/IEEETest.java | 10 ++++ 4 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/jabref/logic/importer/fetcher/IEEEcleanup.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 91fc3245e5c..af1b29a374e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,10 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv ## [Unreleased] ### Added - + - Added a new class `IEEEcleanup` that removes a given character sequence from a string + - Added a new test case `testEmDashCleanUp` to `IEEETest` ### Changed - + - The IEEE fetcher now passes the fetched title through `IEEEcleanup.clean` before setting the title field ### Fixed ### Removed diff --git a/src/main/java/org/jabref/logic/importer/fetcher/IEEE.java b/src/main/java/org/jabref/logic/importer/fetcher/IEEE.java index 20242693755..d47fa4883d2 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/IEEE.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/IEEE.java @@ -77,7 +77,7 @@ public IEEE(ImportFormatPreferences importFormatPreferences, ImporterPreferences */ private static BibEntry parseJsonResponse(JSONObject jsonEntry, Character keywordSeparator) { BibEntry entry = new BibEntry(); - + IEEEcleanup ieeEcleanup = new IEEEcleanup(); switch (jsonEntry.optString("content_type")) { case "Books" -> entry.setType(StandardEntryType.Book); case "Conferences" -> entry.setType(StandardEntryType.InProceedings); @@ -129,7 +129,7 @@ private static BibEntry parseJsonResponse(JSONObject jsonEntry, Character keywor entry.setField(StandardField.EVENTTITLEADDON, jsonEntry.optString("conference_location")); entry.setField(StandardField.EVENTDATE, jsonEntry.optString("conference_dates")); entry.setField(StandardField.PUBLISHER, 
/**
 * Strips an unwanted character sequence out of a string.
 *
 * <p>The class holds no state; an instance exists only so that callers can keep
 * using {@code new IEEEcleanup().clean(...)}.
 */
public class IEEEcleanup {

    /**
     * Returns {@code str} with every occurrence of {@code t} removed.
     *
     * <p>Occurrences are located left to right and do not overlap, so
     * {@code clean("aaa", "aa")} yields {@code "a"}. An empty {@code t} matches
     * nothing worth removing and {@code str} comes back unchanged.
     *
     * <p>This deliberately delegates to {@link String#replace(CharSequence, CharSequence)}
     * instead of a hand-rolled bad-character search. A bad-character table sized for
     * 256 entries cannot be indexed by arbitrary {@code char} values (an em dash is
     * U+2014), deleting from a buffer while scanning needs the scan position and the
     * buffer position to be tracked separately, and an empty pattern must not stall
     * the scan. The library call handles all three cases.
     *
     * @param str the text to clean; must not be {@code null}
     * @param t   the exact character sequence to remove; must not be {@code null}
     * @return {@code str} without any occurrence of {@code t}
     * @throws NullPointerException if {@code str} or {@code t} is {@code null}
     */
    public String clean(String str, String t) {
        // Literal (non-regex) replacement: characters such as '{', '$' and '\' in t
        // are matched verbatim.
        return str.replace(t, "");
    }
}