Added more parameters to the similarity-based deduplication function

Signed-off-by: George Araújo <[email protected]>
asreview · PeterLombaers · Feb 6, 2025 · Jan 8, 2025 · Jan 8, 2025 · Jan 9, 2025
commit 8b921e7a565a7e5de38d694a3a81005d6a9c7219
diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py
@@ -10,28 +10,56 @@
 from tqdm import tqdm
 
 
-_SYMBOLS_REGEX = re.compile(r'[^ \w\d\-_]')
-_SPACES_REGEX = re.compile(r'\s+')
+def _print_similar_list(similar_list: list[tuple[int, int]], data: pd.Series):  # print a styled diff per pair
+    print_seq_matcher = SequenceMatcher()
+    console = Console()
+    print('Found similar titles at lines:')
+
+    for i, j in similar_list:  # i and j are used as positional (iloc) indices into data
+        print_seq_matcher.set_seq1(data.iloc[i])
+        print_seq_matcher.set_seq2(data.iloc[j])
+        text = Text()
+        text.append(f"\nLines {i+1} and {j+1}:\n", style='bold')  # header shows each index plus one
+
+        for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes():  # i1:i2 slices entry i, j1:j2 slices entry j
+            if tag == 'replace':
+                # entry i's span is struck through in red, then entry j's replacement span follows in green
+                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
+                text.append(f'{data.iloc[j][j1:j2]}', style='green')
+            if tag == 'delete':  # span present only in entry i
+                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
+            if tag == 'insert':  # span present only in entry j
+                text.append(f'{data.iloc[j][j1:j2]}', style='green')
+            if tag == 'equal':  # span common to both entries
+                text.append(f'{data.iloc[i][i1:i2]}', style='dim')
+
+        console.print(text)
+
+    print('')
 
 
 def drop_duplicates_by_similarity(
         asdata: ASReviewData,
         similarity: float = 0.98,
-        use_abstract: bool = True,
+        skip_abstract: bool = False,
         discard_stopwords: bool = False,
         stopwords_language: str = 'english',
-        verbose: bool = True):
+        strict_similarity: bool = False,
+        verbose: bool = False):
 
-    if use_abstract:
-        data = pd.Series(asdata.texts)
-    else:
+    if skip_abstract:
         data = asdata.df['title']
+    else:
+        data = pd.Series(asdata.texts)
+
+    symbols_regex = re.compile(r'[^ \w\d\-_]')
+    spaces_regex = re.compile(r'\s+')
 
     s = (
         data
         .apply(ftfy.fix_text)
-        .str.replace(_SYMBOLS_REGEX, '', regex=True)
-        .str.replace(_SPACES_REGEX, ' ', regex=True)
+        .str.replace(symbols_regex, '', regex=True)
+        .str.replace(spaces_regex, ' ', regex=True)
         .str.lower()
         .str.strip()
         .replace("", None)
@@ -40,14 +68,14 @@ def drop_duplicates_by_similarity(
     if discard_stopwords:
         try:
             from nltk.corpus import stopwords
-            STOPWORDS = set(stopwords.words(stopwords_language))
+            stopwords_set = set(stopwords.words(stopwords_language))
         except LookupError:
             import nltk
             nltk.download('stopwords')
-            STOPWORDS = set(stopwords.words(stopwords_language))
+            stopwords_set = set(stopwords.words(stopwords_language))
 
-        STOPWORDS_REGEX = re.compile(rf'\b{"\\b|\\b".join(STOPWORDS)}\b')
-        s = s.str.replace(STOPWORDS_REGEX, '', regex=True)
+        stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b')
+        s = s.str.replace(stopwords_regex, '', regex=True)
 
     duplicated = (s.duplicated()) & (s.notnull())
     seq_matcher = SequenceMatcher()
@@ -63,36 +91,13 @@ def drop_duplicates_by_similarity(
         for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items():
             seq_matcher.set_seq1(t)
 
-            # could also add: and seq_matcher.ratio() > similarity:
-            if seq_matcher.real_quick_ratio() > similarity and seq_matcher.quick_ratio() > similarity:
+            if seq_matcher.real_quick_ratio() > similarity and seq_matcher.quick_ratio() > similarity and (not strict_similarity or seq_matcher.ratio() > similarity):
                 if verbose and not duplicated[j]:
                     similar_list.append((i, j))
 
                 duplicated[j] = True
 
     if verbose:
-        print_seq_matcher = SequenceMatcher()
-        console = Console()
-        print('Found similar titles at lines')
-
-        for i, j in similar_list:
-            print_seq_matcher.set_seq1(data.iloc[i])
-            print_seq_matcher.set_seq2(data.iloc[j])
-            text = Text()
-            text.append(f"\nLines {i} and {j}:\n", style='bold')
-
-            for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes():
-                if tag == 'replace':
-                    # add rich strikethrough
-                    text.append(f'{data.iloc[i][i1:i2]}', style='strike')
-                    text.append(f'{data.iloc[j][j1:j2]}')
-                if tag == 'delete':
-                    text.append(f'{data.iloc[i][i1:i2]}', style='strike')
-                if tag == 'insert':
-                    text.append(f'{data.iloc[j][j1:j2]}')
-                if tag == 'equal':
-                    text.append(f'{data.iloc[i][i1:i2]}', style='dim')
-
-            console.print(str(text))
+        _print_similar_list(similar_list, data)
 
     asdata.df = asdata.df[~duplicated].reset_index(drop=True)