Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New duplicate algorithm to check for similar entries #52

Merged
merged 14 commits into from
Feb 6, 2025
Previous commit
Next commit
Added more params to deduplicate similar function
Signed-off-by: George Araújo <[email protected]>
george-gca committed Jan 9, 2025
commit 8b921e7a565a7e5de38d694a3a81005d6a9c7219
81 changes: 43 additions & 38 deletions asreviewcontrib/datatools/dedup.py
Original file line number Diff line number Diff line change
@@ -10,28 +10,56 @@
from tqdm import tqdm


_SYMBOLS_REGEX = re.compile(r'[^ \w\d\-_]')
_SPACES_REGEX = re.compile(r'\s+')
def _print_similar_list(similar_list: list[tuple[int, int]], data: pd.Series):
    """Print a colourised character-level diff for each pair of similar entries.

    For every ``(i, j)`` pair in ``similar_list`` the entries ``data.iloc[i]``
    and ``data.iloc[j]`` are compared with ``SequenceMatcher`` and rendered
    through a rich ``Console``:

    * text found only in the first entry is shown as red strikethrough,
    * text found only in the second entry is shown in green,
    * text common to both entries is shown dimmed.

    Args:
        similar_list: Positional index pairs into ``data``.
        data: Series holding the texts to compare; entries are accessed
            positionally via ``.iloc`` and sliced, so they are presumably
            strings.
    """
    matcher = SequenceMatcher()
    out = Console()
    print('Found similar titles at lines:')

    for first, second in similar_list:
        left = data.iloc[first]
        right = data.iloc[second]
        matcher.set_seq1(left)
        matcher.set_seq2(right)

        rendered = Text()
        # Positions are reported 1-based in the header.
        rendered.append(f"\nLines {first+1} and {second+1}:\n", style='bold')

        for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
            if op == 'replace':
                # Old fragment struck through, followed by its replacement.
                rendered.append(f'{left[a_lo:a_hi]}', style='red strike')
                rendered.append(f'{right[b_lo:b_hi]}', style='green')
            elif op == 'delete':
                rendered.append(f'{left[a_lo:a_hi]}', style='red strike')
            elif op == 'insert':
                rendered.append(f'{right[b_lo:b_hi]}', style='green')
            elif op == 'equal':
                rendered.append(f'{left[a_lo:a_hi]}', style='dim')

        out.print(rendered)

    print('')


def drop_duplicates_by_similarity(
asdata: ASReviewData,
similarity: float = 0.98,
use_abstract: bool = True,
skip_abstract: bool = False,
discard_stopwords: bool = False,
stopwords_language: str = 'english',
verbose: bool = True):
strict_similarity: bool = False,
verbose: bool = False):

if use_abstract:
data = pd.Series(asdata.texts)
else:
if skip_abstract:
data = asdata.df['title']
else:
data = pd.Series(asdata.texts)

symbols_regex = re.compile(r'[^ \w\d\-_]')
spaces_regex = re.compile(r'\s+')

s = (
data
.apply(ftfy.fix_text)
.str.replace(_SYMBOLS_REGEX, '', regex=True)
.str.replace(_SPACES_REGEX, ' ', regex=True)
.str.replace(symbols_regex, '', regex=True)
.str.replace(spaces_regex, ' ', regex=True)
.str.lower()
.str.strip()
.replace("", None)
@@ -40,14 +68,14 @@ def drop_duplicates_by_similarity(
if discard_stopwords:
try:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words(stopwords_language))
stopwords_set = set(stopwords.words(stopwords_language))
except LookupError:
import nltk
nltk.download('stopwords')
STOPWORDS = set(stopwords.words(stopwords_language))
stopwords_set = set(stopwords.words(stopwords_language))

STOPWORDS_REGEX = re.compile(rf'\b{"\\b|\\b".join(STOPWORDS)}\b')
s = s.str.replace(STOPWORDS_REGEX, '', regex=True)
stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b')
s = s.str.replace(stopwords_regex, '', regex=True)

duplicated = (s.duplicated()) & (s.notnull())
seq_matcher = SequenceMatcher()
@@ -63,36 +91,13 @@ def drop_duplicates_by_similarity(
for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items():
seq_matcher.set_seq1(t)

# could also add: and seq_matcher.ratio() > similarity:
if seq_matcher.real_quick_ratio() > similarity and seq_matcher.quick_ratio() > similarity:
if seq_matcher.real_quick_ratio() > similarity and seq_matcher.quick_ratio() > similarity and (not strict_similarity or seq_matcher.ratio() > similarity):
if verbose and not duplicated[j]:
similar_list.append((i, j))

duplicated[j] = True

if verbose:
print_seq_matcher = SequenceMatcher()
console = Console()
print('Found similar titles at lines')

for i, j in similar_list:
print_seq_matcher.set_seq1(data.iloc[i])
print_seq_matcher.set_seq2(data.iloc[j])
text = Text()
text.append(f"\nLines {i} and {j}:\n", style='bold')

for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes():
if tag == 'replace':
# add rich strikethrough
text.append(f'{data.iloc[i][i1:i2]}', style='strike')
text.append(f'{data.iloc[j][j1:j2]}')
if tag == 'delete':
text.append(f'{data.iloc[i][i1:i2]}', style='strike')
if tag == 'insert':
text.append(f'{data.iloc[j][j1:j2]}')
if tag == 'equal':
text.append(f'{data.iloc[i][i1:i2]}', style='dim')

console.print(str(text))
_print_similar_list(similar_list, data)

asdata.df = asdata.df[~duplicated].reset_index(drop=True)