New duplicate algorithm to check for similar entries #52

Open
wants to merge 12 commits into base: main
64 changes: 60 additions & 4 deletions asreviewcontrib/datatools/dedup.py
@@ -1,4 +1,5 @@
import re
+from argparse import Namespace
from difflib import SequenceMatcher

import ftfy
@@ -9,7 +10,7 @@
from tqdm import tqdm


-def _print_similar_list(similar_list: list[tuple[int, int]], data: pd.Series):
+def _print_similar_list(similar_list: list[tuple[int, int]], data: pd.Series) -> None:
print_seq_matcher = SequenceMatcher()
console = Console()
print('Found similar titles at lines:')
@@ -44,7 +45,7 @@ def drop_duplicates_by_similarity(
discard_stopwords: bool = False,
stopwords_language: str = 'english',
strict_similarity: bool = False,
-verbose: bool = False):
+verbose: bool = False) -> None:

if skip_abstract:
data = asdata.df['title']
@@ -76,7 +77,7 @@ def drop_duplicates_by_similarity(
stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b')
s = s.str.replace(stopwords_regex, '', regex=True)

-duplicated = (s.duplicated()) & (s.notnull())
+duplicated = [False] * len(s)
seq_matcher = SequenceMatcher()

if verbose:
@@ -102,4 +103,59 @@
if verbose:
_print_similar_list(similar_list, data)

-asdata.df = asdata.df[~duplicated].reset_index(drop=True)
+asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)


+def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None:
+    initial_length = len(asdata.df)
+
+    if args.pid not in asdata.df.columns:
+        print(
+            f"Not using {args.pid} for deduplication "
+            "because there is no such data."
+        )
+
+    if not args.similar:
+        if args.verbose:
+            before_dedup = asdata.df.copy()
+
+            # retrieve deduplicated ASReview data object
+            asdata.drop_duplicates(pid=args.pid, inplace=True, reset_index=False)
+            duplicate_entries = before_dedup[~before_dedup.index.isin(asdata.df.index)]
+
+            if len(duplicate_entries) > 0:
+                print("Duplicate entries:")
+                for i, row in duplicate_entries.iterrows():
+                    print(f"\tLine {i} - {row['title']}")
+
+            asdata.df.reset_index(drop=True, inplace=True)
+
+        else:
+            # retrieve deduplicated ASReview data object
+            asdata.drop_duplicates(pid=args.pid, inplace=True)
+
+    else:
+        drop_duplicates_by_similarity(
+            asdata,
+            args.threshold,
+            args.title_only,
+            args.stopwords,
+            args.stopwords_language,
+            args.strict,
+            args.verbose,
+        )
+
+    # count duplicates
+    n_dup = initial_length - len(asdata.df)
+
+    if args.output_path:
+        asdata.to_file(args.output_path)
+        print(
+            f"Removed {n_dup} duplicates from dataset with"
+            f" {initial_length} records."
+        )
+    else:
+        print(
+            f"Found {n_dup} duplicates in dataset with"
+            f" {initial_length} records."
+        )
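
For context, the similarity test that `drop_duplicates_by_similarity` builds on is difflib's `SequenceMatcher`. Below is a minimal, self-contained sketch of that style of pairwise check; the helper name `find_similar_pairs` and the sample titles are illustrative only, while the PR's real loop (with ftfy normalization, optional stopword stripping, and a strict mode) is in the diff above.

```python
from difflib import SequenceMatcher

import pandas as pd


def find_similar_pairs(titles: pd.Series, threshold: float = 0.98) -> list[tuple[int, int]]:
    """Return positional index pairs whose titles are at least `threshold` similar."""
    seq_matcher = SequenceMatcher()
    similar_pairs = []
    cleaned = titles.fillna("").str.lower()
    for i in range(len(cleaned)):
        # SequenceMatcher caches statistics about seq2, so fix it per outer item
        seq_matcher.set_seq2(cleaned.iloc[i])
        for j in range(i + 1, len(cleaned)):
            seq_matcher.set_seq1(cleaned.iloc[j])
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(), so the expensive comparison only runs for promising pairs
            if (seq_matcher.real_quick_ratio() > threshold
                    and seq_matcher.quick_ratio() > threshold
                    and seq_matcher.ratio() > threshold):
                similar_pairs.append((i, j))
    return similar_pairs


print(find_similar_pairs(pd.Series([
    "Machine learning for systematic reviews",
    "Machine learning for systematic reviews.",
    "A completely different title",
])))  # -> [(0, 1)]
```

The quick-ratio cascade is what keeps the O(n²) pairwise scan tolerable on real datasets, since most pairs are rejected before `ratio()` is ever computed.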
48 changes: 7 additions & 41 deletions asreviewcontrib/datatools/entrypoint.py
@@ -8,7 +8,7 @@
from asreviewcontrib.datatools.compose import compose
from asreviewcontrib.datatools.convert import _parse_arguments_convert
from asreviewcontrib.datatools.convert import convert
-from asreviewcontrib.datatools.dedup import drop_duplicates_by_similarity
+from asreviewcontrib.datatools.dedup import deduplicate_data, drop_duplicates_by_similarity
from asreviewcontrib.datatools.describe import _parse_arguments_describe
from asreviewcontrib.datatools.describe import describe
from asreviewcontrib.datatools.sample import _parse_arguments_sample
@@ -61,7 +61,7 @@ def execute(self, argv):
help="Persistent identifier used for deduplication. Default: doi.",
)
dedup_parser.add_argument(
"--drop_similar",
"--similar",
action='store_true',
help="Drop similar records.",
)
@@ -72,17 +72,17 @@
help="Similarity threshold for deduplication. Default: 0.98.",
)
dedup_parser.add_argument(
"--skip_abstract",
"--title_only",
action='store_true',
help="Use only title for deduplication.",
)
dedup_parser.add_argument(
"--discard_stopwords",
"--stopwords",
action='store_true',
help="Discard stopwords for deduplication.",
help="Ignore stopwords for deduplication, focusing on main words.",
)
dedup_parser.add_argument(
"--strict_similarity",
"--strict",
action='store_true',
help="Use a more strict similarity for deduplication.",
)
@@ -102,42 +102,8 @@

# read data in ASReview data object
asdata = load_data(args_dedup.input_path)
-initial_length = len(asdata.df)
+deduplicate_data(asdata, args_dedup)

-if args_dedup.pid not in asdata.df.columns:
-    print(
-        f"Not using {args_dedup.pid} for deduplication "
-        "because there is no such data."
-    )
-
-# retrieve deduplicated ASReview data object
-asdata.drop_duplicates(pid=args_dedup.pid, inplace=True)
-
-if args_dedup.drop_similar:
-    drop_duplicates_by_similarity(
-        asdata,
-        args_dedup.threshold,
-        args_dedup.skip_abstract,
-        args_dedup.discard_stopwords,
-        args_dedup.stopwords_language,
-        args_dedup.strict_similarity,
-        args_dedup.verbose,
-    )
-
-# count duplicates
-n_dup = initial_length - len(asdata.df)
-
-if args_dedup.output_path:
-    asdata.to_file(args_dedup.output_path)
-    print(
-        f"Removed {n_dup} duplicates from dataset with"
-        f" {initial_length} records."
-    )
-else:
-    print(
-        f"Found {n_dup} duplicates in dataset with"
-        f" {initial_length} records."
-    )
if argv[0] == "compose":
args_compose_parser = _parse_arguments_compose()
args_compose = args_compose_parser.parse_args(argv[1:])
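
After this refactor, the dedup branch of `execute` reduces to `load_data` plus a single `deduplicate_data` call. A quick sketch of driving the new helper programmatically, assuming asreview's `load_data` as used by the entry point; the file paths and the hand-built `Namespace` are stand-ins for the parsed CLI arguments:

```python
# Sketch: call the refactored helper directly, bypassing the CLI parser.
# "records.csv" / "deduplicated.csv" are placeholder paths; the Namespace
# fields mirror the dedup_parser arguments defined above.
from argparse import Namespace

from asreview import load_data

from asreviewcontrib.datatools.dedup import deduplicate_data

asdata = load_data("records.csv")
deduplicate_data(asdata, Namespace(
    pid="doi",                      # parser default
    similar=True,                   # take the SequenceMatcher path
    threshold=0.98,                 # parser default
    title_only=False,
    stopwords=False,
    stopwords_language="english",
    strict=False,
    verbose=True,
    output_path="deduplicated.csv",
))
```

Moving this logic into `deduplicate_data` keeps `execute` a thin dispatcher and makes the deduplication flow testable without going through `argv` parsing.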