Skip to content

Commit

Permalink
[Data rearchitecture] Handle duplicate articles for revisions (#6183)
Browse files Browse the repository at this point in the history
* Add resolve_duplicates_for_timeslices method in DuplicateArticleDeleter class

* Resolve duplicate articles after fetching new revisions and importing articles
  • Loading branch information
gabina authored Feb 7, 2025
1 parent 42a3448 commit e133aef
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 5 deletions.
18 changes: 18 additions & 0 deletions lib/duplicate_article_deleter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,24 @@ def resolve_duplicates(articles = nil)
ModifiedRevisionsManager.new(@wiki).move_or_delete_revisions limbo_revisions
end

# Resolves duplicate articles (same title + namespace) among the given
# articles, then resets article records for every course that referenced
# a deleted duplicate, so timeslice-based stats stay consistent.
#
# @param articles [Enumerable<Article>] candidate articles to de-duplicate
# @return [nil]
def resolve_duplicates_for_timeslices(articles)
  grouped = articles_grouped_by_title_and_namespace(articles)
  @deleted_ids = []
  grouped.each do |article_group|
    delete_duplicates_in(article_group)
  end

  return if @deleted_ids.empty?

  # Avoid shadowing the `articles` parameter with the deleted-record relation.
  deleted_articles = Article.where(id: @deleted_ids)
  # Get all courses with at least one deleted article.
  course_ids = ArticlesCourses.where(article_id: @deleted_ids).distinct.pluck(:course_id)
  # Reset articles for every course involved. Load the courses in a single
  # batched query instead of one Course.find per id (avoids N+1 queries).
  Course.where(id: course_ids).find_each do |course|
    ArticlesCoursesCleanerTimeslice.reset_specific_articles(course, deleted_articles)
  end
end

#################
# Helper method #
#################
Expand Down
13 changes: 8 additions & 5 deletions lib/revision_data_manager.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
require_dependency "#{Rails.root}/lib/importers/article_importer"
require_dependency "#{Rails.root}/app/helpers/encoding_helper"
require_dependency "#{Rails.root}/lib/importers/revision_score_importer"
require_dependency "#{Rails.root}/lib/duplicate_article_deleter"

#= Fetches revision data from API
class RevisionDataManager
Expand All @@ -30,8 +31,7 @@ def fetch_revision_data_for_course(timeslice_start, timeslice_end)

# Import articles. We do this here to avoid saving article data in memory.
# Note that we create articles for all sub data (not only for scoped revisions).
ArticleImporter.new(@wiki).import_articles_from_revision_data(articles)
@articles = Article.where(wiki_id: @wiki.id, mw_page_id: articles.map { |a| a['mw_page_id'] })
import_and_resolve_duplicate_articles articles

# Prep: get a user dictionary for all users referred to by revisions.
users = user_dict_from_sub_data(all_sub_data)
Expand All @@ -44,9 +44,6 @@ def fetch_revision_data_for_course(timeslice_start, timeslice_end)
scoped_sub_data:,
articles: article_dict)

# TODO: resolve duplicates
# DuplicateArticleDeleter.new(@wiki).resolve_duplicates(@articles)

# We need to partition revisions because we don't want to calculate scores for revisions
# out of important spaces
(revisions_in_spaces, revisions_out_spaces) = partition_revisions
Expand All @@ -68,6 +65,12 @@ def fetch_revision_data_for_users(users, timeslice_start, timeslice_end)
###########
private

# Imports articles from revision data, memoizes the persisted records for
# this wiki in @articles, and resolves any duplicate articles among them.
def import_and_resolve_duplicate_articles(articles)
  ArticleImporter.new(@wiki).import_articles_from_revision_data(articles)
  page_ids = articles.map { |article| article['mw_page_id'] }
  @articles = Article.where(wiki_id: @wiki.id, mw_page_id: page_ids)
  DuplicateArticleDeleter.new(@wiki).resolve_duplicates_for_timeslices(@articles)
end

# Returns a list of revisions for users during the given period:
# [all_sub_data, sub_data].
# - all_sub_data: all revisions within the period.
Expand Down
94 changes: 94 additions & 0 deletions spec/lib/duplicate_article_deleter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require 'rails_helper'
require "#{Rails.root}/lib/duplicate_article_deleter"
require_dependency "#{Rails.root}/lib/articles_courses_cleaner_timeslice"

describe DuplicateArticleDeleter do
describe '.resolve_duplicates' do
Expand Down Expand Up @@ -79,4 +80,97 @@
expect(duplicate_deleted_article.reload.deleted).to eq(true)
end
end

describe '.resolve_duplicates_for_timeslices' do
  let(:en_wiki) { Wiki.get_or_create(language: 'en', project: 'wikipedia') }
  # Older record for the same page, already flagged deleted.
  let(:deleted_article) do
    create(:article, title: 'Columbian_exchange',
           namespace: 0,
           mw_page_id: 5589352,
           created_at: '2017-08-23',
           wiki: en_wiki,
           deleted: true)
  end
  let(:extant_article) do
    create(:article, title: 'Columbian_exchange',
           namespace: 0,
           mw_page_id: 622746,
           created_at: '2017-12-08',
           wiki: en_wiki,
           deleted: false)
  end
  let(:duplicate_deleted_article) do
    create(:article, title: 'Columbian_exchange',
           namespace: 0,
           mw_page_id: 622746,
           created_at: '2017-12-08',
           wiki: en_wiki,
           deleted: true)
  end
  # let! so both Kostanay records exist before each example runs.
  let!(:new_article) do
    create(:article,
           id: 2262715,
           title: 'Kostanay',
           namespace: 0,
           created_at: 1.day.from_now)
  end
  let!(:duplicate_article) do
    create(:article,
           id: 46349871,
           title: 'Kostanay',
           namespace: 0,
           created_at: 1.day.ago)
  end

  it 'marks oldest one deleted when there are two ids for one page' do
    described_class.new.resolve_duplicates_for_timeslices([new_article])
    undeleted = Article.where(
      title: 'Kostanay',
      namespace: 0,
      deleted: false
    )

    expect(undeleted.count).to eq(1)
    expect(undeleted.first.id).to eq(new_article.id)
  end

  it 'does not mark any deleted when articles different in title case' do
    first = create(:article,
                   id: 123,
                   title: 'Communicative_language_teaching',
                   namespace: 0)
    second = create(:article,
                    id: 456,
                    title: 'Communicative_Language_Teaching',
                    namespace: 0)
    described_class.new.resolve_duplicates_for_timeslices([first, second])
    expect(first.reload.deleted).to eq(false)
    expect(second.reload.deleted).to eq(false)
  end

  # Fixed garbled description ('marks does not end up leaving…').
  it 'does not end up leaving both articles deleted' do
    described_class.new.resolve_duplicates_for_timeslices([deleted_article, extant_article])
    expect(deleted_article.reload.deleted).to eq(true)
    expect(extant_article.reload.deleted).to eq(false)
  end

  it 'does not error if all articles are already deleted' do
    described_class.new.resolve_duplicates_for_timeslices([deleted_article,
                                                           duplicate_deleted_article])
    expect(deleted_article.reload.deleted).to eq(true)
    expect(duplicate_deleted_article.reload.deleted).to eq(true)
  end

  it 'resets articles for all courses involved' do
    course1 = create(:course)
    course2 = create(:course, slug: 'Another course')

    create(:articles_course, course_id: course1.id, article_id: new_article.id)
    create(:articles_course, course_id: course2.id, article_id: duplicate_article.id)

    # Only course2 references the article that gets marked deleted
    # (duplicate_article, the older Kostanay record), so reset runs once.
    expect(ArticlesCoursesCleanerTimeslice).to receive(:reset_specific_articles).once

    described_class.new.resolve_duplicates_for_timeslices([new_article, extant_article])
  end
end
end

0 comments on commit e133aef

Please sign in to comment.