# Resolves duplicate article records (same title + namespace) among the
# given articles, then resets timeslice data for every course that
# referenced an article deleted during resolution.
#
# @param articles [Enumerable<Article>] articles to check for duplicates
# @return [nil]
def resolve_duplicates_for_timeslices(articles)
  grouped = articles_grouped_by_title_and_namespace(articles)
  # delete_duplicates_in appends the ids it soft-deletes to @deleted_ids.
  @deleted_ids = []
  grouped.each do |article_group|
    delete_duplicates_in(article_group)
  end

  return if @deleted_ids.empty?

  # Use a new local rather than reassigning the `articles` parameter, so
  # the method body keeps a single meaning for that name.
  deleted_articles = Article.where(id: @deleted_ids)
  # Get all courses with at least one deleted article; dedup in SQL.
  course_ids = ArticlesCourses.where(article_id: @deleted_ids).distinct.pluck(:course_id)
  # Load the courses in one query instead of Course.find per id (avoids N+1).
  Course.where(id: course_ids).find_each do |course|
    # Reset articles for every course involved
    ArticlesCoursesCleanerTimeslice.reset_specific_articles(course, deleted_articles)
  end
  nil
end
# Imports article records for the given revision data, loads the resulting
# Article rows for this wiki into @articles, and resolves any duplicate
# article records among them.
#
# @param articles [Array<Hash>] revision-data hashes, each with an 'mw_page_id'
def import_and_resolve_duplicate_articles(articles)
  ArticleImporter.new(@wiki).import_articles_from_revision_data(articles)
  page_ids = articles.map { |article_data| article_data['mw_page_id'] }
  @articles = Article.where(wiki_id: @wiki.id, mw_page_id: page_ids)
  DuplicateArticleDeleter.new(@wiki).resolve_duplicates_for_timeslices(@articles)
end
describe '.resolve_duplicates_for_timeslices' do
  let(:en_wiki) { Wiki.get_or_create(language: 'en', project: 'wikipedia') }
  # Older record for the same page title; starts out soft-deleted.
  let(:deleted_article) do
    create(:article, title: 'Columbian_exchange',
                     namespace: 0,
                     mw_page_id: 5589352,
                     created_at: '2017-08-23',
                     wiki: en_wiki,
                     deleted: true)
  end
  let(:extant_article) do
    create(:article, title: 'Columbian_exchange',
                     namespace: 0,
                     mw_page_id: 622746,
                     created_at: '2017-12-08',
                     wiki: en_wiki,
                     deleted: false)
  end
  # Same mw_page_id as extant_article, but already marked deleted.
  let(:duplicate_deleted_article) do
    create(:article, title: 'Columbian_exchange',
                     namespace: 0,
                     mw_page_id: 622746,
                     created_at: '2017-12-08',
                     wiki: en_wiki,
                     deleted: true)
  end
  # let! so these two title-duplicates always exist before each example.
  let!(:new_article) do
    create(:article,
           id: 2262715,
           title: 'Kostanay',
           namespace: 0,
           created_at: 1.day.from_now)
  end
  let!(:duplicate_article) do
    create(:article,
           id: 46349871,
           title: 'Kostanay',
           namespace: 0,
           created_at: 1.day.ago)
  end

  it 'marks oldest one deleted when there are two ids for one page' do
    described_class.new.resolve_duplicates_for_timeslices([new_article])
    undeleted = Article.where(
      title: 'Kostanay',
      namespace: 0,
      deleted: false
    )

    expect(undeleted.count).to eq(1)
    expect(undeleted.first.id).to eq(new_article.id)
  end

  it 'does not mark any deleted when articles different in title case' do
    first = create(:article,
                   id: 123,
                   title: 'Communicative_language_teaching',
                   namespace: 0)
    second = create(:article,
                    id: 456,
                    title: 'Communicative_Language_Teaching',
                    namespace: 0)
    described_class.new.resolve_duplicates_for_timeslices([first, second])
    expect(first.reload.deleted).to eq(false)
    expect(second.reload.deleted).to eq(false)
  end

  # Description fixed: had a stray leading "marks" copied from the sibling example.
  it 'does not end up leaving both articles deleted' do
    described_class.new.resolve_duplicates_for_timeslices([deleted_article, extant_article])
    expect(deleted_article.reload.deleted).to eq(true)
    expect(extant_article.reload.deleted).to eq(false)
  end

  it 'does not error if all articles are already deleted' do
    described_class.new.resolve_duplicates_for_timeslices([deleted_article,
                                                           duplicate_deleted_article])
    expect(deleted_article.reload.deleted).to eq(true)
    expect(duplicate_deleted_article.reload.deleted).to eq(true)
  end

  it 'resets articles for all courses involved' do
    course1 = create(:course)
    course2 = create(:course, slug: 'Another course')

    create(:articles_course, course_id: course1.id, article_id: new_article.id)
    create(:articles_course, course_id: course2.id, article_id: duplicate_article.id)

    # Only duplicate_article gets deleted here, so only course2 is reset.
    expect(ArticlesCoursesCleanerTimeslice).to receive(:reset_specific_articles).once

    described_class.new.resolve_duplicates_for_timeslices([new_article, extant_article])
  end
end