Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Data rearchitecture] ArticleScopedProgram/VisitingScholarship: Handle changes in categories and assignments #6107

Prev Previous commit
Next Next commit
Refactor RevisionDataManager class
gabina committed Jan 13, 2025
commit 077e368b0c3efcbb2300ee05e4c66e419bbc9ef5
62 changes: 37 additions & 25 deletions lib/revision_data_manager.rb
Original file line number Diff line number Diff line change
@@ -21,14 +21,15 @@ def initialize(wiki, course, update_service: nil)
# Returns an array of Revision records.
# As a side effect, it imports Article records.
def fetch_revision_data_for_course(timeslice_start, timeslice_end)
all_sub_data, sub_data = get_course_revisions(@course.students, timeslice_start, timeslice_end)
all_sub_data, scoped_sub_data = get_course_revisions(@course.students, timeslice_start,
timeslice_end)
@revisions = []

# Extract all article data from the slice. Outputs a hash with article attrs.
articles = sub_data_to_article_attributes(all_sub_data)

# Import articles. We do this here to avoid saving article data in memory.
# Note that we create articles for all sub data (not only for filtered ones).
# Note that we create articles for all sub data (not only for scoped revisions).
ArticleImporter.new(@wiki).import_articles_from_revision_data(articles)
@articles = Article.where(wiki_id: @wiki.id, mw_page_id: articles.map { |a| a['mw_page_id'] })

@@ -38,8 +39,10 @@ def fetch_revision_data_for_course(timeslice_start, timeslice_end)
# Now get all the revisions
# We need a slightly different article dictionary format here
article_dict = @articles.each_with_object({}) { |a, memo| memo[a.mw_page_id] = a.id }
@revisions = sub_data_to_revision_attributes(all_sub_data, users, filtered: sub_data,
articles: article_dict)
@revisions = sub_data_to_revision_attributes(all_sub_data,
users,
scoped_sub_data:,
articles: article_dict)

# TODO: resolve duplicates
# DuplicateArticleDeleter.new(@wiki).resolve_duplicates(@articles)
@@ -54,10 +57,10 @@ def fetch_revision_data_for_course(timeslice_start, timeslice_end)
# This method gets revisions for some specific users.
# It does not fetch scores. It has no side effects.
def fetch_revision_data_for_users(users, timeslice_start, timeslice_end)
_, sub_data = get_course_revisions(users, timeslice_start, timeslice_end)
users = user_dict_from_sub_data(sub_data)
all_sub_data, scoped_sub_data = get_course_revisions(users, timeslice_start, timeslice_end)
users = user_dict_from_sub_data(all_sub_data)

sub_data_to_revision_attributes(sub_data, users)
sub_data_to_revision_attributes(all_sub_data, users, scoped_sub_data:)
end

###########
@@ -68,7 +71,7 @@ def fetch_revision_data_for_users(users, timeslice_start, timeslice_end)
# Returns a list of revisions for users during the given period:
# [all_sub_data, scoped_sub_data].
# - all_sub_data: all revisions within the period.
# - sub_data: revisions filtered based on the course type.
# - scoped_sub_data: revisions filtered based on the course type.
def get_course_revisions(users, start, end_date)
all_sub_data = get_revisions(users, start, end_date)
# Filter revisions based on the course type.
@@ -117,36 +120,45 @@ def user_dict_from_sub_data(sub_data)
User.where(username: users).pluck(:username, :id).to_h
end

def sub_data_to_revision_attributes(all_sub_data, users, filtered: nil, articles: nil)
# Returns revisions from all_sub_data.
# scoped_sub_data contains filtered data according to the course type.
def sub_data_to_revision_attributes(all_sub_data, users, scoped_sub_data: nil, articles: nil)
all_sub_data.flat_map do |_a_id, article_data|
article_data['revisions'].map do |rev_data|
mw_page_id = rev_data['mw_page_id'].to_i
article_id = articles.nil? ? nil : articles[mw_page_id]
Revision.new({
mw_rev_id: rev_data['mw_rev_id'],
date: rev_data['date'],
characters: rev_data['characters'],
article_id:, mw_page_id:,
user_id: users[rev_data['username']],
new_article: string_to_boolean(rev_data['new_article']),
system: string_to_boolean(rev_data['system']),
wiki_id: rev_data['wiki_id'],
views: revision_filtered?(filtered, mw_page_id, rev_data['mw_rev_id'])
})
create_revision(rev_data, scoped_sub_data, users, articles)
end
end.uniq(&:mw_rev_id)
end

def revision_filtered?(data, mw_page_id, mw_rev_id)
return false if data.nil?
data.any? do |_, entry|
# Tells whether a revision is part of the scoped (course-type filtered) data.
#
# scoped_sub_data - enumerable of [key, entry] pairs (or nil). Only entries
#                   that are Hashes with both 'article' and 'revisions' keys
#                   are considered; anything else is ignored.
# mw_page_id      - page id of the revision; compared as a string.
# mw_rev_id       - revision id; compared as a string.
#
# Returns true if some entry has a matching article 'mw_page_id' and contains
# a revision with a matching 'mw_rev_id'; false otherwise, including when
# scoped_sub_data is nil.
def scoped_revision?(scoped_sub_data, mw_page_id, mw_rev_id)
  # sub_data_to_revision_attributes defaults scoped_sub_data to nil, so a
  # missing scope must mean "not scoped" instead of raising on nil.any?.
  return false if scoped_sub_data.nil?

  # Ids in the sub data are strings; convert once instead of once per entry.
  page_id = mw_page_id.to_s
  rev_id = mw_rev_id.to_s

  scoped_sub_data.any? do |_, entry|
    next false unless entry.is_a?(Hash) && entry['article'] && entry['revisions']

    entry['article']['mw_page_id'] == page_id &&
      entry['revisions'].any? { |rev| rev['mw_rev_id'] == rev_id }
  end
end

# Builds (without saving) a Revision from a single revision data hash.
#
# rev_data        - hash with string keys ('mw_rev_id', 'date', 'characters',
#                   'mw_page_id', 'username', 'new_article', 'system', 'wiki_id').
# scoped_sub_data - scoped data handed to scoped_revision?.
# users           - lookup from username to user id.
# articles        - lookup from integer mw_page_id to article id, or nil, in
#                   which case article_id is left nil.
#
# Note that the views field is currently used to track whether the revision
# is a scoped one.
# TODO: change the field name. Review this
def create_revision(rev_data, scoped_sub_data, users, articles)
  page_id = rev_data['mw_page_id'].to_i
  rev_id = rev_data['mw_rev_id']

  attributes = {
    mw_rev_id: rev_id,
    date: rev_data['date'],
    characters: rev_data['characters'],
    article_id: (articles[page_id] unless articles.nil?),
    mw_page_id: page_id,
    user_id: users[rev_data['username']],
    new_article: string_to_boolean(rev_data['new_article']),
    system: string_to_boolean(rev_data['system']),
    wiki_id: rev_data['wiki_id']
  }
  # views doubles as the "scoped revision" flag for now.
  attributes[:views] = scoped_revision?(scoped_sub_data, page_id, rev_id)

  Revision.new(attributes)
end

# Partition revisions between those belonging to articles in/out of mainspace/userspace/draftspace
# We need this to avoid calculating scores for articles out of pertinent spaces
# Returns [revisions_in_spaces, revisions_out_spaces]