Skip to content

Commit

Permalink
Merge pull request #5578 from gabina/5547-outreachy-round-27-refactor…
Browse files Browse the repository at this point in the history
…-revision-score-importer-and-liftwing-api

[Outreachy Round 27] Refactor `LiftWingApi`  and `RevisionScoreImporter`
  • Loading branch information
ragesoss authored Jan 17, 2024
2 parents 14ff46e + 18c7a40 commit b99cb4f
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 230 deletions.
6 changes: 3 additions & 3 deletions app/controllers/revision_feedback_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ class RevisionFeedbackController < ApplicationController
# Prepares the data for the feedback view of @rev_id (set by set_latest_revision_id):
# fetches the revision's score data through RevisionScoreImporter, builds the feedback
# from its :features, loads the assignment's suggestions and exposes the :rating.
# Returns early, assigning nothing further, when @rev_id is nil.
def index
set_latest_revision_id
return if @rev_id.nil?
# NOTE(review): the ores_data and liftwing_data statements are parallel variants of the
# same steps; as written both fetches run and the liftwing_data assignments to
# @feedback and @rating happen last, so they win.
ores_data = RevisionScoreImporter.new.fetch_ores_data_for_revision_id(@rev_id)
@feedback = RevisionFeedbackService.new(ores_data[:features]).feedback
liftwing_data = RevisionScoreImporter.new.fetch_liftwing_data_for_revision_id(@rev_id)
@feedback = RevisionFeedbackService.new(liftwing_data[:features]).feedback
@user_feedback = Assignment.find(params['assignment_id']).assignment_suggestions
@rating = ores_data[:rating]
@rating = liftwing_data[:rating]
end

private
Expand Down
149 changes: 21 additions & 128 deletions lib/importers/revision_score_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,6 @@ class RevisionScoreImporter
################
# Entry points #
################
# Updates current and previous revision scores for every language listed in
# LiftWingApi::AVAILABLE_WIKIPEDIAS and then for Wikidata.
# A fresh importer is built for each of the two update calls per wiki.
def self.update_revision_scores_for_all_wikis
  importer_options = LiftWingApi::AVAILABLE_WIKIPEDIAS.map { |language| { language: language } }
  # Wikidata has no language, so it is addressed through its project name.
  importer_options << { language: nil, project: 'wikidata' }

  importer_options.each do |options|
    new(**options).update_revision_scores
    new(**options).update_previous_revision_scores
  end
end

def self.update_revision_scores_for_course(course, update_service: nil)
course.wikis.each do |wiki|
next unless LiftWingApi.valid_wiki?(wiki)
Expand All @@ -38,11 +28,10 @@ def initialize(language: 'en', project: 'wikipedia', wiki: nil, course: nil, upd
end

# assumes a mediawiki rev_id from the correct Wikipedia
# Asks @lift_wing_api for the data of a single revision and returns a hash with
# :features and :rating extracted from it (either may be nil when the key path is absent).
# NOTE(review): two `def` lines (fetch_ores_data_for_revision_id and
# fetch_liftwing_data_for_revision_id) share a single `end` here; they read the same
# get_revision_data call through different key paths — confirm which variant is current.
def fetch_ores_data_for_revision_id(rev_id)
articlequality_data = @lift_wing_api.get_revision_data([rev_id])
features = articlequality_data.dig(wiki_key, 'scores', rev_id.to_s, model_key, 'features')
rating = articlequality_data.dig(wiki_key, 'scores', rev_id.to_s, model_key,
'score', 'prediction')
def fetch_liftwing_data_for_revision_id(rev_id)
result = @lift_wing_api.get_revision_data([rev_id])
# The result is keyed by the revision id as a string.
features = result.dig(rev_id.to_s, 'features')
rating = result.dig(rev_id.to_s, 'prediction')
return { features:, rating: }
end

Expand All @@ -69,38 +58,37 @@ def update_previous_revision_scores
##################
private

# The top-level key representing the wiki in LiftWing data (memoized).
# The language code is used when there is one; this assumes the project is Wikipedia,
# which holds for every wiki with an articlequality model. When the language is nil,
# as is the case for Wikidata, the project name is used instead.
def wiki_key
  return @wiki_key if @wiki_key

  prefix = @wiki.language || @wiki.project
  @wiki_key = "#{prefix}wiki"
end

# Name of the quality model for @wiki's project (memoized):
# 'itemquality' for wikidata, 'articlequality' for everything else.
def model_key
  @model_key ||= if @wiki.project == 'wikidata'
                   'itemquality'
                 else
                   'articlequality'
                 end
end

# Fetches LiftWing data for the mw_rev_ids of the revisions in rev_batch and passes
# the result to save_scores.
# NOTE(review): two fetch variants are present; both call get_revision_data, and the
# second `scores` assignment (the unprocessed get_revision_data result) is the one
# that reaches save_scores. scores_data is otherwise unused.
def get_and_save_scores(rev_batch)
scores_data = @lift_wing_api.get_revision_data rev_batch.map(&:mw_rev_id)
scores = scores_data.dig(wiki_key, 'scores') || {}
scores = @lift_wing_api.get_revision_data rev_batch.map(&:mw_rev_id)
save_scores(scores)
end

# Looks up the parent revisions of rev_batch, fetches LiftWing data for the parent ids
# and passes both to save_parent_scores.
# Returns early when get_parent_revisions yields nil or an empty collection.
# NOTE(review): two fetch variants are present; both call get_revision_data, and the
# second `scores` assignment (the unprocessed result) is the one that gets saved.
def get_and_save_previous_scores(rev_batch)
parent_revisions = get_parent_revisions(rev_batch)
return unless parent_revisions&.any?
# parent ids are converted with to_i before being sent to LiftWing
parent_quality_data = @lift_wing_api.get_revision_data parent_revisions.values.map(&:to_i)
scores = parent_quality_data.dig(wiki_key, 'scores') || {}
scores = @lift_wing_api.get_revision_data parent_revisions.values.map(&:to_i)
save_parent_scores(parent_revisions, scores)
end

# Stores LiftWing results on the matching Revision records of @wiki.
# `scores` maps a mediawiki revision id to a hash with 'wp10', 'features' and 'deleted'
# entries; ids without a matching Revision are skipped.
def save_scores(scores)
  scores.each do |rev_id, data|
    record = Revision.find_by(mw_rev_id: rev_id.to_i, wiki_id: @wiki.id)
    next unless record

    record.wp10 = data['wp10']
    record.features = data['features']
    # An existing deleted value is left alone unless the revision is reported as deleted.
    record.deleted = true if data['deleted']
    record.save
  end
end

# Stores the parent revision's quality data on each child revision.
# parent_revisions maps a child mw_rev_id to its parent id; scores is keyed by parent id.
# Pairs whose parent id has no entry in scores are skipped.
def save_parent_scores(parent_revisions, scores)
parent_revisions.each do |mw_rev_id, parent_id|
next unless scores.key? parent_id
# NOTE(review): two variants of the extraction and of the update are present below.
# As written, features_previous is assigned twice (the last assignment wins) and
# `.update` is chained twice on the find_by result.
article_completeness = weighted_mean_score(scores[parent_id])
features_previous = scores[parent_id]&.dig(model_key, 'features')
wp10_previous = scores.dig(parent_id, 'wp10')
features_previous = scores.dig(parent_id, 'features')
# NOTE(review): the find_by result is used without a nil check — if no Revision
# matches, calling update on it would raise; confirm that cannot happen here.
Revision.find_by(mw_rev_id: mw_rev_id.to_i, wiki: @wiki)
.update(wp10_previous: article_completeness, features_previous:)
.update(wp10_previous:, features_previous:)
end
end

Expand All @@ -119,17 +107,6 @@ def unscored_previous_revisions
mainspace_userspace_and_draft_revisions.where(features_previous: nil, new_article: false)
end

# Stores model output on the matching Revision records of @wiki.
# `scores` maps a mediawiki revision id to that revision's model data; wp10 is computed
# with weighted_mean_score and features are read from under model_key.
# Ids without a matching Revision are skipped.
def save_scores(scores)
  scores.each do |rev_id, model_data|
    record = Revision.find_by(mw_rev_id: rev_id.to_i, wiki_id: @wiki.id)
    next unless record

    record.wp10 = weighted_mean_score(model_data)
    record.features = model_data.dig(model_key, 'features')
    # The deleted flag is only ever switched on here, never reset.
    record.deleted = true if deleted?(model_data)
    record.save
  end
end

def get_parent_revisions(rev_batch)
rev_query = parent_revisions_query rev_batch.map(&:mw_rev_id)
response = WikiApi.new(@wiki, @update_service).query rev_query
Expand Down Expand Up @@ -159,89 +136,5 @@ def parent_revisions_query(rev_ids)
rvprop: 'ids' }
end

# Each *_WEIGHTING table maps a rating label of one wiki's articlequality model to a
# weight between 0 and 100; the tables are frozen.
#
# ORES articlequality ratings are often derived from the en.wiki system,
# so this is the fallback scheme.
ENWIKI_WEIGHTING = { 'FA' => 100,
'GA' => 80,
'B' => 60,
'C' => 40,
'Start' => 20,
'Stub' => 0 }.freeze
FRWIKI_WEIGHTING = { 'adq' => 100,
'ba' => 80,
'a' => 60,
'b' => 40,
'bd' => 20,
'e' => 0 }.freeze
# NOTE(review): the 'baslagıç' label presumably mirrors the model's own spelling —
# confirm against the model output before changing it.
TRWIKI_WEIGHTING = { 'sm' => 100,
'km' => 80,
'b' => 60,
'c' => 40,
'baslagıç' => 20,
'taslak' => 0 }.freeze
# Seven labels here: 'ДС' and 'ХС' share the weight 80.
RUWIKI_WEIGHTING = { 'ИС' => 100,
'ДС' => 80,
'ХС' => 80,
'I' => 60,
'II' => 40,
'III' => 20,
'IV' => 0 }.freeze
PTWIKI_WEIGHTING = { '6' => 100,
'5' => 80,
'4' => 60,
'3' => 40,
'2' => 20,
'1' => 0 }.freeze
UKWIKI_WEIGHTING = { 'ДС' => 100,
'ВС' => 80,
'I' => 60,
'II' => 40,
'III' => 20,
'IV' => 0 }.freeze
# SV wiki has three high ratings, all of which are rare:
# This is just a guess at appropriate weighting for the case where almost
# all articles are the lowest tier.
SVWIKI_WEIGHTING = { 'u' => 100,
'b' => 90,
'r' => 80,
's' => 0 }.freeze
# Five labels, spaced 25 apart.
NLWIKI_WEIGHTING = { 'A' => 100,
'B' => 75,
'C' => 50,
'D' => 25,
'E' => 0 }.freeze
# Weighting table per wiki language code. 'simple', 'fa', 'eu' and 'gl' reuse the
# en.wiki table; a language that is not listed here has no table (lookup gives nil).
WEIGHTING_BY_LANGUAGE = {
'en' => ENWIKI_WEIGHTING,
'simple' => ENWIKI_WEIGHTING,
'fa' => ENWIKI_WEIGHTING,
'eu' => ENWIKI_WEIGHTING,
'fr' => FRWIKI_WEIGHTING,
'tr' => TRWIKI_WEIGHTING,
'ru' => RUWIKI_WEIGHTING,
'uk' => UKWIKI_WEIGHTING,
'gl' => ENWIKI_WEIGHTING,
'sv' => SVWIKI_WEIGHTING,
'nl' => NLWIKI_WEIGHTING,
'pt' => PTWIKI_WEIGHTING
}.freeze

# Weighting table for @wiki's language, looked up in WEIGHTING_BY_LANGUAGE and memoized.
# Gives nil when the language has no entry.
def weighting
  return @weighting if @weighting

  @weighting = WEIGHTING_BY_LANGUAGE[@wiki.language]
end

# Collapses the articlequality probability distribution of a score into one number:
# the sum of each rating's probability multiplied by its weight from `weighting`.
#
# score - hash for one revision; the per-rating probabilities are read from
#         'articlequality' => 'score' => 'probability'. May be nil.
#
# Returns the weighted mean, or nil when the score carries no probability data or
# when no weighting table is available for the wiki's language.
def weighted_mean_score(score)
  probability = score&.dig('articlequality', 'score', 'probability')
  return unless probability && weighting

  # A rating that is absent from the probabilities contributes nothing instead of
  # raising on nil.
  weighting.sum { |rating, weight| (probability[rating] || 0) * weight }
end

# True when the error type recorded for this score, under model_key, is one of
# LiftWingApi::DELETED_REVISION_ERRORS.
def deleted?(score)
  error_type = score.dig(model_key, 'error', 'type')
  LiftWingApi::DELETED_REVISION_ERRORS.include?(error_type)
end

# Custom StandardError subclass; it is not raised in the code shown here.
class InvalidWikiError < StandardError
end
end
88 changes: 57 additions & 31 deletions lib/lift_wing_api.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# frozen_string_literal: true

require_dependency "#{Rails.root}/lib/errors/api_error_handling"
require_dependency "#{Rails.root}/lib/weighted_score_calculator"

# Gets data from Lift Wing
# Gets and processes data from Lift Wing
# https://wikitech.wikimedia.org/wiki/Machine_Learning/LiftWing
class LiftWingApi
include ApiErrorHandling
include WeightedScoreCalculator

DELETED_REVISION_ERRORS = %w[TextDeleted RevisionNotFound].freeze

Expand All @@ -22,31 +24,47 @@ def self.valid_wiki?(wiki)

# Sets up the client for one wiki.
# wiki           - must pass LiftWingApi.valid_wiki?, otherwise InvalidProjectError is raised.
# update_service - optional; stored and later handed to error logging.
def initialize(wiki, update_service = nil)
raise InvalidProjectError unless LiftWingApi.valid_wiki?(wiki)
# NOTE(review): @project_code / @project_quality_model and @wiki are both assigned here;
# they look like two variants of the same setup — confirm which one is still read.
@project_code = wiki.project == 'wikidata' ? 'wikidata' + 'wiki' : wiki.language + 'wiki'
@project_quality_model = wiki.project == 'wikidata' ? 'itemquality' : 'articlequality'
@wiki = wiki
@update_service = update_service
# errors collected while fetching revision data
@errors = []
end

# Given an array of revision ids, returns a hash with useful metrics for those
# revision ids, keyed by revision id as a string.
# Example of the result format:
# { 'rev_id0' => { 'wp10' => 0.2915228958136511656e2, 'features' => features_value,
# 'deleted' => false, 'prediction' => 'Stub' }
# ...
# 'rev_idn' => { 'wp10' => 0.285936675221734978e2, 'features' => features_value,
# 'deleted' => false, 'prediction' => 'D' }
# }
# Each revision id is fetched separately; errors collected in @errors during the batch
# are passed to log_error_batch once at the end.
def get_revision_data(rev_ids)
# Reset the errors array so it only holds errors from this batch
@errors = []
results = {}
rev_ids.each do |rev_id|
# NOTE(review): two merge variants are present (get_single_revision_data and
# get_single_revision_parsed_data); as written both results are merged into `results`.
results.deep_merge! get_single_revision_data(rev_id)
results.deep_merge!({ rev_id.to_s => get_single_revision_parsed_data(rev_id) })
end

log_error_batch(rev_ids)

return results
end

def get_single_revision_data(rev_id)
private

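# Returns a hash with wp10, features, deleted, and prediction. When the response
# contains an 'error' key, wp10 and features are nil and only deleted is set.
# When an exception is raised, it is recorded in @errors and an empty hash is returned.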
def get_single_revision_parsed_data(rev_id)
body = { rev_id:, extended_output: true }.to_json
response = lift_wing_server.post(quality_query_url, body)
parsed_response = Oj.load(response.body)
# If the responses contain an error, do not try to calculate wp10 or features.
if parsed_response.key? 'error'
return { 'wp10' => nil, 'features' => nil, 'deleted' => deleted?(parsed_response) }
end

return equivalent_ores_error_response(rev_id, parsed_response) if parsed_response.key? 'error'

parsed_response
build_successful_response(rev_id, parsed_response)
rescue StandardError => e
@errors << e
return {}
Expand All @@ -55,10 +73,19 @@ def get_single_revision_data(rev_id)
# Raised by initialize when LiftWingApi.valid_wiki?(wiki) is false.
InvalidProjectError = Class.new(StandardError)

private
# The top-level key representing the wiki in LiftWing data (memoized).
# The language code is used when there is one; this assumes the project is Wikipedia,
# which holds for every wiki with an articlequality model. When the language is nil,
# as is the case for Wikidata, the project name is used instead.
def wiki_key
  @wiki_key ||= begin
    prefix = @wiki.language || @wiki.project
    "#{prefix}wiki"
  end
end

# Name of the quality model for @wiki's project (memoized):
# 'itemquality' for wikidata, 'articlequality' for everything else.
def model_key
  return @model_key if @model_key

  @model_key = @wiki.project == 'wikidata' ? 'itemquality' : 'articlequality'
end

# Path of the model's `:predict` endpoint, used as the target of the POST request.
# NOTE(review): two string expressions are present; only the last one (built from
# wiki_key and model_key) is the method's return value, the first is evaluated and dropped.
def quality_query_url
"/service/lw/inference/v1/models/#{@project_code}-#{@project_quality_model}:predict"
"/service/lw/inference/v1/models/#{wiki_key}-#{model_key}:predict"
end

def lift_wing_server
Expand All @@ -72,25 +99,18 @@ def lift_wing_server
connection
end

# To make migration from ORES to LiftWing easier, we want responses to be in the same format,
# including error responses.
# For a deleted revision, ORES returns something like this:
# {"enwiki"=>{"scores"=>{"708326238"=>{"articlequality"=>{"error"=>
# {"message"=>"TextDeleted: Text deleted (datasource.revision.text)",
# "type"=>"TextDeleted"}}}}}}
# Lift Wing just returns something like:
# {"error"=>
# "Missing resource for rev-id 708326238: TextDeleted: Text deleted (datasource.revision.text)"}
ERROR_TYPE_MATCHER = Regexp.union DELETED_REVISION_ERRORS

# Wraps a Lift Wing error response in the nested structure ORES used:
# project code => 'scores' => revision id => model name => 'error'.
# The error 'type' is whatever part of the message matches ERROR_TYPE_MATCHER
# (nil when nothing matches).
def equivalent_ores_error_response(rev_id, error_response)
  error_text = error_response['error']
  error_details = { 'message' => error_text, 'type' => error_text[ERROR_TYPE_MATCHER] }
  model_entry = { @project_quality_model => { 'error' => error_details } }

  { @project_code => { 'scores' => { rev_id.to_s => model_entry } } }
end
# Builds the per-revision result hash ('wp10', 'features', 'deleted', 'prediction')
# from a Lift Wing response that has no 'error' key.
# NOTE(review): when the response lacks the expected wiki/scores/rev_id/model path,
# model_result is nil and reading 'features' raises NoMethodError — confirm that the
# caller's rescue is the intended handling.
def build_successful_response(rev_id, response)
  model_result = response.dig(wiki_key, 'scores', rev_id.to_s, model_key)

  # wp10 metric only makes sense to Wikipedia
  wp10 = nil
  if @wiki.project == 'wikipedia'
    wp10 = weighted_mean_score(model_result&.dig('score', 'probability'), @wiki.language)
  end

  {
    'wp10' => wp10,
    'features' => model_result.dig('features'),
    'deleted' => false,
    'prediction' => model_result.dig('score', 'prediction') # only for revision feedback
  }
end

# TODO: monitor production for errors, understand them, put benign ones here
Expand All @@ -100,8 +120,14 @@ def log_error_batch(rev_ids)
return if @errors.empty?

log_error(@errors.first, update_service: @update_service,
sentry_extra: { rev_ids:, project_code: @project_code,
project_model: @project_quality_model,
sentry_extra: { rev_ids:, project_code: wiki_key,
project_model: model_key,
error_count: @errors.count })
end

# Tells whether a Lift Wing error response refers to a deleted or missing revision,
# i.e. whether its 'error' value mentions one of LiftWingApi::DELETED_REVISION_ERRORS.
#
# response - parsed Lift Wing response hash.
#
# Returns true or false. The 'error' value is coerced to a string first, so a missing,
# nil or non-string value gives an answer instead of raising.
def deleted?(response)
  message = response['error'].to_s
  LiftWingApi::DELETED_REVISION_ERRORS.any? { |error_type| message.include?(error_type) }
end
end
Loading

0 comments on commit b99cb4f

Please sign in to comment.