From 0e95c0e85d4524dfdda8e1277ea245566a27e712 Mon Sep 17 00:00:00 2001 From: Mohamed El-Sawy Date: Tue, 11 Feb 2025 07:10:32 +0200 Subject: [PATCH] CV2-6038 refactor has claim search filter into has article filter (#2205) * CV2-6038: include explainer_title to has_article filter * CV2-6038: set value with nil instead of remove field from ES doc * CV2-6038: fix tests * CV2-6038: search by explainer_title field * CV2-6038: add rake task to migrate exisintg explainers * CV2-6038: use a single query for explainer titles * CV2-6038: apply PR comments --- app/lib/check_elastic_search.rb | 26 ++------- app/models/annotations/embed.rb | 22 -------- app/models/claim_description.rb | 26 ++++----- .../concerns/project_media_cached_fields.rb | 1 + app/models/concerns/project_media_getters.rb | 6 ++ app/models/explainer_item.rb | 19 +++++++ app/repositories/media_search.rb | 2 + app/workers/elastic_search_worker.rb | 1 - ...9_add_mapping_for_explainer_title_field.rb | 13 +++++ lib/check_search.rb | 26 +++++---- ...add_mapping_for_explainer_title_field.rake | 21 +++++++ test/controllers/elastic_search_7_test.rb | 20 +++---- test/controllers/elastic_search_9_test.rb | 56 ++++++++++++++++--- 13 files changed, 151 insertions(+), 88 deletions(-) create mode 100644 db/migrate/20250205224319_add_mapping_for_explainer_title_field.rb create mode 100644 lib/tasks/migrate/20250205224319_add_mapping_for_explainer_title_field.rake diff --git a/app/lib/check_elastic_search.rb b/app/lib/check_elastic_search.rb index b43a44ed9c..7a6fd94bd6 100644 --- a/app/lib/check_elastic_search.rb +++ b/app/lib/check_elastic_search.rb @@ -37,13 +37,6 @@ def update_elasticsearch_doc(keys, data = {}, pm_id = nil, skip_get_data = false ElasticSearchWorker.perform_in(1.second, YAML::dump(model), YAML::dump(options), 'update_doc') end - def remove_fields_from_elasticsearch_doc(keys, pm_id) - return if self.disable_es_callbacks || RequestStore.store[:disable_es_callbacks] - options = { keys: keys, pm_id: pm_id } - model = { klass: self.class.name, id: self.id } - ElasticSearchWorker.perform_in(1.second, YAML::dump(model), YAML::dump(options), 'remove_fields') - end - def update_recent_activity(obj) # update `updated_at` date for both PG & ES updated_at = Time.now @@ -56,13 +49,11 @@ def update_elasticsearch_doc_bg(options) data = get_elasticsearch_data(options[:data], options[:skip_get_data]) fields = {} options[:keys].each do |k| - unless data[k].nil? - if data[k].class.to_s == 'Hash' - value = get_fresh_value(data[k].with_indifferent_access) - fields[k] = value unless value.nil? - else - fields[k] = data[k] - end + if data[k].class.to_s == 'Hash' + value = get_fresh_value(data[k].with_indifferent_access) + fields[k] = value + else + fields[k] = data[k] end end if fields.count @@ -72,13 +63,6 @@ def update_elasticsearch_doc_bg(options) end end - def remove_fields_from_elasticsearch_doc_bg(options) - options[:keys].each do |k| - source = "ctx._source.remove('#{k}')" - $repository.client.update index: CheckElasticSearchModel.get_index_alias, id: options[:doc_id], body: { script: { source: source } } - end - end - # Get a fresh data based on data(Hash) def get_fresh_value(data) value = data['default'] diff --git a/app/models/annotations/embed.rb b/app/models/annotations/embed.rb index 485fcf119a..90682701f4 100644 --- a/app/models/annotations/embed.rb +++ b/app/models/annotations/embed.rb @@ -5,8 +5,6 @@ class Embed < Dynamic end Dynamic.class_eval do - after_commit :update_elasticsearch_metadata, on: [:create, :update], if: proc { |d| ['metadata', 'verification_status'].include?(d.annotation_type) } - def title=(title) self.set_metadata_field('title', title) end @@ -36,26 +34,6 @@ def get_metadata_field(field) data[field.to_s] end - def update_elasticsearch_metadata - unless self.annotated.nil? - keys = %w(title description) - if self.annotated_type == 'ProjectMedia' - self.update_es_metadata_pm_annotation(keys, self.annotated) - elsif self.annotated_type == 'Media' && self.annotated.type == 'Link' - self.annotated.project_medias.each do |pm| - m = pm.get_annotations('metadata').last - self.update_elasticsearch_doc(keys, { 'title' => pm.title, 'description' => pm.description }, pm.id) if m.nil? - end - end - end - end - - def update_es_metadata_pm_annotation(keys, pm) - data = {} - keys.each { |k| data[k] = self.send(k) } - self.update_elasticsearch_doc(keys, data, pm.id) - end - def metadata_for_registration_account(data) return nil unless self.annotation_type == 'metadata' unless data.nil? diff --git a/app/models/claim_description.rb b/app/models/claim_description.rb index cd9352ea83..08abeffbd4 100644 --- a/app/models/claim_description.rb +++ b/app/models/claim_description.rb @@ -36,12 +36,14 @@ def text_fields def article_elasticsearch_data(action = 'create_or_update') return if self.project_media_id.nil? || self.disable_es_callbacks || RequestStore.store[:disable_es_callbacks] - if action == 'destroy' - self.remove_fields_from_elasticsearch_doc(['claim_description_content', 'claim_description_context'], self.project_media_id) - else - data = { 'claim_description_content' => self.description, 'claim_description_context' => self.context } - self.index_in_elasticsearch(self.project_media_id, data) - end + data = action == 'destroy' ? { + 'claim_description_content' => nil, + 'claim_description_context' => nil + } : { + 'claim_description_content' => self.description, + 'claim_description_context' => self.context + } + self.index_in_elasticsearch(self.project_media_id, data) end def project_media_was @@ -90,13 +92,11 @@ def update_report fact_check.save! end # update ES - # Remove claim_description fields - self.remove_fields_from_elasticsearch_doc(['claim_description_content', 'claim_description_context'], pm.id) + # clear claim_description fields + data = { 'claim_description_content' => nil, 'claim_description_context' => nil } # clear fact-check values - unless self.fact_check.nil? - data = { 'fact_check_title' => '', 'fact_check_summary' => '', 'fact_check_url' => '', 'fact_check_languages' => [] } - self.fact_check.index_in_elasticsearch(pm.id, data) - end + data.merge!({ 'fact_check_title' => '', 'fact_check_summary' => '', 'fact_check_url' => '', 'fact_check_languages' => [] }) unless self.fact_check.nil? + self.index_in_elasticsearch(pm.id, data) end end @@ -128,7 +128,7 @@ def migrate_claim_and_fact_check_logs def log_relevant_article_results fc = self.fact_check - self.project_media.delay.log_relevant_results(fc.class.name, fc.id, User.current&.id, self.class.actor_session_id) + self.project_media.delay.log_relevant_results(fc.class.name, fc.id, User.current&.id, self.class.actor_session_id) unless fc.nil? end def cant_apply_article_to_item_if_article_is_in_the_trash diff --git a/app/models/concerns/project_media_cached_fields.rb b/app/models/concerns/project_media_cached_fields.rb index 93acdafee6..29e06268f4 100644 --- a/app/models/concerns/project_media_cached_fields.rb +++ b/app/models/concerns/project_media_cached_fields.rb @@ -214,6 +214,7 @@ def title_or_description_update update_on: FACT_CHECK_EVENTS cached_field :description, + update_es: true, recalculate: :recalculate_description, update_on: title_or_description_update diff --git a/app/models/concerns/project_media_getters.rb b/app/models/concerns/project_media_getters.rb index 0ce240c337..d775a8ea23 100644 --- a/app/models/concerns/project_media_getters.rb +++ b/app/models/concerns/project_media_getters.rb @@ -213,4 +213,10 @@ def team_avatar def fact_check self.claim_description&.fact_check end + + def explainers_titles + # Get the title for all explainer assigned to the item + titles = Explainer.joins(:explainer_items).where('explainer_items.project_media_id = ?', self.id).map(&:title).join("\n") + titles.blank? ? nil : titles + end end diff --git a/app/models/explainer_item.rb b/app/models/explainer_item.rb index 7359df17de..7051a5e021 100644 --- a/app/models/explainer_item.rb +++ b/app/models/explainer_item.rb @@ -10,6 +10,7 @@ class ExplainerItem < ApplicationRecord validate :cant_apply_article_to_item_if_article_is_in_the_trash after_create :log_relevant_article_results + after_commit :update_elasticsearch_data def version_metadata(_changes) { explainer_title: self.explainer.title }.to_json @@ -25,6 +26,24 @@ def cant_apply_article_to_item_if_article_is_in_the_trash errors.add(:base, I18n.t(:cant_apply_article_to_item_if_article_is_in_the_trash)) if self.explainer&.trashed end + def update_elasticsearch_data + return if self.disable_es_callbacks || RequestStore.store[:disable_es_callbacks] + pm = self.project_media + # touch item to update `updated_at` date + if ProjectMedia.exists?(pm.id) + updated_at = Time.now + pm.update_columns(updated_at: updated_at) + data = { updated_at: updated_at.utc } + data['explainer_title'] = { + method: "explainers_titles", + klass: pm.class.name, + id: pm.id, + default: nil, + } + pm.update_elasticsearch_doc(data.keys, data, pm.id, true) + end + end + def log_relevant_article_results ex = self.explainer self.project_media.delay.log_relevant_results(ex.class.name, ex.id, User.current&.id, self.class.actor_session_id) diff --git a/app/repositories/media_search.rb b/app/repositories/media_search.rb index ea534bc737..7ada8a0af0 100644 --- a/app/repositories/media_search.rb +++ b/app/repositories/media_search.rb @@ -132,5 +132,7 @@ class MediaSearch indexes :negative_tipline_search_results_count, { type: 'long' } indexes :tipline_search_results_count, { type: 'long' } + + indexes :explainer_title, { type: 'text', analyzer: 'check' } end end diff --git a/app/workers/elastic_search_worker.rb b/app/workers/elastic_search_worker.rb index 33a79d5650..87ca4da87c 100644 --- a/app/workers/elastic_search_worker.rb +++ b/app/workers/elastic_search_worker.rb @@ -15,7 +15,6 @@ def perform(model_data, options, type) ops = { 'create_doc' => 'create_elasticsearch_doc_bg', 'update_doc' => 'update_elasticsearch_doc_bg', - 'remove_fields' => 'remove_fields_from_elasticsearch_doc_bg', 'update_doc_team' => 'update_elasticsearch_doc_team_bg', 'create_update_doc_nested' => 'create_update_nested_obj_bg', 'destroy_doc' => 'destroy_elasticsearch_doc', diff --git a/db/migrate/20250205224319_add_mapping_for_explainer_title_field.rb b/db/migrate/20250205224319_add_mapping_for_explainer_title_field.rb new file mode 100644 index 0000000000..3c1a60ff0a --- /dev/null +++ b/db/migrate/20250205224319_add_mapping_for_explainer_title_field.rb @@ -0,0 +1,13 @@ +class AddMappingForExplainerTitleField < ActiveRecord::Migration[6.1] + def change + options = { + index: CheckElasticSearchModel.get_index_alias, + body: { + properties: { + explainer_title: { type: 'text', analyzer: 'check' }, + } + } + } + $repository.client.indices.put_mapping options + end +end diff --git a/lib/check_search.rb b/lib/check_search.rb index c14d388ebd..50ced2a057 100644 --- a/lib/check_search.rb +++ b/lib/check_search.rb @@ -168,7 +168,7 @@ def should_hit_elasticsearch? end filters_blank = true ['tags', 'keyword', 'language', 'fc_language', 'request_language', 'report_language', 'team_tasks', 'assigned_to', 'report_status', 'range_numeric', - 'has_claim', 'cluster_teams', 'published_by', 'annotated_by', 'channels', 'cluster_published_reports' + 'has_article', 'cluster_teams', 'published_by', 'annotated_by', 'channels', 'cluster_published_reports' ].each do |filter| filters_blank = false unless @options[filter].blank? end @@ -309,7 +309,7 @@ def build_es_medias_query custom_conditions.concat integer_terms_query('channel', 'channels') custom_conditions.concat integer_terms_query('source_id', 'sources') custom_conditions.concat doc_conditions - custom_conditions.concat has_claim_conditions + custom_conditions.concat has_article_conditions custom_conditions.concat file_filter custom_conditions.concat range_filter(:es) custom_conditions.concat numeric_range_filter @@ -469,8 +469,7 @@ def keyword_conditions return [] if @options["keyword"].blank? || @options["keyword"].class.name != 'String' set_keyword_fields keyword_c = [] - field_conditions = build_keyword_conditions_media_fields - keyword_c.concat field_conditions + keyword_c.concat build_keyword_conditions_media_fields # Search in requests [['request_username', 'username'], ['request_identifier', 'identifier'], ['request_content', 'content']].each do |pair| keyword_c << { @@ -497,7 +496,7 @@ def set_keyword_fields def build_keyword_conditions_media_fields es_fields = [] conditions = [] - %w(title description url claim_description_content fact_check_title fact_check_summary claim_description_context fact_check_url source_name).each do |f| + %w(title description url claim_description_content fact_check_title fact_check_summary claim_description_context fact_check_url source_name explainer_title).each do |f| es_fields << f if should_include_keyword_field?(f) end es_fields << 'analysis_title' if should_include_keyword_field?('title') @@ -530,13 +529,18 @@ def request_language_conditions [{ nested: { path: 'requests', query: { terms: { 'requests.language': @options['request_language'] } } } }] end - def has_claim_conditions + def has_article_conditions conditions = [] - return conditions unless @options.has_key?('has_claim') - if @options['has_claim'].include?('NO_VALUE') - conditions << { bool: { must_not: [ { exists: { field: 'claim_description_content' } } ] } } - elsif @options['has_claim'].include?('ANY_VALUE') - conditions << { exists: { field: 'claim_description_content' } } + return conditions unless @options.has_key?('has_article') + # Build a condidtion with fields that define the item has_article + has_article_c = [] + ['claim_description_content', 'explainer_title'].each do |field| + has_article_c << { exists: { field: field } } + end + if @options['has_article'].include?('NO_VALUE') + conditions << { bool: { must_not: has_article_c } } + elsif @options['has_article'].include?('ANY_VALUE') + conditions << { bool: { should: has_article_c } } end conditions end diff --git a/lib/tasks/migrate/20250205224319_add_mapping_for_explainer_title_field.rake b/lib/tasks/migrate/20250205224319_add_mapping_for_explainer_title_field.rake new file mode 100644 index 0000000000..9095db5c60 --- /dev/null +++ b/lib/tasks/migrate/20250205224319_add_mapping_for_explainer_title_field.rake @@ -0,0 +1,21 @@ +namespace :check do + namespace :migrate do + task project_media_explainer_title: :environment do + started = Time.now.to_i + index_alias = CheckElasticSearchModel.get_index_alias + skip_pmids = [] + es_body = [] + ExplainerItem.find_each do |raw| + next if skip_pmids.include?(raw.project_media_id) + pm = raw.project_media + doc_id = Base64.encode64("ProjectMedia/#{pm.id}") + fields = { 'explainer_title' => pm.explainers_titles } + es_body << { update: { _index: index_alias, _id: doc_id, retry_on_conflict: 3, data: { doc: fields } } } + skip_pmids << pm.id + end + $repository.client.bulk body: es_body unless es_body.blank? + minutes = ((Time.now.to_i - started) / 60).to_i + puts "[#{Time.now}] Done in #{minutes} minutes." + end + end +end \ No newline at end of file diff --git a/test/controllers/elastic_search_7_test.rb b/test/controllers/elastic_search_7_test.rb index 6788b0963f..4a239f660a 100644 --- a/test/controllers/elastic_search_7_test.rb +++ b/test/controllers/elastic_search_7_test.rb @@ -170,10 +170,10 @@ def setup assert_equal [pm.id, pm2.id], result.medias.map(&:id).sort end - test "should filter by keyword and tite description tags and notes fields fields" do + test "should filter by keyword and field settings(tite description tags)" do + RequestStore.store[:skip_cached_field_update] = false create_verification_status_stuff(false) t = create_team - p = create_project team: t pender_url = CheckConfig.get('pender_url_private') + '/api/medias' url = 'http://test.com' response = '{"type":"media","data":{"url":"' + url + '/normalized","type":"item", "title": "search_title", "description":"search_desc"}}' @@ -183,20 +183,20 @@ def setup WebMock.stub_request(:get, pender_url).with({ query: { url: url2 } }).to_return(body: response) m = create_media(account: create_valid_account, url: url) m1 = create_media(account: create_valid_account, url: url2) - pm = create_project_media project: p, media: m, disable_es_callbacks: false - pm2 = create_project_media project: p, media: m1, disable_es_callbacks: false + pm = create_project_media team: t, media: m, disable_es_callbacks: false + pm2 = create_project_media team: t, media: m1, disable_es_callbacks: false # add analysis to pm2 pm2.analysis = { title: 'override_title', content: 'override_description' } # add tags to pm3 - pm3 = create_project_media project: p, disable_es_callbacks: false + pm3 = create_project_media team: t, disable_es_callbacks: false create_tag tag: 'search_title', annotated: pm3, disable_es_callbacks: false create_tag tag: 'another_desc', annotated: pm3, disable_es_callbacks: false create_tag tag: 'newtag', annotated: pm3, disable_es_callbacks: false - sleep 2 + sleep 1 result = CheckSearch.new({keyword: 'search_title'}.to_json, nil, t.id) - assert_equal [pm.id, pm2.id, pm3.id], result.medias.map(&:id).sort + assert_equal [pm.id, pm3.id], result.medias.map(&:id).sort result = CheckSearch.new({keyword: 'search_title', keyword_fields: {fields: ['title']}}.to_json, nil, t.id) - assert_equal [pm.id, pm2.id], result.medias.map(&:id).sort + assert_equal [pm.id], result.medias.map(&:id) result = CheckSearch.new({keyword: 'search_desc', keyword_fields: {fields: ['description']}}.to_json, nil, t.id) assert_equal [pm.id], result.medias.map(&:id) result = CheckSearch.new({keyword: 'override_title', keyword_fields: {fields: ['title']}}.to_json, nil, t.id) @@ -206,9 +206,7 @@ def setup result = CheckSearch.new({keyword: 'search_title', keyword_fields: {fields: ['tags']}}.to_json, nil, t.id) assert_equal [pm3.id], result.medias.map(&:id) result = CheckSearch.new({keyword: 'another_desc', keyword_fields: {fields:['description', 'tags']}}.to_json, nil, t.id) - assert_equal [pm2.id, pm3.id], result.medias.map(&:id).sort - create_comment annotated: pm, text: 'item notepm', disable_es_callbacks: false - create_comment annotated: pm2, text: 'item comment', disable_es_callbacks: false + assert_equal [pm3.id], result.medias.map(&:id) end test "should search by media url" do diff --git a/test/controllers/elastic_search_9_test.rb b/test/controllers/elastic_search_9_test.rb index 14993cc7bd..e35d5aa71b 100644 --- a/test/controllers/elastic_search_9_test.rb +++ b/test/controllers/elastic_search_9_test.rb @@ -6,28 +6,66 @@ def setup setup_elasticsearch end - test "should filter items by has_claim" do + test "should filter items by has_article" do t = create_team p = create_project team: t u = create_user create_team_user team: t, user: u, role: 'admin' with_current_user_and_team(u ,t) do - pm = create_project_media team: t, disable_es_callbacks: false + pm = create_project_media team: t, quote: 'explainer_search', disable_es_callbacks: false pm2 = create_project_media team: t, disable_es_callbacks: false pm3 = create_project_media team: t, disable_es_callbacks: false pm4 = create_project_media team: t, disable_es_callbacks: false + pm5 = create_project_media team: t, disable_es_callbacks: false cd = create_claim_description project_media: pm, disable_es_callbacks: false - create_claim_description project_media: pm3, disable_es_callbacks: false + ex2_a = create_explainer team: t + ex2_b = create_explainer team: t + ex3 = create_explainer team: t + pm2.explainers << ex2_a + pm2.explainers << ex2_b + pm3.explainers << ex3 + ex4 = create_explainer team: t, title: 'explainer_search' + cd4 = create_claim_description project_media: pm4, disable_es_callbacks: false + pm4.explainers << ex4 + sleep 1 + results = CheckSearch.new({ has_article: ['ANY_VALUE'] }.to_json) + assert_equal [pm.id, pm2.id, pm3.id, pm4.id], results.medias.map(&:id).sort + results = CheckSearch.new({ has_article: ['NO_VALUE'] }.to_json) + assert_equal [pm5.id], results.medias.map(&:id).sort + # remove explainer + ExplainerItem.where(explainer_id: ex2_a.id, project_media_id: pm2.id).destroy_all + ExplainerItem.where(explainer_id: ex3.id, project_media_id: pm3.id).destroy_all + cd4.project_media_id = nil + cd4.save! sleep 1 - results = CheckSearch.new({ has_claim: ['ANY_VALUE'] }.to_json) - assert_equal [pm.id, pm3.id], results.medias.map(&:id).sort - results = CheckSearch.new({ has_claim: ['NO_VALUE'] }.to_json) - assert_equal [pm2.id, pm4.id], results.medias.map(&:id).sort + results = CheckSearch.new({ has_article: ['ANY_VALUE'] }.to_json) + assert_equal [pm.id, pm2.id, pm4.id], results.medias.map(&:id).sort + results = CheckSearch.new({ has_article: ['NO_VALUE'] }.to_json) + assert_equal [pm3.id, pm5.id], results.medias.map(&:id).sort + # remove fact-check and explainer cd.project_media_id = nil cd.save! + ExplainerItem.where(explainer_id: ex2_b.id, project_media_id: pm2.id).destroy_all + ExplainerItem.where(explainer_id: ex4.id, project_media_id: pm4.id).destroy_all sleep 1 - results = CheckSearch.new({ has_claim: ['NO_VALUE'] }.to_json) - assert_equal [pm.id, pm2.id, pm4.id], results.medias.map(&:id).sort + results = CheckSearch.new({ has_article: ['ANY_VALUE'] }.to_json) + assert_empty results.medias.map(&:id) + results = CheckSearch.new({ has_article: ['NO_VALUE'] }.to_json) + assert_equal [pm.id, pm2.id, pm3.id, pm4.id, pm5.id], results.medias.map(&:id).sort + # re-assing fact or explainer + cd.project_media_id = pm2.id + cd.save! + pm5.explainers << ex4 + sleep 1 + results = CheckSearch.new({ has_article: ['ANY_VALUE'] }.to_json) + assert_equal [pm2.id, pm5.id], results.medias.map(&:id).sort + results = CheckSearch.new({ has_article: ['NO_VALUE'] }.to_json) + assert_equal [pm.id, pm3.id, pm4.id], results.medias.map(&:id).sort + # Verify search by explainer_title field + result = CheckSearch.new({keyword: 'explainer_search'}.to_json, nil, t.id) + assert_equal [pm.id, pm5.id], result.medias.map(&:id).sort + result = CheckSearch.new({keyword: 'explainer_search', keyword_fields: {fields: ['explainer_title']}}.to_json, nil, t.id) + assert_equal [pm5.id], result.medias.map(&:id) end end