From 905b37fa4b0245c403d5165c8951887308b0d1bd Mon Sep 17 00:00:00 2001 From: Caio Almeida <117518+caiosba@users.noreply.github.com> Date: Mon, 9 Sep 2024 23:34:02 -0300 Subject: [PATCH] Be able to export a full list of media clusters. (#2024) The current export limit for media lists is 10,000 because this is the maximum size of a result window in Elasticsearch. The solution is to paginate the results. Fixes: CV2-5205. --- .codeclimate.yml | 2 +- lib/check_search.rb | 82 ++++++++++++++++++++++++------------ test/lib/list_export_test.rb | 45 +++++++++++++++++--- 3 files changed, 97 insertions(+), 32 deletions(-) diff --git a/.codeclimate.yml b/.codeclimate.yml index 773f85a550..ff19a8b13d 100644 --- a/.codeclimate.yml +++ b/.codeclimate.yml @@ -10,7 +10,7 @@ checks: enabled: false method-complexity: config: - threshold: 22 + threshold: 25 method-count: config: threshold: 65 diff --git a/lib/check_search.rb b/lib/check_search.rb index ce8746209e..12dc2e422b 100644 --- a/lib/check_search.rb +++ b/lib/check_search.rb @@ -89,6 +89,10 @@ def team Team.find_by_id(team_id) end + def feed + @feed + end + def teams [] end @@ -335,40 +339,66 @@ def medias_get_search_result(query) def self.get_exported_data(query, team_id) team = Team.find(team_id) + Team.current = team search = CheckSearch.new(query, nil, team_id) + feed_sharing_only_fact_checks = (search.feed && search.feed.data_points == [1]) # Prepare the export data = [] - header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags'] - fields = team.team_tasks.sort - fields.each { |tt| header << tt.label } + header = nil + if feed_sharing_only_fact_checks + header = ['Fact-check title', 'Fact-check summary', 'Fact-check URL', 'Tags', 'Workspace', 'Updated at', 'Rating'] + else + header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags'] + fields = team.team_tasks.sort + fields.each { |tt| header << tt.label 
} + end data << header - # No pagination for the export - search.set_option('esoffset', 0) - search.set_option('eslimit', CheckConfig.get(:export_csv_maximum_number_of_results, 10000, :integer)) - - # Iterate through each result and generate an output row for the CSV - search.medias.find_each do |pm| - row = [ - pm.claim_description&.description, - pm.full_url, - pm.status_i18n, - pm.author_name.to_s.gsub(/ \[.*\]$/, ''), - pm.created_at.strftime("%Y-%m-%d %H:%M:%S"), - pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"), - pm.linked_items_count, - pm.tags_as_sentence - ] - annotations = pm.get_annotations('task').map(&:load) - fields.each do |field| - annotation = annotations.find { |a| a.team_task_id == field.id } - answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '') - answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end - row << answer + # Paginate + search_after = [0] + while !search_after.empty? 
+ result = $repository.search(_source: 'annotated_id', query: search.medias_query, sort: [{ annotated_id: { order: :asc } }], size: 10000, search_after: search_after).results + ids = result.collect{ |i| i['annotated_id'] }.uniq.compact.map(&:to_i) + + # Iterate through each result and generate an output row for the CSV + ProjectMedia.where(id: ids, team_id: search.team_condition(team_id)).find_each do |pm| + row = nil + if feed_sharing_only_fact_checks + row = [ + pm.fact_check_title, + pm.fact_check_summary, + pm.fact_check_url, + pm.tags_as_sentence, + pm.team_name, + pm.updated_at_timestamp, + pm.status + ] + else + row = [ + pm.claim_description&.description, + pm.full_url, + pm.status_i18n, + pm.author_name.to_s.gsub(/ \[.*\]$/, ''), + pm.created_at.strftime("%Y-%m-%d %H:%M:%S"), + pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"), + pm.linked_items_count, + pm.tags_as_sentence + ] + annotations = pm.get_annotations('task').map(&:load) + fields.each do |field| + annotation = annotations.find { |a| a.team_task_id == field.id } + answer = (annotation ? 
(begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '') + answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end + row << answer + end + end + data << row end - data << row + + search_after = [ids.max].compact end + data end diff --git a/test/lib/list_export_test.rb b/test/lib/list_export_test.rb index 668797d6e6..15551ba12d 100644 --- a/test/lib/list_export_test.rb +++ b/test/lib/list_export_test.rb @@ -26,21 +26,26 @@ def teardown end end - test "should export media CSV" do + test "should export media (including child media) CSV" do + setup_elasticsearch t = create_team create_team_task team_id: t.id, fieldset: 'tasks' - 2.times { create_project_media team: t } + parent = create_project_media team: t, disable_es_callbacks: false + child = create_project_media team: t, disable_es_callbacks: false + create_relationship source_id: parent.id, target_id: child.id, relationship_type: Relationship.confirmed_type - export = ListExport.new(:media, '{}', t.id) + sleep 2 # Wait for indexing + + export = ListExport.new(:media, { show_similar: true }.to_json, t.id) csv_url = export.generate_csv_and_send_email(create_user) response = Net::HTTP.get_response(URI(csv_url)) assert_equal 200, response.code.to_i csv_content = CSV.parse(response.body, headers: true) - assert_equal 2, csv_content.size assert_equal 2, export.number_of_rows + assert_equal 2, csv_content.size end - test "should export feed CSV" do + test "should export media feed CSV" do t = create_team f = create_feed team: t 2.times { f.clusters << create_cluster } @@ -54,6 +59,36 @@ def teardown assert_equal 2, export.number_of_rows end + test "should export fact-check feed CSV" do + setup_elasticsearch + RequestStore.store[:skip_cached_field_update] = false + + pender_url = CheckConfig.get('pender_url_private') + WebMock.stub_request(:get, /#{pender_url}/).to_return(body: '{}', status: 200) + + t = create_team + 2.times do 
+ pm = create_project_media team: t, disable_es_callbacks: false + r = publish_report(pm, {}, nil, { language: 'en', use_visual_card: false }) + r = Dynamic.find(r.id) + r.disable_es_callbacks = false + r.set_fields = { state: 'published' }.to_json + r.save! + end + ss = create_saved_search team: t + f = create_feed team: t, data_points: [1], saved_search: ss, published: true + + sleep 2 # Wait for indexing + + export = ListExport.new(:media, { feed_id: f.id, feed_view: 'fact_check' }.to_json, t.id) + csv_url = export.generate_csv_and_send_email(create_user) + response = Net::HTTP.get_response(URI(csv_url)) + assert_equal 200, response.code.to_i + csv_content = CSV.parse(response.body, headers: true) + assert_equal 2, export.number_of_rows + assert_equal 2, csv_content.size + end + test "should export fact-checks CSV" do t = create_team 2.times do