From 1f27910ee7fc1f1db2644bd49ae6af3505c2869f Mon Sep 17 00:00:00 2001
From: Caio Almeida <117518+caiosba@users.noreply.github.com>
Date: Thu, 13 Feb 2025 17:22:39 -0300
Subject: [PATCH] Rake task to export text similarity data (#2207)

Adding a new rake task that exports text similarity data from a set of workspaces and generates a JSON file output uploaded to an S3 bucket.

References: CV2-6069 and CV2-6030.
---
 app/graph/types/project_media_type.rb        |   8 -
 app/models/concerns/project_media_getters.rb |   8 +
 lib/tasks/data/text_similarity_data.rake     | 187 +++++++++++++++++++
 3 files changed, 195 insertions(+), 8 deletions(-)
 create mode 100644 lib/tasks/data/text_similarity_data.rake

diff --git a/app/graph/types/project_media_type.rb b/app/graph/types/project_media_type.rb
index 05a09ed7b..99a5edd9f 100644
--- a/app/graph/types/project_media_type.rb
+++ b/app/graph/types/project_media_type.rb
@@ -240,16 +240,8 @@ def published
 
   field :language, GraphQL::Types::String, null: true
 
-  def language
-    object.get_dynamic_annotation("language")&.get_field "language"&.send(:to_s)
-  end
-
   field :language_code, GraphQL::Types::String, null: true
 
-  def language_code
-    object.get_dynamic_annotation("language")&.get_field_value("language")
-  end
-
   field :annotation, AnnotationType, null: true do
     argument :annotation_type, GraphQL::Types::String, required: true, camelize: false
   end
diff --git a/app/models/concerns/project_media_getters.rb b/app/models/concerns/project_media_getters.rb
index d775a8ea2..7d54a892f 100644
--- a/app/models/concerns/project_media_getters.rb
+++ b/app/models/concerns/project_media_getters.rb
@@ -219,4 +219,12 @@ def explainers_titles
     titles = Explainer.joins(:explainer_items).where('explainer_items.project_media_id = ?', self.id).map(&:title).join("\n")
     titles.blank? ? nil : titles
   end
+
+  def language
+    self.get_dynamic_annotation('language')&.get_field('language')&.send(:to_s)
+  end
+
+  def language_code
+    self.get_dynamic_annotation('language')&.get_field_value('language')
+  end
 end
diff --git a/lib/tasks/data/text_similarity_data.rake b/lib/tasks/data/text_similarity_data.rake
new file mode 100644
index 000000000..c23022baf
--- /dev/null
+++ b/lib/tasks/data/text_similarity_data.rake
@@ -0,0 +1,187 @@
+# bundle exec rake check:data:text_similarity_data['s3_bucket_name=s3_bucket_name&workspace_slugs[]=workspace-slug-1&workspace_slugs[]=workspace-slug-2&...&workspace_slugs[]=workspace-slug-n']
+
+namespace :check do
+  namespace :data do
+    desc 'Export text similarity data.'
+    task :text_similarity_data, [:query_string] => [:environment] do |_task, args|
+      ActiveRecord::Base.logger = nil
+
+      # Returns the key of the channel whose code equals the item's main channel code, looking at the
+      # top-level channels first and then at the ones nested under 'TIPLINE'; nil when none matches.
+      def channel_name(item)
+        CheckChannels::ChannelCodes.all_channels.find{ |_k, v| v == item.channel.to_h['main'].to_i }.to_a[0] ||
+        CheckChannels::ChannelCodes.all_channels['TIPLINE'].find{ |_k, v| v == item.channel.to_h['main'].to_i }.to_a[0]
+      end
+
+      # Returns the key of the origin whose code equals the item's media cluster origin,
+      # or nil when none matches.
+      def origin_name(item)
+        CheckMediaClusterOrigins::OriginCodes.all_origins.find{ |_k, v| v == item.media_cluster_origin.to_i }.to_a[0]
+      end
+
+      # Builds an export entry for an article ('explainer' or 'fact-check') that is not attached to an
+      # item: every item-level field is nil and the article is the single element of :articles.
+      def standalone_article(article, type)
+        body_method_mapping = {
+          'explainer' => :description,
+          'fact-check' => :summary
+        }
+        claim = nil
+        claim = article.claim_description.description if type == 'fact-check'
+        {
+          id: nil,
+          team_id: nil,
+          team_slug: nil,
+          media_id: nil,
+          title: nil,
+          description: nil,
+          channel: nil,
+          origin: nil,
+          relationships: [],
+          articles: [{
+            id: article.graphql_id,
+            title: article.title,
+            body: article.send(body_method_mapping[type]),
+            url: article.url,
+            type: type,
+            claim: claim,
+            created_at: article.created_at,
+            language: article.language
+          }]
+        }
+      end
+
+      # Parse input parameters
+      options = Rack::Utils.parse_nested_query(args[:query_string])
+      s3_bucket_name = options['s3_bucket_name']
+      workspace_slugs = options['workspace_slugs']
+
+      # Fail fast on missing input instead of finding out only after the whole export has run
+      if s3_bucket_name.blank? || workspace_slugs.blank?
+        puts 'Please provide the S3 bucket name and at least one workspace slug.'
+        exit 1
+      end
+
+      team_ids = Team.where(slug: workspace_slugs).map(&:id)
+
+      # Structure for the data: Array of next project media objects like:
+      # {
+      #   id: 1,
+      #   title: 'Example',
+      #   ...,
+      #   articles: [...],
+      #   relationships: [...],
+      # }
+      data = []
+
+      # Single-query approach (WIP / draft)
+      # query = ProjectMedia
+      #   .joins(:team).where('teams.slug' => workspace_slugs) # Only items from the selected workspaces
+      #   .joins('LEFT JOIN explainer_items ei ON ei.project_media_id = project_medias.id LEFT JOIN explainers e ON e.id = ei.explainer_id') # Includes explainers
+      #   .joins('LEFT JOIN claim_descriptions cd ON cd.project_media_id = project_medias.id LEFT JOIN fact_checks fc ON fc.claim_description_id = cd.id') # Includes fact-checks
+      #   .joins('LEFT JOIN relationships r ON r.source_id = project_medias.id OR r.target_id = project_medias.id') # Includes relationships
+      # query.to_sql
+
+      # ActiveRecord approach
+
+      # Get text items associated with explainers and fact-checks
+      query = ProjectMedia.joins(:media).where('project_medias.team_id' => team_ids, 'medias.type' => 'Claim') # Only text items from the selected workspaces
+      total = query.count
+      i = 0
+      query.find_each do |item|
+        i += 1
+        puts "[#{Time.now}] Exporting item #{i}/#{total}..."
+
+        # Item data
+        object = {
+          id: item.id,
+          team_id: item.team_id,
+          team_slug: item.team.slug,
+          media_id: item.media_id,
+          title: item.title,
+          description: item.description,
+          channel: channel_name(item),
+          origin: origin_name(item),
+          created_at: item.created_at,
+          language: item.language_code
+        }
+
+        # Explainers, if any
+        object[:articles] = []
+        item.explainers.find_each do |explainer|
+          object[:articles] << {
+            id: explainer.graphql_id,
+            title: explainer.title,
+            body: explainer.description,
+            url: explainer.url,
+            claim: nil,
+            type: 'explainer',
+            created_at: explainer.created_at,
+            language: explainer.language
+          }
+        end
+
+        # Fact-check, if any
+        fact_check = item.fact_check
+        unless fact_check.nil?
+          object[:articles] << {
+            id: fact_check.graphql_id,
+            title: fact_check.title,
+            body: fact_check.summary,
+            url: fact_check.url,
+            claim: fact_check.claim_description.description,
+            type: 'fact-check',
+            created_at: fact_check.created_at,
+            language: fact_check.language
+          }
+        end
+
+        # Relationships
+        object[:relationships] = []
+        Relationship.where('source_id = ? OR target_id = ?', item.id, item.id).find_each do |relationship|
+          object[:relationships] << {
+            parent_id: relationship.source_id,
+            child_id: relationship.target_id,
+            user_name: relationship.user&.name
+          }.merge(relationship.as_json)
+        end
+
+        data << object
+      end
+
+      # Complete the data with the explainers that are not associated to any item
+      Explainer.joins('LEFT JOIN explainer_items ei ON ei.explainer_id = explainers.id').where('ei.explainer_id IS NULL').where('explainers.team_id' => team_ids).find_each do |explainer|
+        data << standalone_article(explainer, 'explainer')
+      end
+
+      # Complete the data with the fact-checks that are not associated to any item
+      FactCheck.joins(:claim_description).where('claim_descriptions.team_id' => team_ids, 'claim_descriptions.project_media_id' => nil).find_each do |fact_check|
+        data << standalone_article(fact_check, 'fact-check')
+      end
+      FactCheck.joins(claim_description: { project_media: :media }).where('claim_descriptions.team_id' => team_ids, 'medias.type' => 'Blank').find_each do |fact_check|
+        data << standalone_article(fact_check, 'fact-check')
+      end
+
+      # Convert to JSON and upload to S3
+      region = CheckConfig.get('storage_bucket_region') || 'eu-west-1'
+      begin
+        # Missing credentials may only surface when the request is signed, so the upload is rescued as well
+        s3_client = Aws::S3::Client.new(region: region)
+        response = s3_client.put_object(
+          bucket: s3_bucket_name,
+          key: "text-similarity-data-export-#{Time.now.strftime('%Y-%m-%d')}.json",
+          body: JSON.pretty_generate(data)
+        )
+      rescue Aws::Sigv4::Errors::MissingCredentialsError, Aws::Errors::MissingCredentialsError
+        puts 'Please provide the AWS credentials.'
+        exit 1
+      end
+      if response.etag
+        puts 'Uploaded to S3 successfully.'
+      else
+        puts 'Error uploading to S3.'
+        exit 1
+      end
+    end
+  end
+end