From 1f27910ee7fc1f1db2644bd49ae6af3505c2869f Mon Sep 17 00:00:00 2001
From: Caio Almeida <117518+caiosba@users.noreply.github.com>
Date: Thu, 13 Feb 2025 17:22:39 -0300
Subject: [PATCH] Rake task to export text similarity data (#2207)

Adding a new rake task that exports text similarity data from a set of workspaces and generates a JSON file output uploaded to an S3 bucket.

References: CV2-6069 and CV2-6030.
---
 app/graph/types/project_media_type.rb        |   8 -
 app/models/concerns/project_media_getters.rb |   8 +
 lib/tasks/data/text_similarity_data.rake     | 173 +++++++++++++++++++
 3 files changed, 181 insertions(+), 8 deletions(-)
 create mode 100644 lib/tasks/data/text_similarity_data.rake

diff --git a/app/graph/types/project_media_type.rb b/app/graph/types/project_media_type.rb
index 05a09ed7b..99a5edd9f 100644
--- a/app/graph/types/project_media_type.rb
+++ b/app/graph/types/project_media_type.rb
@@ -240,16 +240,8 @@ def published
 
   field :language, GraphQL::Types::String, null: true
 
-  def language
-    object.get_dynamic_annotation("language")&.get_field "language"&.send(:to_s)
-  end
-
   field :language_code, GraphQL::Types::String, null: true
 
-  def language_code
-    object.get_dynamic_annotation("language")&.get_field_value("language")
-  end
-
   field :annotation, AnnotationType, null: true do
     argument :annotation_type, GraphQL::Types::String, required: true, camelize: false
   end
diff --git a/app/models/concerns/project_media_getters.rb b/app/models/concerns/project_media_getters.rb
index d775a8ea2..7d54a892f 100644
--- a/app/models/concerns/project_media_getters.rb
+++ b/app/models/concerns/project_media_getters.rb
@@ -219,4 +219,12 @@ def explainers_titles
     titles = Explainer.joins(:explainer_items).where('explainer_items.project_media_id = ?', self.id).map(&:title).join("\n")
     titles.blank? ? nil : titles
   end
+
+  def language
+    self.get_dynamic_annotation('language')&.get_field('language')&.send(:to_s)
+  end
+
+  def language_code
+    self.get_dynamic_annotation('language')&.get_field_value('language')
+  end
 end
diff --git a/lib/tasks/data/text_similarity_data.rake b/lib/tasks/data/text_similarity_data.rake
new file mode 100644
index 000000000..c23022baf
--- /dev/null
+++ b/lib/tasks/data/text_similarity_data.rake
@@ -0,0 +1,173 @@
+# bundle exec rake check:data:text_similarity_data['s3_bucket_name=s3_bucket_name&workspace_slugs[]=workspace-slug-1&workspace_slugs[]=workspace-slug-2&...&workspace_slugs[]=workspace-slug-n']
+
+namespace :check do
+  namespace :data do
+    desc 'Export text similarity data.'
+    task :text_similarity_data, [:query_string] => [:environment] do |_task, args|
+      ActiveRecord::Base.logger = nil
+
+      def channel_name(item)
+        CheckChannels::ChannelCodes.all_channels.find{ |_k, v| v == item.channel.to_h['main'].to_i }.to_a[0] ||
+        CheckChannels::ChannelCodes.all_channels['TIPLINE'].find{ |_k, v| v == item.channel.to_h['main'].to_i }.to_a[0]
+      end
+
+      def origin_name(item)
+        CheckMediaClusterOrigins::OriginCodes.all_origins.find{ |_k, v| v == item.media_cluster_origin.to_i }.to_a[0]
+      end
+
+      def standalone_article(article, type)
+        body_method_mapping = {
+          'explainer' => :description,
+          'fact-check' => :summary
+        }
+        claim = nil
+        claim = article.claim_description.description if type == 'fact-check'
+        {
+          id: nil,
+          team_id: nil,
+          team_slug: nil,
+          media_id: nil,
+          title: nil,
+          description: nil,
+          channel: nil,
+          origin: nil,
+          relationships: [],
+          articles: [{
+            id: article.graphql_id,
+            title: article.title,
+            body: article.send(body_method_mapping[type]),
+            url: article.url,
+            type: type,
+            claim: claim,
+            created_at: article.created_at,
+            language: article.language
+          }]
+        }
+      end
+
+      # Parse input parameters
+      options = Rack::Utils.parse_nested_query(args[:query_string])
+      s3_bucket_name = options['s3_bucket_name']
+      workspace_slugs = options['workspace_slugs']
+      team_ids = Team.where(slug: workspace_slugs).map(&:id)
+
+      # Structure for the data: Array of next project media objects like:
+      # {
+      #    id: 1,
+      #    title: 'Example',
+      #    ...,
+      #    articles: [...],
+      #    relationships: [...],
+      # }
+      data = [] 
+
+      # Single-query approach (WIP / draft)
+      # query = ProjectMedia
+      #         .joins(:team).where('teams.slug' => workspace_slugs) # Only items from the selected workspaces
+      #         .joins('LEFT JOIN explainer_items ei ON ei.project_media_id = project_medias.id LEFT JOIN explainers e ON e.id = ei.explainer_id') # Includes explainers
+      #         .joins('LEFT JOIN claim_descriptions cd ON cd.project_media_id = project_medias.id LEFT JOIN fact_checks fc ON fc.claim_description_id = cd.id') # Includes fact-checks
+      #         .joins('LEFT JOIN relationships r ON r.source_id = project_medias.id OR r.target_id = project_medias.id') # Includes relationships
+      # query.to_sql
+
+      # ActiveRecord approach
+
+      # Get text items associated with explainers and fact-checks
+      query = ProjectMedia.joins(:media).where('project_medias.team_id' => team_ids, 'medias.type' => 'Claim') # Only text items from the selected workspaces
+      total = query.count
+      i = 0
+      query.find_each do |item|
+        i += 1
+        puts "[#{Time.now}] Exporting item #{i}/#{total}..."
+
+        # Item data
+        object = {
+          id: item.id,
+          team_id: item.team_id,
+          team_slug: item.team.slug,
+          media_id: item.media_id,
+          title: item.title,
+          description: item.description,
+          channel: channel_name(item),
+          origin: origin_name(item),
+          created_at: item.created_at,
+          language: item.language_code
+        }
+
+        # Explainers, if any
+        object[:articles] = []
+        item.explainers.find_each do |explainer|
+          object[:articles] << {
+            id: explainer.graphql_id,
+            title: explainer.title,
+            body: explainer.description,
+            url: explainer.url,
+            claim: nil,
+            type: 'explainer',
+            created_at: explainer.created_at,
+            language: explainer.language
+          }
+        end
+
+        # Fact-check, if any
+        unless item.fact_check.nil?
+          fact_check = item.fact_check
+          object[:articles] << {
+            id: fact_check.graphql_id,
+            title: fact_check.title,
+            body: fact_check.summary,
+            url: fact_check.url,
+            claim: fact_check.claim_description.description,
+            type: 'fact-check',
+            created_at: fact_check.created_at,
+            language: fact_check.language
+          }
+        end
+
+        # Relationships
+        object[:relationships] = []
+        Relationship.where('source_id = ? OR target_id = ?', item.id, item.id).find_each do |relationship|
+          object[:relationships] << {
+            parent_id: relationship.source_id,
+            child_id: relationship.target_id,
+            user_name: relationship.user&.name
+          }.merge(relationship.as_json)
+        end
+
+        data << object
+      end
+
+      # Complete the data with the explainers that are not associated to any item
+      Explainer.joins('LEFT JOIN explainer_items ei ON ei.explainer_id = explainers.id').where('ei.explainer_id IS NULL').where('explainers.team_id' => team_ids).find_each do |explainer|
+        data << standalone_article(explainer, 'explainer')
+      end
+
+      # Complete the data with the fact-checks that are not associated to any item
+      FactCheck.joins(:claim_description).where('claim_descriptions.team_id' => team_ids, 'claim_descriptions.project_media_id' => nil).find_each do |fact_check|
+        data << standalone_article(fact_check, 'fact-check')
+      end
+      FactCheck.joins(claim_description: { project_media: :media }).where('claim_descriptions.team_id' => team_ids, 'medias.type' => 'Blank').find_each do |fact_check|
+        data << standalone_article(fact_check, 'fact-check')
+      end
+
+      # Convert to JSON and upload to S3
+      region = CheckConfig.get('storage_bucket_region') || 'eu-west-1'
+      begin
+        s3_client = Aws::S3::Client.new(region: region)
+      rescue Aws::Sigv4::Errors::MissingCredentialsError
+        puts 'Please provide the AWS credentials.'
+        exit 1
+      end
+      response = s3_client.put_object(
+        bucket: s3_bucket_name,
+        key: "text-similarity-data-export-#{Time.now.strftime('%Y-%m-%d')}.json",
+        body: JSON.pretty_generate(data)
+      )
+      if response.etag
+        puts 'Uploaded to S3 successfully.'
+      else
+        puts 'Error uploading to S3.'
+        exit 1
+      end
+    end
+  end
+end