meedan · DGaffney · Oct 24, 2024 · Aug 27, 2024 · Sep 4, 2024 · Sep 11, 2024
@@ -33,8 +33,6 @@ def enabled?
   end
 
   def update_keywords(language, keywords, keyword, operation, doc_id, context)
-    alegre_operation = nil
-    alegre_params = nil
     common_alegre_params = {
       doc_id: doc_id,
       context: {
@@ -44,15 +42,11 @@ def update_keywords(language, keywords, keyword, operation, doc_id, context)
     }
     if operation == 'add' && !keywords.include?(keyword)
       keywords << keyword
-      alegre_operation = 'post'
-      alegre_params = common_alegre_params.merge({ text: keyword, models: ALEGRE_MODELS_AND_THRESHOLDS.keys })
+      Bot::Alegre.index_sync_with_params(common_alegre_params.merge({ text: keyword, models: ALEGRE_MODELS_AND_THRESHOLDS.keys }), "text")
     elsif operation == 'remove'
       keywords -= [keyword]
-      alegre_operation = 'delete'
-      alegre_params = common_alegre_params.merge({ quiet: true })
+      Bot::Alegre.request_delete_from_raw(common_alegre_params.merge({ quiet: true }), "text")
     end
-    # FIXME: Add error handling and better logging
-    Bot::Alegre.request(alegre_operation, '/text/similarity/', alegre_params) if alegre_operation && alegre_params
     keywords
   end
 
@@ -91,19 +85,19 @@ def self.alegre_matches_from_message(message, language, context, alegre_result_k
           language: language,
         }.merge(context)
       }
-      response = Bot::Alegre.request('post', '/text/similarity/search/', params)
+      response = Bot::Alegre.query_sync_with_params(params, "text")
 
       # One approach would be to take the option that has the most matches
       # Unfortunately this approach is influenced by the number of keywords per option
       # So, we are not using this approach right now
       # Get the `alegre_result_key` of all results returned
-      # option_counts = response['result'].to_a.map{|o| o.dig('_source', 'context', alegre_result_key)}
+      # option_counts = response['result'].to_a.map{|o| o.dig('context', alegre_result_key)}
       # Count how many of each alegre_result_key we have and sort (high to low)
       # ranked_options = option_counts.group_by(&:itself).transform_values(&:count).sort_by{|_k,v| v}.reverse()
 
       # Second approach is to sort the results from best to worst
-      sorted_options = response['result'].to_a.sort_by{ |result| result['_score'] }.reverse
-      ranked_options = sorted_options.map{ |o| { 'key' => o.dig('_source', 'context', alegre_result_key), 'score' => o['_score'] } }
+      sorted_options = response['result'].to_a.sort_by{ |result| result['score'] }.reverse
+      ranked_options = sorted_options.map{ |o| { 'key' => o.dig('context', alegre_result_key), 'score' => o['score'] } }
       matches = ranked_options
 
       # In all cases log for analysis

@@ -41,7 +41,7 @@ def similar_items_ids_and_scores(team_ids, thresholds = {})
         ALL_TEXT_SIMILARITY_FIELDS.each do |field|
           text = self.send(field)
           next if text.blank?
-          threads << Thread.new { ids_and_scores.merge!(Bot::Alegre.get_similar_texts(team_ids, text, Bot::Alegre::ALL_TEXT_SIMILARITY_FIELDS, thresholds[:text]).to_h) }
+          threads << Thread.new { ids_and_scores.merge!(Bot::Alegre.get_items_from_similar_text(team_ids, text, Bot::Alegre::ALL_TEXT_SIMILARITY_FIELDS, thresholds[:text]).to_h) }
         end
         threads.map(&:join)
       end
@@ -155,10 +155,8 @@ def self.run(body)
         if ['audio', 'image', 'video'].include?(self.get_pm_type(pm))
           self.relate_project_media_async(pm)
         else
-          Bot::Alegre.send_to_media_similarity_index(pm)
-          Bot::Alegre.send_field_to_similarity_index(pm, 'original_title')
-          Bot::Alegre.send_field_to_similarity_index(pm, 'original_description')
-          Bot::Alegre.relate_project_media_to_similar_items(pm)
+          self.relate_project_media_async(pm, 'original_title')
+          self.relate_project_media_async(pm, 'original_description')
         end
         self.get_extracted_text(pm)
         self.get_flags(pm)
@@ -206,7 +204,7 @@ def self.get_items_from_similar_text(team_id, text, fields = nil, threshold = ni
     threshold ||= self.get_threshold_for_query('text', nil, true)
     models ||= [self.matching_model_to_use(team_ids)].flatten
     Hash[self.get_similar_items_from_api(
-      '/text/similarity/search/',
+      'text',
       self.similar_texts_from_api_conditions(text, models, fuzzy, team_ids, fields, threshold),
       threshold
     ).collect{|k,v| [k, v.merge(model: v[:model]||Bot::Alegre.default_matching_model)]}]
@@ -716,8 +714,4 @@ def self.is_text_too_short?(pm, length_threshold)
     is_short
   end
 
-  class <<self
-    alias_method :get_similar_texts, :get_items_from_similar_text
-  end
-
 end
@@ -125,18 +125,18 @@ def send_to_text_similarity_index_package(pm, field, text, doc_id)
         doc_id: doc_id,
         text: text,
         models: models,
-        context: self.get_context(pm, field)
+        context: self.get_context(pm, field),
+        requires_callback: true
       }
       params[:language] = language if !language.nil?
       params
     end
 
     def send_to_text_similarity_index(pm, field, text, doc_id)
       if !text.blank? && Bot::Alegre::BAD_TITLE_REGEX !~ text
-        self.request(
-          'post',
-          '/text/similarity/',
-          self.send_to_text_similarity_index_package(pm, field, text, doc_id)
+        self.index_sync_with_params( # indexing call: use the index_* entry point (it delegates to query_sync_with_params)
+          self.send_to_text_similarity_index_package(pm, field, text, doc_id), # doc_id, text, models, context (+ language when known)
+          "text"
         )
       end
     end
@@ -207,10 +207,10 @@ def get_merged_similar_items(pm, threshold, fields, value, team_ids = [pm&.team_
       es_matches
     end
 
-    def get_similar_items_from_api(path, conditions, _threshold = {})
-      Rails.logger.error("[Alegre Bot] Sending request to alegre : #{path} , #{conditions.to_json}")
+    def get_similar_items_from_api(type, conditions, _threshold = {})
+      Rails.logger.error("[Alegre Bot] Sending request to alegre : #{type} , #{conditions.to_json}")
       response = {}
-      result = self.request('post', path, conditions)&.dig('result')
+      result = self.query_sync_with_params(conditions, type)&.dig('result')
       project_medias = result.collect{ |r| self.extract_project_medias_from_context(r) } if !result.nil? && result.is_a?(Array)
       project_medias.each do |request_response|
         request_response.each do |pmid, score_with_context|

@@ -1,7 +1,7 @@
 require 'active_support/concern'
 class AlegreTimeoutError < StandardError; end
 class TemporaryProjectMedia
-  attr_accessor :team_id, :id, :url, :type
+  attr_accessor :team_id, :id, :url, :text, :type, :field
   def media
     media_type_map = {
       "claim" => "Claim",
@@ -36,6 +36,10 @@ def is_video?
   def is_audio?
     self.type == "audio"
   end
+
+  def is_uploaded_media? # true when the item is an image, audio or video
+    self.is_image? || self.is_audio? || self.is_video?
+  end
 end
 
 module AlegreV2
@@ -55,11 +59,18 @@ def sync_path_for_type(type)
     end
 
     def async_path(project_media)
-      "/similarity/async/#{get_type(project_media)}"
+      self.async_path_for_type(get_type(project_media)) # resolve the item's type, then build the path from it
+    end
+
+    def async_path_for_type(type) # async similarity path for callers that only have a type string
+      "/similarity/async/#{type}"
     end
 
     def delete_path(project_media)
-      type = get_type(project_media)
+      self.delete_path_for_type(get_type(project_media)) # resolve the item's type, then build the path from it
+    end
+
+    def delete_path_for_type(type) # delete path for callers that only have a type string
       "/#{type}/similarity/"
     end
 
@@ -122,6 +133,10 @@ def request(method, path, params, retries=3)
       end
     end
 
+    def request_delete_from_raw(params, type) # DELETE with caller-built params; takes a type string instead of a project media
+      request("delete", delete_path_for_type(type), params)
+    end
+
     def request_delete(data, project_media)
       request("delete", delete_path(project_media), data)
     end
@@ -148,28 +163,32 @@ def get_type(project_media)
       type
     end
 
+    def content_hash_for_value(value) # MD5 hex digest of value; a nil value is passed through as nil
+      value.nil? ? nil : Digest::MD5.hexdigest(value)
+    end
+
     def content_hash(project_media, field)
       if Bot::Alegre::ALL_TEXT_SIMILARITY_FIELDS.include?(field)
-        Digest::MD5.hexdigest(project_media.send(field))
+        content_hash_for_value(project_media.send(field)) # known text field: hash of its value (nil when the value is nil)
+      elsif project_media.is_link?
+        return content_hash_for_value(project_media.media.url) # link: hash of the media URL
+      elsif project_media.is_a?(TemporaryProjectMedia)
+        return Rails.cache.read("url_sha:#{project_media.url}") # temporary item: hash is read from the cache, keyed by its URL (may be nil)
+      elsif project_media.is_uploaded_media?
+        return project_media.media.file.filename.split(".").first # uploaded file: the filename up to its first "."
       else
-        if project_media.is_link?
-          return Digest::MD5.hexdigest(project_media.media.url)
-        elsif project_media.is_a?(TemporaryProjectMedia)
-          return Rails.cache.read("url_sha:#{project_media.url}")
-        elsif !project_media.is_text?
-          return project_media.media.file.filename.split(".").first
-        else
-          return Digest::MD5.hexdigest(project_media.send(field).to_s)
-        end
+        return content_hash_for_value(project_media.send(field).to_s) # fallback: hash of the stringified field value
       end
     end
 
     def generic_package(project_media, field)
-      {
-        content_hash: content_hash(project_media, field),
+      content_hash_value = content_hash(project_media, field) # may be nil (see the guard below)
+      params = {
         doc_id: item_doc_id(project_media, field),
         context: get_context(project_media, field)
       }
+      params[:content_hash] = content_hash_value if !content_hash_value.nil? # leave the key out entirely when there is no hash
+      params
     end
 
     def delete_package(project_media, field, params={}, quiet=false)
@@ -267,6 +286,22 @@ def store_package_text(project_media, field, params)
       generic_package_text(project_media, field, params)
     end
 
+    def index_async_with_params(params, type, suppress_search_response=true) # POST caller-built params to the async path for type
+      request("post", async_path_for_type(type), params.merge(suppress_search_response: suppress_search_response)) # the flag is merged into the payload, overriding any same-named key in params
+    end
+
+    def index_sync_with_params(params, type) # indexing-named entry point; currently the same request as query_sync_with_params
+      query_sync_with_params(params, type)
+    end
+
+    def query_sync_with_params(params, type) # POST caller-built params to the sync path for type; returns whatever request returns
+      request("post", sync_path_for_type(type), params)
+    end
+
+    def query_async_with_params(params, type) # POST caller-built params to the async path for type, without adding suppress_search_response
+      request("post", async_path_for_type(type), params)
+    end
+
     def get_sync(project_media, field=nil, params={})
       request_sync(
         store_package(project_media, field, params),
@@ -286,6 +321,10 @@ def delete(project_media, field=nil, params={})
         delete_package(project_media, field, params),
         project_media
       )
+    rescue StandardError => e
+      error = Bot::Alegre::Error.new(e)
+      Rails.logger.error("[Alegre Bot] Exception on Delete for ProjectMedia ##{project_media.id}: #{error.class} - #{error.message}")
+      CheckSentry.notify(error, bot: "alegre", project_media: project_media, params: params, field: field)
     end
 
     def get_per_model_threshold(project_media, threshold)
@@ -298,7 +337,7 @@ def get_per_model_threshold(project_media, threshold)
     end
 
     def isolate_relevant_context(project_media, result)
-      result["context"].select{|x| ([x["team_id"]].flatten & [project_media.team_id].flatten).count > 0 && !x["temporary_media"]}.first
+      (result["contexts"]||result["context"]).select{|x| ([x["team_id"]].flatten & [project_media.team_id].flatten).count > 0 && !x["temporary_media"]}.first # reads "contexts", else "context"; first entry sharing a team_id with the item and not marked temporary_media
     end
 
     def get_target_field(project_media, field)
@@ -485,25 +524,27 @@ def wait_for_results(project_media, args)
     end
 
     def get_items_with_similar_media_v2(args={})
+      text = args[:text]
+      field = args[:field]
       media_url = args[:media_url]
       project_media = args[:project_media]
       threshold = args[:threshold]
       team_ids = args[:team_ids]
       type = args[:type]
-      if ['audio', 'image', 'video'].include?(type)
-        if project_media.nil?
-          project_media = TemporaryProjectMedia.new
-          project_media.url = media_url
-          project_media.id = Digest::MD5.hexdigest(project_media.url).to_i(16)
-          project_media.team_id = team_ids
-          project_media.type = type
-        end
-        get_similar_items_v2_async(project_media, nil, threshold)
-        wait_for_results(project_media, args)
-        response = get_similar_items_v2_callback(project_media, nil)
-        delete(project_media, nil) if project_media.is_a?(TemporaryProjectMedia)
-        return response
+      if project_media.nil? # no item supplied: build a throwaway stand-in from the raw args
+        project_media = TemporaryProjectMedia.new
+        project_media.text = text
+        project_media.field = field
+        project_media.url = media_url
+        project_media.id = Digest::MD5.hexdigest((media_url || text).to_s).to_i(16) # text-only queries carry no URL and hexdigest(nil) raises TypeError, so fall back to the text
+        project_media.team_id = team_ids
+        project_media.type = type
       end
+      get_similar_items_v2_async(project_media, nil, threshold)
+      wait_for_results(project_media, args)
+      response = get_similar_items_v2_callback(project_media, nil)
+      delete(project_media, nil) if project_media.is_a?(TemporaryProjectMedia) # only the stand-in is deleted afterwards
+      return response
     end
 
     def process_alegre_callback(params)
@@ -512,9 +553,11 @@ def process_alegre_callback(params)
       should_relate = true
       if project_media.nil?
         project_media = TemporaryProjectMedia.new
+        project_media.text = params.dig('data', 'item', 'raw', 'text')
         project_media.url = params.dig('data', 'item', 'raw', 'url')
         project_media.id = params.dig('data', 'item', 'raw', 'context', 'project_media_id')
         project_media.team_id = params.dig('data', 'item', 'raw', 'context', 'team_id')
+        project_media.field = params.dig('data', 'item', 'raw', 'context', 'field')
         project_media.type = params['model_type']
         should_relate = false
       end

@@ -11,10 +11,6 @@ def is_link?
     self.media.type == "Link"
   end
 
-  def is_uploaded_image?
-    self.media.type == "UploadedImage"
-  end
-
   def is_blank?
     self.media.type == "Blank"
   end
@@ -28,7 +24,11 @@ def is_audio?
   end
 
   def is_image?
-    self.is_uploaded_image?
+    self.media.type == "UploadedImage" # the type check is now done here directly instead of via is_uploaded_image?
+  end
+
+  def is_uploaded_media? # true when the item is an image, audio or video
+    self.is_image? || self.is_audio? || self.is_video?
   end
 
   def is_text?

@@ -73,24 +73,26 @@ def self.update_paragraphs_in_alegre(id, previous_paragraphs_count, timestamp)
 
     # Index title
     params = {
+      content_hash: Bot::Alegre.content_hash_for_value(explainer.title),
       doc_id: Digest::MD5.hexdigest(['explainer', explainer.id, 'title'].join(':')),
+      context: base_context.merge({ field: 'title' }),
       text: explainer.title,
       models: ALEGRE_MODELS_AND_THRESHOLDS.keys,
-      context: base_context.merge({ field: 'title' })
     }
-    Bot::Alegre.request('post', '/text/similarity/', params)
+    Bot::Alegre.index_async_with_params(params, "text")
 
     # Index paragraphs
     count = 0
     explainer.description.to_s.gsub(/\r\n?/, "\n").split(/\n+/).reject{ |paragraph| paragraph.strip.blank? }.each do |paragraph|
       count += 1
       params = {
+        content_hash: Bot::Alegre.content_hash_for_value(paragraph.strip),
         doc_id: Digest::MD5.hexdigest(['explainer', explainer.id, 'paragraph', count].join(':')),
+        context: base_context.merge({ paragraph: count }),
         text: paragraph.strip,
         models: ALEGRE_MODELS_AND_THRESHOLDS.keys,
-        context: base_context.merge({ paragraph: count })
       }
-      Bot::Alegre.request('post', '/text/similarity/', params)
+      Bot::Alegre.index_async_with_params(params, "text")
     end
 
     # Remove paragraphs that don't exist anymore (we delete after updating in order to avoid race conditions)
@@ -101,7 +103,7 @@ def self.update_paragraphs_in_alegre(id, previous_paragraphs_count, timestamp)
         quiet: true,
         context: base_context.merge({ paragraph: count })
       }
-      Bot::Alegre.request('delete', '/text/similarity/', params)
+      Bot::Alegre.request_delete_from_raw(params, "text")
     end
   end
 
@@ -116,9 +118,9 @@ def self.search_by_similarity(text, language, team_id)
         language: language
       }
     }
-    response = Bot::Alegre.request('post', '/text/similarity/search/', params)
-    results = response['result'].to_a.sort_by{ |result| result['_score'] }
-    explainer_ids = results.collect{ |result| result.dig('_source', 'context', 'explainer_id').to_i }.uniq.first(3)
+    response = Bot::Alegre.query_sync_with_params(params, "text")
+    results = response['result'].to_a.sort_by{ |result| result['score'] } # results are read flat here ('context' without '_source'), so the score key is 'score', not '_score'
+    explainer_ids = results.collect{ |result| result.dig('context', 'explainer_id').to_i }.uniq.first(3) # NOTE(review): ascending sort + first(3) keeps the three lowest scores — confirm intent
     explainer_ids.empty? ? Explainer.none : Explainer.where(team_id: team_id, id: explainer_ids)
   end