From 53ec315f0d40844df422b45d4b479526b5d3e38d Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:16:29 -0300 Subject: [PATCH] Fixing Alegre path for tipline image search. Tipline image search, and actually other places of the code, were still hitting a deprecated path of the Alegre API, `:type/similarity/search`, while (AFAIU) it should be `/similarity/sync/:type`. This PR fixes it by creating a `get_items_with_similar_media_v2` method that hits the new endpoint for image and audio and the old endpoint for video and text. This is not ideal, and I also noticed another problem (Alegre crashes if the same media URL was already requested before, so a random hash is appended to the URL as a workaround), but I added a couple of `#FIXME`'s to the code to handle that later. Reference: CV2-4044. --- app/models/bot/alegre.rb | 2 +- app/models/concerns/alegre_similarity.rb | 2 +- app/models/concerns/alegre_v2.rb | 13 ++++++++++++- app/models/concerns/smooch_search.rb | 5 +++-- app/resources/api/v2/report_resource.rb | 2 +- lib/check_search.rb | 2 +- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/app/models/bot/alegre.rb b/app/models/bot/alegre.rb index cfeb5651d6..9ed845d759 100644 --- a/app/models/bot/alegre.rb +++ b/app/models/bot/alegre.rb @@ -33,7 +33,7 @@ def similar_items_ids_and_scores(team_ids, thresholds = {}) 'UploadedImage' => 'image', }[self.media.type].to_s threshold = [{value: thresholds.dig(media_type.to_sym, :value)}] || Bot::Alegre.get_threshold_for_query(media_type, self, true) - ids_and_scores = Bot::Alegre.get_items_with_similar_media(Bot::Alegre.media_file_url(self), threshold, team_ids, "/#{media_type}/similarity/search/").to_h + ids_and_scores = Bot::Alegre.get_items_with_similar_media_v2(Bot::Alegre.media_file_url(self), threshold, team_ids, media_type).to_h elsif self.is_text? 
ids_and_scores = {} threads = [] diff --git a/app/models/concerns/alegre_similarity.rb b/app/models/concerns/alegre_similarity.rb index ec7ce94f9d..921cae3963 100644 --- a/app/models/concerns/alegre_similarity.rb +++ b/app/models/concerns/alegre_similarity.rb @@ -39,7 +39,7 @@ def get_items_with_similarity(type, pm, threshold) if type == 'text' self.get_merged_items_with_similar_text(pm, threshold) else - results = self.get_items_with_similar_media(self.media_file_url(pm), threshold, pm.team_id, "/#{type}/similarity/search/").reject{ |id, _score_with_context| pm.id == id } + results = self.get_items_with_similar_media_v2(self.media_file_url(pm), threshold, pm.team_id, type).reject{ |id, _score_with_context| pm.id == id } self.merge_response_with_source_and_target_fields(results, type) end end diff --git a/app/models/concerns/alegre_v2.rb b/app/models/concerns/alegre_v2.rb index df470caf0b..99ac2d145c 100644 --- a/app/models/concerns/alegre_v2.rb +++ b/app/models/concerns/alegre_v2.rb @@ -9,7 +9,11 @@ def host end def sync_path(project_media) - "/similarity/sync/#{get_type(project_media)}" + self.sync_path_for_type(get_type(project_media)) + end + + def sync_path_for_type(type) + "/similarity/sync/#{type}" end def async_path(project_media) @@ -256,5 +260,12 @@ def get_similar_items_v2(project_media, field) def relate_project_media(project_media, field=nil) self.add_relationships(project_media, self.get_similar_items_v2(project_media, field)) unless project_media.is_blank? end + + def get_items_with_similar_media_v2(media_url, threshold, team_ids, type) + alegre_path = ['audio', 'image'].include?(type) ? 
self.sync_path_for_type(type) : "/#{type}/similarity/search/" + # FIXME: Stop using this method from v1 once all media types are supported by v2 + # FIXME: Alegre crashes if `media_url` was already requested before, this is why I append a hash + self.get_items_with_similar_media("#{media_url}?hash=#{SecureRandom.hex}", threshold, team_ids, alegre_path) + end end end diff --git a/app/models/concerns/smooch_search.rb b/app/models/concerns/smooch_search.rb index 8b1b4c5d65..63d1c3a766 100644 --- a/app/models/concerns/smooch_search.rb +++ b/app/models/concerns/smooch_search.rb @@ -112,7 +112,7 @@ def get_search_results(uid, last_message, team_id, language) type = message['type'] after = self.date_filter(team_id) query = message['text'] - query = message['mediaUrl'] unless type == 'text' + query = CheckS3.rewrite_url(message['mediaUrl']) unless type == 'text' results = self.search_for_similar_published_fact_checks(type, query, [team_id], after, nil, language).select{ |pm| is_a_valid_search_result(pm) } rescue StandardError => e self.handle_search_error(uid, e, language) @@ -161,10 +161,11 @@ def search_for_similar_published_fact_checks_no_cache(type, query, team_ids, aft end else media_url = Twitter::TwitterText::Extractor.extract_urls(query)[0] + Rails.logger.info "[Smooch Bot] Got media_url #{media_url} from query #{query}" return [] if media_url.blank? 
media_url = self.save_locally_and_return_url(media_url, type, feed_id) threshold = Bot::Alegre.get_threshold_for_query(type, pm)[0][:value] - alegre_results = Bot::Alegre.get_items_with_similar_media(media_url, [{ value: threshold }], team_ids, "/#{type}/similarity/search/") + alegre_results = Bot::Alegre.get_items_with_similar_media_v2(media_url, [{ value: threshold }], team_ids, type) results = self.parse_search_results_from_alegre(alegre_results, after, feed_id, team_ids) Rails.logger.info "[Smooch Bot] Media similarity search got #{results.count} results while looking for '#{query}' after date #{after.inspect} for teams #{team_ids}" end diff --git a/app/resources/api/v2/report_resource.rb b/app/resources/api/v2/report_resource.rb index 472fb9ed1e..309de6253a 100644 --- a/app/resources/api/v2/report_resource.rb +++ b/app/resources/api/v2/report_resource.rb @@ -100,7 +100,7 @@ def self.apply_media_similarity_filter(organization_ids, threshold, media_path, unless media.blank? media[0].rewind CheckS3.write(media_path, media[0].content_type.gsub(/^video/, 'application'), media[0].read) - ids_and_scores = Bot::Alegre.get_items_with_similar_media(CheckS3.public_url(media_path), [{ value: threshold }], organization_ids, "/#{media_type}/similarity/search/") + ids_and_scores = Bot::Alegre.get_items_with_similar_media_v2(CheckS3.public_url(media_path), [{ value: threshold }], organization_ids, media_type) RequestStore.store[:scores] = ids_and_scores # Store the scores so we can return them ids = ids_and_scores.keys.uniq || [0] CheckS3.delete(media_path) diff --git a/lib/check_search.rb b/lib/check_search.rb index 0b4937745e..3698af3071 100644 --- a/lib/check_search.rb +++ b/lib/check_search.rb @@ -262,7 +262,7 @@ def alegre_file_similar_items file_path = "check_search/#{hash}" end threshold = Bot::Alegre.get_threshold_for_query(@options['file_type'], ProjectMedia.new(team_id: Team.current.id))[0][:value] - results = 
Bot::Alegre.get_items_with_similar_media(CheckS3.public_url(file_path), [{ value: threshold }], @options['team_id'].first, "/#{@options['file_type']}/similarity/search/") + results = Bot::Alegre.get_items_with_similar_media_v2(CheckS3.public_url(file_path), [{ value: threshold }], @options['team_id'].first, @options['file_type']) results.blank? ? [0] : results.keys end