From be2e5eb8b52186819bfb68071b3710d43e8292a2 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 6 Dec 2023 09:15:21 +0200 Subject: [PATCH 01/17] chore(search): upgrade client to ES 8 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index df83dfb6ee..0bd6dda6f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,8 +22,8 @@ django==1.11.* djangorestframework @ https://github.com/encode/django-rest-framework/archive/3.11.1.tar.gz djangorestframework_simplejwt==3.3.0 PyJWT==1.7.1 # pinned b/c current version 2.0.0 breaks simplejwt. waiting for 2.0.1 -elasticsearch==7.9.1 -elasticsearch_dsl==7.3.0 +elasticsearch==8.8.2 +git+https://github.com/Sefaria/elasticsearch-dsl-py@v8.0.0#egg=elasticsearch-dsl geojson==2.5.0 geopy==2.3.0 gevent==20.12.0; sys_platform != 'darwin' From ebd8ce6e82cb18c6dcaf9feabb8c89e0d6962647 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 6 Dec 2023 09:16:13 +0200 Subject: [PATCH 02/17] chore(search): remove pip install from cronjob since it's now in requirements.txt --- .../templates/cronjob/reindex-elasticsearch.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml index 640fd4c4ab..58a83e2126 100644 --- a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml +++ b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml @@ -64,7 +64,7 @@ spec: command: ["bash"] args: [ "-c", - "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy elasticsearch==8.8.2 git+https://github.com/Sefaria/elasticsearch-dsl-py@v8.0.0#egg=elasticsearch-dsl && /app/run /app/scripts/reindex_elasticsearch_cronjob.py" + "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/reindex_elasticsearch_cronjob.py" ] restartPolicy: Never volumes: From dffd1c2864bfbdab1d60cc0e04216cd9f4ae9479 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 6 Dec 2023 09:17:13 +0200 Subject: [PATCH 03/17] chore(search): client should explicitly query ES 8 --- static/js/sefaria/search.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/sefaria/search.js b/static/js/sefaria/search.js index 15716e67d9..6506574bed 100644 --- a/static/js/sefaria/search.js +++ b/static/js/sefaria/search.js @@ -39,7 +39,7 @@ class Search { } wrapper.addQuery($.ajax({ - url: `${Sefaria.apiHost}/api/search-wrapper`, + url: `${Sefaria.apiHost}/api/search-wrapper/es8`, type: 'POST', data: jsonData, contentType: "application/json; charset=utf-8", From 4ae2c10c52b7072f28237fb58523bedd921b422d Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 6 Dec 2023 09:19:09 +0200 Subject: [PATCH 04/17] chore(search): remove compatibility code in JS --- static/js/SearchResultList.jsx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/static/js/SearchResultList.jsx b/static/js/SearchResultList.jsx index b09e91b170..51ed5bf099 100644 --- a/static/js/SearchResultList.jsx +++ b/static/js/SearchResultList.jsx @@ -100,11 +100,7 @@ class SearchTotal { function createSearchTotal(total) { - /** - * this function ensures backwards compatibility between the way elasticsearch formats the total pre-v8 and post-v8 - */ - const totalObj = typeof(total) === 'number' ? 
{value: total} : {value: total.value, relation: total.relation}; - return new SearchTotal(totalObj) + return new SearchTotal(total); } From c05388c451c4dfbef78c8b817d420879c2fa9cde Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 6 Dec 2023 09:21:06 +0200 Subject: [PATCH 05/17] fix(search): remove compatibility code --- reader/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reader/views.py b/reader/views.py index 8701e0dfad..83b51fb92c 100644 --- a/reader/views.py +++ b/reader/views.py @@ -4219,7 +4219,7 @@ def search_wrapper_api(request, es6_compat=False): search_obj = get_query_obj(search_obj=search_obj, **j) response = search_obj.execute() if response.success(): - response_json = getattr(response.to_dict(), 'body', response.to_dict()) + response_json = response.to_dict().body if es6_compat and isinstance(response_json['hits']['total'], dict): response_json['hits']['total'] = response_json['hits']['total']['value'] return jsonResponse(response_json, callback=request.GET.get("callback", None)) From 387d1e68013d50bd09a01b4fcd0cc92bca1b3b4e Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 6 Dec 2023 09:51:58 +0200 Subject: [PATCH 06/17] helm(search): switch to ES 8 SEARCH_HOST --- build/ci/production-values.yaml | 4 +- .../cronjob/reindex-elasticsearch-es6.yaml | 77 -- .../cronjob/reindex-elasticsearch.yaml | 2 +- scripts/reindex_elasticsearch_cronjob_ES6.py | 49 - sefaria/search_ES6.py | 844 ------------------ 5 files changed, 2 insertions(+), 974 deletions(-) delete mode 100644 helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml delete mode 100644 scripts/reindex_elasticsearch_cronjob_ES6.py delete mode 100644 sefaria/search_ES6.py diff --git a/build/ci/production-values.yaml b/build/ci/production-values.yaml index bbf9b243d0..dbeda4badf 100644 --- a/build/ci/production-values.yaml +++ b/build/ci/production-values.yaml @@ -141,7 +141,7 @@ nginx: containerImage: imageRegistry: tag: - SEARCH_HOST: elasticsearch-data + SEARCH_HOST: elasticsearch-es-http.elasticsearch.svc disableScraping: false replicaCount: 2 resources: @@ -179,8 +179,6 @@ cronJobs: enabled: true reindexElasticSearch: enabled: true - SEARCH_HOST_ES6: "elasticsearch-data" - SEARCH_HOST_ES8: "elasticsearch-es-http.elasticsearch.svc" topicsIndexing: enabled: true trello: diff --git a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml deleted file mode 100644 index 9345a644fb..0000000000 --- a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml +++ /dev/null @@ -1,77 +0,0 @@ -{{- if .Values.cronJobs.reindexElasticSearch.enabled }} ---- -apiVersion: batch/v1 -kind: CronJob -metadata: - name: {{ .Values.deployEnv }}-reindex-elastic-search-es6 - labels: - {{- include "sefaria.labels" . 
| nindent 4 }} -spec: - schedule: "20 13 * * 0" - jobTemplate: - spec: - backoffLimit: 1 - template: - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - mongo - topologyKey: kubernetes.io.hostname - containers: - - name: reindex-elastic-search-es6 - image: "{{ .Values.web.containerImage.imageRegistry }}:{{ .Values.web.containerImage.tag }}" - resources: - limits: - memory: 9Gi - requests: - memory: 7Gi - env: - - name: SEARCH_HOST - value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES6 }}" - - name: REDIS_HOST - value: "redis-{{ .Values.deployEnv }}" - - name: NODEJS_HOST - value: "node-{{ .Values.deployEnv }}-{{ .Release.Revision }}" - - name: VARNISH_HOST - value: "varnish-{{ .Values.deployEnv }}-{{ .Release.Revision }}" - - name: SLACK_URL - valueFrom: - secretKeyRef: - name: {{ template "sefaria.secrets.slackWebhook" . }} - key: slack-webhook - envFrom: - - secretRef: - name: {{ .Values.secrets.localSettings.ref }} - optional: true - - configMapRef: - name: local-settings-{{ .Values.deployEnv }} - - secretRef: - name: local-settings-secrets-{{ .Values.deployEnv }} - optional: true - volumeMounts: - - mountPath: /app/sefaria/local_settings.py - name: local-settings - subPath: local_settings.py - readOnly: true - command: ["bash"] - args: [ - "-c", - "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/reindex_elasticsearch_cronjob_ES6.py" - ] - restartPolicy: Never - volumes: - - name: local-settings - configMap: - name: local-settings-file-{{ .Values.deployEnv }} - items: - - key: local_settings.py - path: local_settings.py - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 2 -{{- end }} diff --git a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml index 58a83e2126..07d74ae80a 100644 --- a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml +++ b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml @@ -33,7 +33,7 @@ spec: memory: 7Gi env: - name: SEARCH_HOST - value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES8 }}" + value: "{{ .Values.nginx.SEARCH_HOST }}" - name: REDIS_HOST value: "redis-{{ .Values.deployEnv }}" - name: NODEJS_HOST diff --git a/scripts/reindex_elasticsearch_cronjob_ES6.py b/scripts/reindex_elasticsearch_cronjob_ES6.py deleted file mode 100644 index 1a3f181eb2..0000000000 --- a/scripts/reindex_elasticsearch_cronjob_ES6.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -This file is meant to be temporary while we are migrating to elasticsearch 8 -""" -from datetime import datetime -import requests -import traceback -import os -import django -django.setup() -from sefaria.model import * -from sefaria.search_ES6 import index_all -from sefaria.local_settings import SEFARIA_BOT_API_KEY -from sefaria.pagesheetrank import update_pagesheetrank - -""" -Source sheets added after last_sheet_timestamp will be missing from the index process. We want to manually index all -source sheets created after this. Depending on the database being used to index the timestamp will be different. If -running against a production database, last_sheet_timestamp will be the time this script began running. Otherwise, this -value will need to be set to the time at which the last mongo dump was created (assuming the database is using the most -up-to-date mongo dump). 
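
    For illustration, the choice described above reduces to a sketch like this
    (helper name hypothetical; both branches rely on the datetime and os
    imports at the top of this file):

        def get_last_sheet_timestamp(dump_path=None):
            if dump_path is not None:
                # restored dump: use the dump's mtime so no sheet falls in the gap
                return datetime.fromtimestamp(os.path.getmtime(dump_path)).isoformat()
            # production database: the moment the script starts is safe
            return datetime.now().isoformat()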
-""" -# last_sheet_timestamp = datetime.fromtimestamp(os.path.getmtime("/var/data/sefaria_public/dump/sefaria")).isoformat() -try: - last_sheet_timestamp = datetime.now().isoformat() - update_pagesheetrank() - index_all() - r = requests.post("https://www.sefaria.org/admin/index-sheets-by-timestamp", data={"timestamp": last_sheet_timestamp, "apikey": SEFARIA_BOT_API_KEY}) - if "error" in r.text: - raise Exception("Error when calling admin/index-sheets-by-timestamp API: " + r.text) - else: - print("SUCCESS!", r.text) -except Exception as e: - tb_str = traceback.format_exc() - print("Caught exception") - post_object = { - "icon_emoji": ":facepalm:", - "username": "Reindex ElasticSearch", - "channel": "#engineering-discuss", - "attachments": [ - { - "fallback": tb_str, - "color": "#a30200", - "pretext": "Cronjob Error", - "text": tb_str - } - ] - } - requests.post(os.environ['SLACK_URL'], json=post_object) - raise e diff --git a/sefaria/search_ES6.py b/sefaria/search_ES6.py deleted file mode 100644 index 812610eb07..0000000000 --- a/sefaria/search_ES6.py +++ /dev/null @@ -1,844 +0,0 @@ -# -*- coding: utf-8 -*- -""" -This file is meant to be temporary while we are migrating to elasticsearch 8 - -search.py - full-text search for Sefaria using ElasticSearch - -Writes to MongoDB Collection: index_queue -""" -import os -from datetime import datetime, timedelta -import re -import bleach -import pymongo - -# To allow these files to be run directly from command line (w/o Django shell) -os.environ['DJANGO_SETTINGS_MODULE'] = "settings" - -import structlog -import logging -from logging import NullHandler -from collections import defaultdict -import time as pytime -logger = structlog.get_logger(__name__) - -from elasticsearch import Elasticsearch -from elasticsearch.client import IndicesClient -from elasticsearch.helpers import bulk -from elasticsearch.exceptions import NotFoundError -from sefaria.model import * -from sefaria.model.text import AbstractIndex, AbstractTextRecord -from sefaria.model.user_profile import user_link, public_user_data -from sefaria.model.collection import CollectionSet -from sefaria.system.database import db -from sefaria.system.exceptions import InputError -from sefaria.utils.util import strip_tags -from .settings import SEARCH_URL, SEARCH_INDEX_NAME_TEXT, SEARCH_INDEX_NAME_SHEET, STATICFILES_DIRS -from sefaria.site.site_settings import SITE_SETTINGS -from sefaria.utils.hebrew import strip_cantillation -import sefaria.model.queue as qu - -es_client = Elasticsearch(SEARCH_URL) -index_client = IndicesClient(es_client) - -tracer = structlog.get_logger(__name__) -tracer.setLevel(logging.CRITICAL) -#tracer.addHandler(logging.FileHandler('/tmp/es_trace.log')) -tracer.addHandler(NullHandler()) - -doc_count = 0 - - -def delete_text(oref, version, lang): - try: - curr_index = get_new_and_current_index_names('text')['current'] - - id = make_text_doc_id(oref.normal(), version, lang) - es_client.delete(index=curr_index, doc_type='text', id=id) - except Exception as e: - logger.error("ERROR deleting {} / {} / {} : {}".format(oref.normal(), version, lang, e)) - - -def delete_version(index, version, lang): - assert isinstance(index, AbstractIndex) - - refs = [] - - if SITE_SETTINGS["TORAH_SPECIFIC"]: - all_gemara_indexes = library.get_indexes_in_category("Bavli") - davidson_indexes = all_gemara_indexes[:all_gemara_indexes.index("Horayot") + 1] - if Ref(index.title).is_bavli() and index.title not in davidson_indexes: - refs += index.all_section_refs() - - refs += index.all_segment_refs() - - 
for ref in refs: - delete_text(ref, version, lang) - - -def delete_sheet(index_name, id): - try: - es_client.delete(index=index_name, doc_type='sheet', id=id) - except Exception as e: - logger.error("ERROR deleting sheet {}".format(id)) - - -def make_text_doc_id(ref, version, lang): - """ - Returns a doc id string for indexing based on ref, versiona and lang. - - [HACK] Since Elasticsearch chokes on non-ascii ids, hebrew titles are converted - into a number using unicode_number. This mapping should be unique, but actually isn't. - (any tips welcome) - """ - if not version.isascii(): - version = str(unicode_number(version)) - - id = "%s (%s [%s])" % (ref, version, lang) - return id - - -def unicode_number(u): - """ - Returns a number corresponding to the sum value - of each unicode character in u - """ - n = 0 - for i in range(len(u)): - n += ord(u[i]) - return n - - -def index_sheet(index_name, id): - """ - Index source sheet with 'id'. - """ - - sheet = db.sheets.find_one({"id": id}) - if not sheet: return False - - pud = public_user_data(sheet["owner"]) - tag_terms_simple = make_sheet_tags(sheet) - tags = [t["en"] for t in tag_terms_simple] - topics = [] - for t in sheet.get('topics', []): - topic_obj = Topic.init(t['slug']) - if not topic_obj: - continue - topics += [topic_obj] - collections = CollectionSet({"sheets": id, "listed": True}) - collection_names = [c.name for c in collections] - try: - doc = { - "title": strip_tags(sheet["title"]), - "content": make_sheet_text(sheet, pud), - "owner_id": sheet["owner"], - "owner_name": pud["name"], - "owner_image": pud["imageUrl"], - "profile_url": pud["profileUrl"], - "version": "Source Sheet by " + user_link(sheet["owner"]), - "tags": tags, - "topic_slugs": [topic_obj.slug for topic_obj in topics], - "topics_en": [topic_obj.get_primary_title('en') for topic_obj in topics], - "topics_he": [topic_obj.get_primary_title('he') for topic_obj in topics], - "sheetId": id, - "summary": sheet.get("summary", None), - "collections": collection_names, - "datePublished": sheet.get("datePublished", None), - "dateCreated": sheet.get("dateCreated", None), - "dateModified": sheet.get("dateModified", None), - "views": sheet.get("views", 0) - } - es_client.create(index=index_name, doc_type='sheet', id=id, body=doc) - global doc_count - doc_count += 1 - return True - except Exception as e: - print("Error indexing sheet %d" % id) - print(e) - return False - - -def make_sheet_tags(sheet): - def get_primary_title(lang, titles): - return [t for t in titles if t.get("primary") and t.get("lang", "") == lang][0]["text"] - - tags = sheet.get('tags', []) - tag_terms = [(Term().load({'name': t}) or Term().load_by_title(t)) for t in tags] - tag_terms_simple = [ - { - 'en': tags[iterm], # save as en even if it's Hebrew - 'he': '' - } if term is None else - { - 'en': get_primary_title('en', term.titles), - 'he': get_primary_title('he', term.titles) - } for iterm, term in enumerate(tag_terms) - ] - #tags_en, tags_he = zip(*tag_terms_simple.values()) - return tag_terms_simple - -def make_sheet_text(sheet, pud): - """ - Returns a plain text representation of the content of sheet. 
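
    For example, for a hypothetical sheet titled "Shabbat" with summary
    "Candle lighting", owner name "Jane Doe", tags ["Shabbat", "Halakhah"] and
    one source whose text reduces to "In the beginning", the result is roughly
    (a sketch of the shape produced by the body below, after bleach strips any
    remaining HTML):

        'Shabbat\nCandle lighting\nBy: Jane Doe\n [Shabbat, Halakhah]\nIn the beginning '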
- :param sheet: The sheet record - :param pud: Public User Database record for the author - """ - text = sheet["title"] + "\n{}".format(sheet.get("summary", '')) - if pud.get("name"): - text += "\nBy: " + pud["name"] - text += "\n" - if sheet.get("tags"): - text += " [" + ", ".join(sheet["tags"]) + "]\n" - for s in sheet["sources"]: - text += source_text(s) + " " - - text = bleach.clean(text, strip=True, tags=()) - - return text - - -def source_text(source): - """ - Recursive function to translate a source dictionary into text. - """ - str_fields = ["customTitle", "ref", "comment", "outsideText"] - dict_fields = ["text", "outsideBiText"] - content = [source.get(field, "") for field in str_fields] - content += [val for field in dict_fields for val in source.get(field, {}).values()] - text = " ".join([strip_tags(c) for c in content]) - - if "subsources" in source: - for s in source["subsources"]: - text += source_text(s) - - return text - - -def get_exact_english_analyzer(): - return { - "tokenizer": "standard", - "char_filter": [ - "icu_normalizer", - ], - "filter": [ - "standard", - "lowercase", - "icu_folding", - ], - } - - -def get_stemmed_english_analyzer(): - stemmed_english_analyzer = get_exact_english_analyzer() - stemmed_english_analyzer['filter'] += ["my_snow"] - return stemmed_english_analyzer - - -def create_index(index_name, type): - """ - Clears the indexes and creates it fresh with the below settings. - """ - clear_index(index_name) - - settings = { - "index": { - "blocks": { - "read_only_allow_delete": False - }, - "analysis": { - "analyzer": { - "stemmed_english": get_stemmed_english_analyzer(), - "exact_english": get_exact_english_analyzer(), - }, - "filter": { - "my_snow": { - "type": "snowball", - "language": "English" - } - } - } - } - } - print('Creating index {}'.format(index_name)) - index_client.create(index=index_name, body=settings) - - if type == 'text': - put_text_mapping(index_name) - elif type == 'sheet': - put_sheet_mapping(index_name) - - -def put_text_mapping(index_name): - """ - Settings mapping for the text document type. - """ - text_mapping = { - 'properties' : { - 'categories': { - 'type': 'keyword', - }, - "category": { - 'type': 'keyword', - }, - "he_category": { - 'type': 'keyword', - }, - "index_title": { - 'type': 'keyword', - }, - "path": { - 'type': 'keyword', - }, - "he_index_title": { - 'type': 'keyword', - }, - "he_path": { - 'type': 'keyword', - }, - "order": { - 'type': 'keyword', - }, - "pagesheetrank": { - 'type': 'double', - 'index': False - }, - "comp_date": { - 'type': 'integer', - 'index': False - }, - "version_priority": { - 'type': 'integer', - 'index': False - }, - "exact": { - 'type': 'text', - 'analyzer': 'exact_english' - }, - "naive_lemmatizer": { - 'type': 'text', - 'analyzer': 'sefaria-naive-lemmatizer', - 'search_analyzer': 'sefaria-naive-lemmatizer-less-prefixes', - 'fields': { - 'exact': { - 'type': 'text', - 'analyzer': 'exact_english' - } - } - } - } - } - index_client.put_mapping(doc_type='text', body=text_mapping, index=index_name) - - -def put_sheet_mapping(index_name): - """ - Sets mapping for the sheets document type. 
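
    Note the doc_type argument in the put_mapping call below: it exists only
    in the ES6 client. Under the elasticsearch 8 client adopted elsewhere in
    this series, mapping types are gone and the equivalent call is roughly
    (a sketch, not this project's actual ES8 code):

        index_client.put_mapping(index=index_name, properties=sheet_mapping['properties'])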
- """ - sheet_mapping = { - 'properties': { - 'owner_name': { - 'type': 'keyword' - }, - 'tags': { - 'type': 'keyword' - }, - "topics_en": { - "type": "keyword" - }, - "topics_he": { - "type": "keyword" - }, - "topic_slugs": { - "type": "keyword" - }, - 'owner_image': { - 'type': 'keyword' - }, - 'datePublished': { - 'type': 'date' - }, - 'dateCreated': { - 'type': 'date' - }, - 'dateModified': { - 'type': 'date' - }, - 'sheetId': { - 'type': 'integer' - }, - 'collections': { - 'type': 'keyword' - }, - 'title': { - 'type': 'keyword' - }, - 'views': { - 'type': 'integer' - }, - 'summary': { - 'type': 'keyword' - }, - 'content': { - 'type': 'text', - 'analyzer': 'stemmed_english' - }, - 'version': { - 'type': 'keyword' - }, - 'profile_url': { - 'type': 'keyword' - }, - 'owner_id': { - 'type': 'integer' - } - } - } - index_client.put_mapping(doc_type='sheet', body=sheet_mapping, index=index_name) - -def get_search_categories(oref, categories): - toc_tree = library.get_toc_tree() - cats = oref.index.categories - - indexed_categories = categories # the default - - # get the full path of every cat along the way. - # starting w/ the longest, - # check if they're root swapped. - paths = [cats[:i] for i in range(len(cats), 0, -1)] - for path in paths: - cnode = toc_tree.lookup(path) - if getattr(cnode, "searchRoot", None) is not None: - # Use the specified searchRoot, with the rest of the category path appended. - indexed_categories = [cnode.searchRoot] + cats[len(path) - 1:] - break - return indexed_categories - - -class TextIndexer(object): - - @classmethod - def clear_cache(cls): - cls.terms_dict = None - cls.version_priority_map = None - cls._bulk_actions = None - cls.best_time_period = None - - - @classmethod - def create_terms_dict(cls): - cls.terms_dict = {} - ts = TermSet() - for t in ts: - cls.terms_dict[t.name] = t.contents() - - @classmethod - def create_version_priority_map(cls): - toc = library.get_toc() - cls.version_priority_map = {} - - def traverse(mini_toc): - if type(mini_toc) == list: - for t in mini_toc: - traverse(t) - elif "contents" in mini_toc: - for t in mini_toc["contents"]: - traverse(t) - elif "title" in mini_toc and not mini_toc.get("isCollection", False): - title = mini_toc["title"] - try: - r = Ref(title) - except InputError: - print("Failed to parse ref, {}".format(title)) - return - vlist = cls.get_ref_version_list(r) - vpriorities = defaultdict(lambda: 0) - for i, v in enumerate(vlist): - lang = v.language - cls.version_priority_map[(title, v.versionTitle, lang)] = (vpriorities[lang], mini_toc["categories"]) - vpriorities[lang] += 1 - - traverse(toc) - - @staticmethod - def get_ref_version_list(oref, tries=0): - try: - return oref.index.versionSet().array() - except InputError as e: - print(f"InputError: {oref.normal()}") - return [] - except pymongo.errors.AutoReconnect as e: - if tries < 200: - pytime.sleep(5) - return TextIndexer.get_ref_version_list(oref, tries+1) - else: - print("get_ref_version_list -- Tried: {} times. 
Failed :(".format(tries)) - raise e - - @classmethod - def get_all_versions(cls, tries=0, versions=None, page=0): - versions = versions or [] - try: - version_limit = 10 - temp_versions = [] - first_run = True - while first_run or len(temp_versions) > 0: - temp_versions = VersionSet(limit=version_limit, page=page).array() - versions += temp_versions - page += 1 - first_run = False - return versions - except pymongo.errors.AutoReconnect as e: - if tries < 200: - pytime.sleep(5) - return cls.get_all_versions(tries+1, versions, page) - else: - print("Tried: {} times. Got {} versions".format(tries, len(versions))) - raise e - - @classmethod - def index_all(cls, index_name, debug=False, for_es=True, action=None): - cls.index_name = index_name - cls.create_version_priority_map() - cls.create_terms_dict() - Ref.clear_cache() # try to clear Ref cache to save RAM - - versions = sorted([x for x in cls.get_all_versions() if (x.title, x.versionTitle, x.language) in cls.version_priority_map], key=lambda x: cls.version_priority_map[(x.title, x.versionTitle, x.language)][0]) - versions_by_index = {} - # organizing by index for the merged case. There is no longer a merged case but keeping this logic b/c it seems fine - for v in versions: - key = (v.title, v.language) - if key in versions_by_index: - versions_by_index[key] += [v] - else: - versions_by_index[key] = [v] - print("Beginning index of {} versions.".format(len(versions))) - vcount = 0 - total_versions = len(versions) - versions = None # release RAM - for title, vlist in list(versions_by_index.items()): - cls.curr_index = vlist[0].get_index() if len(vlist) > 0 else None - if for_es: - cls._bulk_actions = [] - try: - cls.best_time_period = cls.curr_index.best_time_period() - except ValueError: - cls.best_time_period = None - for v in vlist: - if v.versionTitle == "Yehoyesh's Yiddish Tanakh Translation [yi]": - print("skipping yiddish. we don't like yiddish") - continue - - cls.index_version(v, action=action) - print("Indexed Version {}/{}".format(vcount, total_versions)) - vcount += 1 - if for_es: - bulk(es_client, cls._bulk_actions, stats_only=True, raise_on_error=False) - - @classmethod - def index_version(cls, version, tries=0, action=None): - if not action: - action = cls._cache_action - try: - version.walk_thru_contents(action, heTref=cls.curr_index.get_title('he'), schema=cls.curr_index.schema, terms_dict=cls.terms_dict) - except pymongo.errors.AutoReconnect as e: - # Adding this because there is a mongo call for dictionary words in walk_thru_contents() - if tries < 200: - pytime.sleep(5) - print("Retrying {}. Try {}".format(version.title, tries)) - cls.index_version(version, tries+1) - else: - print("Tried {} times to get {}. 
I have failed you...".format(tries, version.title)) - raise e - except StopIteration: - print("Could not find dictionary node in {}".format(version.title)) - - @classmethod - def index_ref(cls, index_name, oref, version_title, lang): - # slower than `cls.index_version` but useful when you don't want the overhead of loading all versions into cache - cls.index_name = index_name - cls.curr_index = oref.index - try: - cls.best_time_period = cls.curr_index.best_time_period() - except ValueError: - cls.best_time_period = None - version_priority = 0 - hebrew_version_title = None - for priority, v in enumerate(cls.get_ref_version_list(oref)): - if v.versionTitle == version_title: - version_priority = priority - hebrew_version_title = getattr(v, 'versionTitleInHebrew', None) - content = TextChunk(oref, lang, vtitle=version_title).ja().flatten_to_string() - categories = cls.curr_index.categories - tref = oref.normal() - doc = cls.make_text_index_document(tref, oref.he_normal(), version_title, lang, version_priority, content, categories, hebrew_version_title) - id = make_text_doc_id(tref, version_title, lang) - es_client.index(index_name, doc, id=id) - - @classmethod - def _cache_action(cls, segment_str, tref, heTref, version): - # Index this document as a whole - vtitle = version.versionTitle - vlang = version.language - hebrew_version_title = getattr(version, 'versionTitleInHebrew', None) - try: - version_priority, categories = cls.version_priority_map[(version.title, vtitle, vlang)] - #TODO include sgement_str in this func - doc = cls.make_text_index_document(tref, heTref, vtitle, vlang, version_priority, segment_str, categories, hebrew_version_title) - # print doc - except Exception as e: - logger.error("Error making index document {} / {} / {} : {}".format(tref, vtitle, vlang, str(e))) - return - - if doc: - try: - cls._bulk_actions += [ - { - "_index": cls.index_name, - "_type": "text", - "_id": make_text_doc_id(tref, vtitle, vlang), - "_source": doc - } - ] - except Exception as e: - logger.error("ERROR indexing {} / {} / {} : {}".format(tref, vtitle, vlang, e)) - - @classmethod - def remove_footnotes(cls, content): - ftnotes = AbstractTextRecord.find_all_itags(content, only_footnotes=True)[1] - if len(ftnotes) == 0: - return content - else: - for sup_tag in ftnotes: - i_tag = sup_tag.next_sibling - content += f" {sup_tag.text} {i_tag.text}" - content = AbstractTextRecord.strip_itags(content) - return content - - @classmethod - def modify_text_in_doc(cls, content): - content = AbstractTextRecord.strip_imgs(content) - content = cls.remove_footnotes(content) - content = strip_cantillation(content, strip_vowels=False).strip() - content = re.sub(r'<[^>]+>', ' ', content) # replace HTML tags with space so that words dont get smushed together - content = re.sub(r'\([^)]+\)', ' ', content) # remove all parens - while " " in content: # make sure there are not many spaces in a row - content = content.replace(" ", " ") - return content - - @classmethod - def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories, hebrew_version_title): - """ - Create a document for indexing from the text specified by ref/version/lang - """ - # Don't bother indexing if there's no content - if not content: - return False - content = cls.modify_text_in_doc(content) - if len(content) == 0: - return False - - oref = Ref(tref) - - indexed_categories = get_search_categories(oref, categories) - - tp = cls.best_time_period - if tp is not None: - comp_start_date = int(tp.start) - else: - 
comp_start_date = 3000 # far in the future - - ref_data = RefData().load({"ref": tref}) - pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGESHEETRANK - - return { - "ref": tref, - "heRef": heTref, - "version": version, - "lang": lang, - "version_priority": version_priority if version_priority is not None else 1000, - "titleVariants": oref.index_node.all_tree_titles("en"), - "categories": indexed_categories, - "order": oref.order_id(), - "path": "/".join(indexed_categories + [cls.curr_index.title]), - "pagesheetrank": pagesheetrank, - "comp_date": comp_start_date, - #"hebmorph_semi_exact": content, - "exact": content, - "naive_lemmatizer": content, - 'hebrew_version_title': hebrew_version_title, - } - - -def index_sheets_by_timestamp(timestamp): - """ - :param timestamp str: index all sheets modified after `timestamp` (in isoformat) - """ - - name_dict = get_new_and_current_index_names('sheet', debug=False) - curr_index_name = name_dict['current'] - try: - ids = db.sheets.find({"status": "public", "dateModified": {"$gt": timestamp}}).distinct("id") - except Exception as e: - print(e) - return str(e) - - succeeded = [] - failed = [] - - for id in ids: - did_succeed = index_sheet(curr_index_name, id) - if did_succeed: - succeeded += [id] - else: - failed += [id] - - return {"succeeded": {"num": len(succeeded), "ids": succeeded}, "failed": {"num": len(failed), "ids": failed}} - - -def index_public_sheets(index_name): - """ - Index all source sheets that are publicly listed. - """ - ids = db.sheets.find({"status": "public"}).distinct("id") - for id in ids: - index_sheet(index_name, id) - - -def index_public_notes(): - """ - Index all public notes. - - TODO - """ - pass - - -def clear_index(index_name): - """ - Delete the search index. - """ - try: - index_client.delete(index=index_name) - except Exception as e: - print("Error deleting Elasticsearch Index named %s" % index_name) - print(e) - - -def add_ref_to_index_queue(ref, version, lang): - """ - Adds a text to index queue to be indexed later. - """ - qu.IndexQueue({ - "ref": ref, - "lang": lang, - "version": version, - "type": "ref", - }).save() - - return True - - -def index_from_queue(): - """ - Index every ref/version/lang found in the index queue. - Delete queue records on success. - """ - index_name = get_new_and_current_index_names('text')['current'] - queue = db.index_queue.find() - for item in queue: - try: - TextIndexer.index_ref(index_name, Ref(item["ref"]), item["version"], item["lang"], False) - db.index_queue.remove(item) - except Exception as e: - logging.error("Error indexing from queue ({} / {} / {}) : {}".format(item["ref"], item["version"], item["lang"], e)) - - -def add_recent_to_queue(ndays): - """ - Look through the last ndays of the activitiy log, - add to the index queue any refs that had their text altered. 
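
    For instance (a hypothetical run; the number is illustrative only):

        >>> add_recent_to_queue(2)   # re-queue everything altered in the last 2 days

    Each distinct (ref, version, language) triple is queued once, however many
    times it appears in the activity log, since entries are first collected
    into a set.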
- """ - cutoff = datetime.now() - timedelta(days=ndays) - query = { - "date": {"$gt": cutoff}, - "rev_type": {"$in": ["add text", "edit text"]} - } - activity = db.history.find(query) - refs = set() - for a in activity: - refs.add((a["ref"], a["version"], a["language"])) - for ref in list(refs): - add_ref_to_index_queue(ref[0], ref[1], ref[2]) - - -def get_new_and_current_index_names(type, debug=False): - base_index_name_dict = { - 'text': SEARCH_INDEX_NAME_TEXT, - 'sheet': SEARCH_INDEX_NAME_SHEET, - } - index_name_a = "{}-a{}".format(base_index_name_dict[type], '-debug' if debug else '') - index_name_b = "{}-b{}".format(base_index_name_dict[type], '-debug' if debug else '') - alias_name = "{}{}".format(base_index_name_dict[type], '-debug' if debug else '') - aliases = index_client.get_alias() - try: - a_alias = aliases[index_name_a]['aliases'] - choose_a = alias_name not in a_alias - except KeyError: - choose_a = True - - if choose_a: - new_index_name = index_name_a - old_index_name = index_name_b - else: - new_index_name = index_name_b - old_index_name = index_name_a - return {"new": new_index_name, "current": old_index_name, "alias": alias_name} - - -def index_all(skip=0, debug=False): - """ - Fully create the search index from scratch. - """ - start = datetime.now() - index_all_of_type('text', skip=skip, debug=debug) - index_all_of_type('sheet', skip=skip, debug=debug) - end = datetime.now() - db.index_queue.delete_many({}) # index queue is now stale - print("Elapsed time: %s" % str(end-start)) - - -def index_all_of_type(type, skip=0, debug=False): - index_names_dict = get_new_and_current_index_names(type=type, debug=debug) - print('CREATING / DELETING {}'.format(index_names_dict['new'])) - print('CURRENT {}'.format(index_names_dict['current'])) - for i in range(10): - print('STARTING IN T-MINUS {}'.format(10 - i)) - pytime.sleep(1) - - index_all_of_type_by_index_name(type, index_names_dict['new'], skip, debug) - - try: - #index_client.put_settings(index=index_names_dict['current'], body={"index": { "blocks": { "read_only_allow_delete": False }}}) - index_client.delete_alias(index=index_names_dict['current'], name=index_names_dict['alias']) - print("Successfully deleted alias {} for index {}".format(index_names_dict['alias'], index_names_dict['current'])) - except NotFoundError: - print("Failed to delete alias {} for index {}".format(index_names_dict['alias'], index_names_dict['current'])) - - clear_index(index_names_dict['alias']) # make sure there are no indexes with the alias_name - - #index_client.put_settings(index=index_names_dict['new'], body={"index": { "blocks": { "read_only_allow_delete": False }}}) - index_client.put_alias(index=index_names_dict['new'], name=index_names_dict['alias']) - - if index_names_dict['new'] != index_names_dict['current']: - clear_index(index_names_dict['current']) - - -def index_all_of_type_by_index_name(type, index_name, skip=0, debug=False): - if skip == 0: - create_index(index_name, type) - if type == 'text': - TextIndexer.clear_cache() - TextIndexer.index_all(index_name, debug=debug) - elif type == 'sheet': - index_public_sheets(index_name) \ No newline at end of file From e77b5468d1572ca10f020fbbf6a3689c1687e0cd Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Tue, 12 Dec 2023 14:43:15 +0200 Subject: [PATCH 07/17] chore: comment for the sake of git commit --- cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli.py b/cli.py index a39755cb53..a319391923 100644 --- a/cli.py +++ b/cli.py @@ -1,5 +1,5 @@ import django 
-django.setup() +django.setup() # comment for sake of git commit from sefaria.model import * import sefaria.system.database as database From 3c9a42c38be3490b59b113f61687a2787401a207 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Sun, 7 Jan 2024 11:37:40 +0200 Subject: [PATCH 08/17] fix(Source Editor): Prompts should be textareas not input elements --- static/js/AdminEditor.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/AdminEditor.jsx b/static/js/AdminEditor.jsx index 1d1ecea94a..d761f200b8 100644 --- a/static/js/AdminEditor.jsx +++ b/static/js/AdminEditor.jsx @@ -21,7 +21,7 @@ const options_for_form = { placeholder: "Add a description.", type: 'textarea' }, - "Prompt": {label: "Prompt", field: "prompt", placeholder: "Add a prompt.", textarea: true}, + "Prompt": {label: "Prompt", field: "prompt", placeholder: "Add a prompt.", type: 'textarea'}, "English Short Description": { label: "English Short Description for Table of Contents", field: "enCategoryDescription", placeholder: "Add a short description.", type: 'input' From d3e31c5045cd3316097fa6d1eb21c2a519326ec2 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Sun, 7 Jan 2024 12:28:37 +0200 Subject: [PATCH 09/17] fix(Markdown): markdown in admin editors was not being validated --- static/js/AdminEditor.jsx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/static/js/AdminEditor.jsx b/static/js/AdminEditor.jsx index 1d1ecea94a..d324d6b687 100644 --- a/static/js/AdminEditor.jsx +++ b/static/js/AdminEditor.jsx @@ -13,13 +13,15 @@ const options_for_form = { label: "English Description", field: "enDescription", placeholder: "Add a description.", - type: 'textarea' + type: 'textarea', + markdown: true, }, "Hebrew Description": { label: "Hebrew Description", field: "heDescription", placeholder: "Add a description.", - type: 'textarea' + type: 'textarea', + markdown: true }, "Prompt": {label: "Prompt", field: "prompt", placeholder: "Add a prompt.", textarea: true}, "English Short Description": { @@ -136,7 +138,7 @@ const AdminEditor = ({title, data, close, catMenu, pictureUploader, updateData, const preprocess = async () => { setValidatingLinks(true); for (const x of items) { - if (options_for_form[x]?.is_textarea) { + if (options_for_form[x]?.markdown) { const field = options_for_form[x].field; const valid_tags = await validateMarkdownLinks(data[field]); if (!valid_tags) { From 556baf6ec1b96c0210e7a8221a1bb7d56ed83260 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Thu, 25 Jan 2024 09:47:39 +0200 Subject: [PATCH 10/17] fix(TranslationBox): avoid sorting translations when there are none. --- static/js/TranslationsBox.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/TranslationsBox.jsx b/static/js/TranslationsBox.jsx index 117145f759..8a1d7e9723 100644 --- a/static/js/TranslationsBox.jsx +++ b/static/js/TranslationsBox.jsx @@ -34,7 +34,7 @@ class TranslationsBox extends Component { let currentVersionsByActualLangs = Sefaria.transformVersionObjectsToByActualLanguageKeys(this.props.currObjectVersions); for(let [lang,ver] of Object.entries(currentVersionsByActualLangs)){ if (!this._excludedLangs.includes(lang)) { - versionsByLang[lang].sort((a, b) => { + versionsByLang[lang]?.sort((a, b) => { return a.versionTitle === ver.versionTitle ? -1 : b.versionTitle === ver.versionTitle ? 
1 : 0; }); } From acc4102d712386de2449dc19aa852cc0806ce6aa Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 25 Jan 2024 11:07:43 +0200 Subject: [PATCH 11/17] helm(search): dont set Content-Type header. ES no longer requires this header to be set and it's problematic to set it to application/json because ES has a strange requirement that if Accept is set to "application/vnd.elasticsearch+json; compatible-with=8" (which is sent from our Python ES client) than Content-Type needs to be set to the same thing. --- helm-chart/sefaria-project/templates/configmap/nginx.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm-chart/sefaria-project/templates/configmap/nginx.yaml b/helm-chart/sefaria-project/templates/configmap/nginx.yaml index 0c2fba7288..9841b891db 100644 --- a/helm-chart/sefaria-project/templates/configmap/nginx.yaml +++ b/helm-chart/sefaria-project/templates/configmap/nginx.yaml @@ -117,7 +117,6 @@ data: # allow urls which aren't caught by regex above location /api/search/ { rewrite ^/(?:api/search)/(.*)$ /$1 break; - proxy_set_header Content-Type application/json; # es 6.0 requires this header proxy_set_header Authorization "Basic ${ELASTIC_AUTH_HEADER}"; add_header 'Access-Control-Allow-Origin' ''; proxy_pass http://elasticsearch_upstream/; From 9f0bf2348a88dc63f8745d89c64edb6b5f86cb8f Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 25 Jan 2024 12:46:35 +0200 Subject: [PATCH 12/17] fix(search): move encapsulation of SearchTotal to search.js This ensures that both Dicta and Sefaria queries are using the same format of SearchTotal. --- static/js/SearchResultList.jsx | 36 ++++---------------------------- static/js/sefaria/search.js | 20 +++++++++++------- static/js/sefaria/searchTotal.js | 23 ++++++++++++++++++++ 3 files changed, 39 insertions(+), 40 deletions(-) create mode 100644 static/js/sefaria/searchTotal.js diff --git a/static/js/SearchResultList.jsx b/static/js/SearchResultList.jsx index 9c37f9a520..d2b6f65aaa 100644 --- a/static/js/SearchResultList.jsx +++ b/static/js/SearchResultList.jsx @@ -6,7 +6,7 @@ import extend from 'extend'; import classNames from 'classnames'; import $ from './sefaria/sefariaJquery'; import Sefaria from './sefaria/sefaria'; -import { FilterNode } from './sefaria/search'; +import { SearchTotal } from "./sefaria/searchTotal"; import SearchTextResult from './SearchTextResult'; import SearchSheetResult from './SearchSheetResult'; import SearchFilters from './SearchFilters'; @@ -76,34 +76,6 @@ const SearchTopic = (props) => { } -class SearchTotal { - constructor({value=0, relation="eq"} = {}) { - this._value = value; - this._relation = relation; - } - getValue = () => this._value; - add = (num) => this._value += num; - asString = () => `${this._value.addCommas()}${this._getRelationString()}`; - _getRelationString = () => this._relation === 'gte' ? 
'+' : ''; - combine = (other) => { - if (!(other instanceof SearchTotal)) { - throw new TypeError('Parameter must be an instance of SearchTotal.'); - } - const newValue = this.getValue() + other.getValue(); - let newRelation = this._relation; - if (other._relation === 'gte' || this._relation === 'gte') { - newRelation = 'gte'; - } - return new SearchTotal({value: newValue, relation: newRelation}); - }; -} - - -function createSearchTotal(total) { - return new SearchTotal(total); -} - - class SearchResultList extends Component { constructor(props) { super(props); @@ -130,7 +102,7 @@ class SearchResultList extends Component { //console.log("Loaded cached query for") //console.log(args); this.state.hits[t] = this.state.hits[t].concat(cachedQuery.hits.hits); - this.state.totals[t] = createSearchTotal(cachedQuery.hits.total); + this.state.totals[t] = cachedQuery.hits.total; this.state.pagesLoaded[t] += 1; args.start = this.state.pagesLoaded[t] * this.querySize[t]; if (t === "text") { @@ -350,7 +322,7 @@ class SearchResultList extends Component { args.success = data => { this.updateRunningQuery(type, null); if (this.state.pagesLoaded[type] === 0) { // Skip if pages have already been loaded from cache, but let aggregation processing below occur - const currTotal = createSearchTotal(data.hits.total); + const currTotal = data.hits.total; let state = { hits: extend(this.state.hits, {[type]: data.hits.hits}), totals: extend(this.state.totals, {[type]: currTotal}), @@ -363,7 +335,7 @@ class SearchResultList extends Component { }); const filter_label = (request_applied && request_applied.length > 0) ? (' - ' + request_applied.join('|')) : ''; const query_label = props.query + filter_label; - Sefaria.track.event("Search", `${this.props.searchInBook? "SidebarSearch ": ""}Query: ${type}`, query_label, createSearchTotal(data.hits.total).getValue()); + Sefaria.track.event("Search", `${this.props.searchInBook? 
"SidebarSearch ": ""}Query: ${type}`, query_label, data.hits.total.getValue()); } if (data.aggregations) { diff --git a/static/js/sefaria/search.js b/static/js/sefaria/search.js index 6506574bed..1dda4f42f7 100644 --- a/static/js/sefaria/search.js +++ b/static/js/sefaria/search.js @@ -2,6 +2,7 @@ import $ from './sefariaJquery'; import extend from 'extend'; import FilterNode from './FilterNode'; import SearchState from './searchState'; +import { SearchTotal } from "./searchTotal"; class Search { @@ -9,8 +10,8 @@ class Search { this.searchIndexText = searchIndexText; this.searchIndexSheet = searchIndexSheet; this._cache = {}; - this.sefariaQueryQueue = {hits: {hits:[], total: 0, max_score: 0.0}, lastSeen: -1}; - this.dictaQueryQueue = {lastSeen: -1, hits: {total: 0, hits:[]}}; + this.sefariaQueryQueue = {hits: {hits:[], total: new SearchTotal(), max_score: 0.0}, lastSeen: -1}; + this.dictaQueryQueue = {lastSeen: -1, hits: {total: new SearchTotal(), hits:[]}}; this.queryDictaFlag = true; this.dictaCounts = null; this.sefariaSheetsResult = null; @@ -47,6 +48,7 @@ class Search { processData: false, dataType: 'json', success: data => { + data.hits.total = new SearchTotal(data.hits.total); this.cache(cacheKey, data); resolve(data) }, @@ -103,7 +105,7 @@ class Search { return new Promise((resolve, reject) => { if (this.queryDictaFlag && args.type === "text") { - if (this.dictaQueryQueue.lastSeen + 1 >= this.dictaQueryQueue.hits.total && ('start' in args && args['start'] > 0)) { + if (this.dictaQueryQueue.lastSeen + 1 >= this.dictaQueryQueue.hits.total.getValue() && ('start' in args && args['start'] > 0)) { /* don't make new queries if results are exhausted. * 'start' is omitted on first query (defaults to 0). On a first query, we'll always want to query. */ @@ -125,6 +127,7 @@ class Search { contentType: 'application/json; charset=UTF-8', data: jsonData, success: data => { + data.total = new SearchTotal({value: data.total}); this.cache(cacheKey, data); resolve(data); }, @@ -134,7 +137,7 @@ class Search { } else { - resolve({total: 0, hits: []}); + resolve({total: new SearchTotal(), hits: []}); } }).then(x => { if (args.type === "sheet") { @@ -244,15 +247,16 @@ class Search { } if (!!filters.length) { const expression = new RegExp(`^(${filters.join('|')})(\/.*|$)`); - result.hits.total = this.buckets.reduce((total, currentBook) => { + const accumulatedTotal = this.buckets.reduce((total, currentBook) => { if (expression.test(currentBook.key)) { total += currentBook.doc_count; } return total }, 0); + result.hits.total = new SearchTotal({value: accumulatedTotal}); } else { - result.hits.total = this.sefariaQueryQueue.hits.total + this.dictaQueryQueue.hits.total; + result.hits.total = this.sefariaQueryQueue.hits.total.combine(this.dictaQueryQueue.hits.total); } let sefariaHits = (this.queryDictaFlag) @@ -327,8 +331,8 @@ class Search { if (args.type === 'text') { this.dictaCounts = null; this.queryDictaFlag = this.isDictaQuery(args); - this.sefariaQueryQueue = {hits: {hits: [], total: 0, max_score: 0.0}, lastSeen: -1}; - this.dictaQueryQueue = {lastSeen: -1, hits: {total: 0, hits: []}}; + this.sefariaQueryQueue = {hits: {hits: [], total: new SearchTotal(), max_score: 0.0}, lastSeen: -1}; + this.dictaQueryQueue = {lastSeen: -1, hits: {total: new SearchTotal(), hits: []}}; this.queryAborter.abort(); } } diff --git a/static/js/sefaria/searchTotal.js b/static/js/sefaria/searchTotal.js new file mode 100644 index 0000000000..e07f5772ad --- /dev/null +++ b/static/js/sefaria/searchTotal.js @@ -0,0 +1,23 @@ 
+export class SearchTotal { + constructor({value=0, relation="eq"} = {}) { + this._value = value; + this._relation = relation; + } + getValue = () => this._value; + add = (num) => this._value += num; + asString = () => `${this._value.addCommas()}${this._getRelationString()}`; + _getRelationString = () => this._relation === 'gte' ? '+' : ''; + combine = (other) => { + if (!(other instanceof SearchTotal)) { + throw new TypeError('Parameter must be an instance of SearchTotal.'); + } + const newValue = this.getValue() + other.getValue(); + let newRelation = this._relation; + if (other._relation === 'gte' || this._relation === 'gte') { + newRelation = 'gte'; + } + return new SearchTotal({value: newValue, relation: newRelation}); + }; +} + + From 6882972aafb8d2e867724111b2a2c93f0e543015 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 25 Jan 2024 13:04:16 +0200 Subject: [PATCH 13/17] chore(search): split up long line --- static/js/sefaria/search.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/static/js/sefaria/search.js b/static/js/sefaria/search.js index 1dda4f42f7..4000acea6d 100644 --- a/static/js/sefaria/search.js +++ b/static/js/sefaria/search.js @@ -105,7 +105,8 @@ class Search { return new Promise((resolve, reject) => { if (this.queryDictaFlag && args.type === "text") { - if (this.dictaQueryQueue.lastSeen + 1 >= this.dictaQueryQueue.hits.total.getValue() && ('start' in args && args['start'] > 0)) { + if (this.dictaQueryQueue.lastSeen + 1 >= this.dictaQueryQueue.hits.total.getValue() && + ('start' in args && args['start'] > 0)) { /* don't make new queries if results are exhausted. * 'start' is omitted on first query (defaults to 0). On a first query, we'll always want to query. */ From ab37239d611505767e52def465346b19bf986747 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sat, 27 Jan 2024 22:39:54 +0200 Subject: [PATCH 14/17] fix(linker): fix order of parameters --- sefaria/helper/linker.py | 2 +- sefaria/helper/tests/linker_test.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 998f346a05..ceb5fc15a0 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -52,8 +52,8 @@ class _FindRefsTextOptions: @attr version_preferences_by_corpus: dict of dicts of the form { : { : }} """ - debug: bool = False with_text: bool = False + debug: bool = False max_segments: int = 0 version_preferences_by_corpus: dict = None diff --git a/sefaria/helper/tests/linker_test.py b/sefaria/helper/tests/linker_test.py index 4a0b96c41a..94835d2a09 100644 --- a/sefaria/helper/tests/linker_test.py +++ b/sefaria/helper/tests/linker_test.py @@ -131,8 +131,8 @@ def test_find_refs_text(self, mock_is_hebrew: Mock): assert find_refs_text.lang == 'en' def test_find_refs_text_options(self): - find_refs_text_options = linker._FindRefsTextOptions(True, True, 10, {}) - assert find_refs_text_options.debug + find_refs_text_options = linker._FindRefsTextOptions(True, False, 10, {}) + assert not find_refs_text_options.debug assert find_refs_text_options.with_text assert find_refs_text_options.max_segments == 10 assert find_refs_text_options.version_preferences_by_corpus == {} From 600dc62d3390a606f6bb2aa52e76afa4da63c91c Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Tue, 30 Jan 2024 12:58:23 +0200 Subject: [PATCH 15/17] chore(alt tags): add alt tags for globe, bookmarks and notifications icons. 
--- static/js/Header.jsx | 23 +++++++++++++++++++---- static/js/Misc.jsx | 4 +++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/static/js/Header.jsx b/static/js/Header.jsx index f252297076..42f76a8f53 100644 --- a/static/js/Header.jsx +++ b/static/js/Header.jsx @@ -16,6 +16,21 @@ import { } from './Misc'; +function getAltText(name) { + const alts = { + 'notifications': { + 'english': 'Notifications', + 'hebrew': 'עדכונים' + }, + 'bookmarks': { + 'english': 'Bookmarks', + 'hebrew': 'מועדפים' + }, + }; + return alts[name]?.[Sefaria.interfaceLang]; +} + + class Header extends Component { constructor(props) { super(props) @@ -439,10 +454,10 @@ const LoggedInButtons = ({headerMode}) => { return (
- + {getAltText('bookmarks')}/ - + {getAltText('notifications')} { Sefaria._siteSettings.TORAH_SPECIFIC ? : null} @@ -502,11 +517,11 @@ const MobileNavMenu = ({onRefClick, showSearch, openTopic, openURL, close, visib Profile - + {getAltText('bookmarks')} Saved & History - + {getAltText('notifications')} Notifications : null } diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index d5fab55bbc..11d6dc37ec 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1413,9 +1413,11 @@ function InterfaceLanguageMenu({currentLang, translationLanguagePreference, setT }; }, []); + const globeAlt = (Sefaria.interfaceLang === 'english') ? 'Toggle Interface Language Menu' : 'פתח תפריט שפת ממשק'; + return (
- + {globeAlt}/
Site Language From d61a408f3d21c5760f431b5aae3118f57679080e Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Tue, 30 Jan 2024 15:47:58 +0200 Subject: [PATCH 16/17] refactor(alt tags): move strings to strings.js. --- static/js/Header.jsx | 23 ++++------------------- static/js/Misc.jsx | 4 +--- static/js/sefaria/strings.js | 5 +++++ 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/static/js/Header.jsx b/static/js/Header.jsx index 42f76a8f53..c793f66ea1 100644 --- a/static/js/Header.jsx +++ b/static/js/Header.jsx @@ -16,21 +16,6 @@ import { } from './Misc'; -function getAltText(name) { - const alts = { - 'notifications': { - 'english': 'Notifications', - 'hebrew': 'עדכונים' - }, - 'bookmarks': { - 'english': 'Bookmarks', - 'hebrew': 'מועדפים' - }, - }; - return alts[name]?.[Sefaria.interfaceLang]; -} - - class Header extends Component { constructor(props) { super(props) @@ -454,10 +439,10 @@ const LoggedInButtons = ({headerMode}) => { return (
- {getAltText('bookmarks')}/ + {Sefaria._('Bookmarks')}/ - {getAltText('notifications')} + {Sefaria._('Notifications')} { Sefaria._siteSettings.TORAH_SPECIFIC ? : null} @@ -517,11 +502,11 @@ const MobileNavMenu = ({onRefClick, showSearch, openTopic, openURL, close, visib Profile - {getAltText('bookmarks')} + {Sefaria._('Bookmarks')} Saved & History - {getAltText('notifications')} + {Sefaria._('Notifications')} Notifications : null } diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index 11d6dc37ec..4abc2a61b3 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1413,11 +1413,9 @@ function InterfaceLanguageMenu({currentLang, translationLanguagePreference, setT }; }, []); - const globeAlt = (Sefaria.interfaceLang === 'english') ? 'Toggle Interface Language Menu' : 'פתח תפריט שפת ממשק'; - return (
- {globeAlt}/ + {Sefaria._('Toggle
Site Language diff --git a/static/js/sefaria/strings.js b/static/js/sefaria/strings.js index e04a9e1ee5..e37212bf9d 100644 --- a/static/js/sefaria/strings.js +++ b/static/js/sefaria/strings.js @@ -536,6 +536,11 @@ const Strings = { 'Citing': 'מצטט', 'Sites that are listed here use the': 'אתרים המפורטים כאן משתמשים', 'Sefaria Linker': 'במרשתת ההפניות', + + //alt tags + 'Notifications': 'עדכונים', + 'Bookmarks': 'שמורים', + 'Toggle Interface Language Menu' : 'פתח תפריט שפת ממשק', }, _i18nInterfaceStringsWithContext: { From 9f6b71eb840e7fdbdc2bc50368e0639e9004831f Mon Sep 17 00:00:00 2001 From: Russel Neiss Date: Tue, 30 Jan 2024 14:14:54 -0600 Subject: [PATCH 17/17] fix(api): patches a bug where setting a url param to false or 0 would return true at certain API endpoints. --- api/views.py | 2 +- reader/views.py | 22 +++++++++++----------- sourcesheets/views.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/api/views.py b/api/views.py index 0e714da047..0b5a3a5202 100644 --- a/api/views.py +++ b/api/views.py @@ -48,7 +48,7 @@ def get(self, request, *args, **kwargs): if not versions_params: versions_params = ['primary'] versions_params = [self.split_piped_params(param_str) for param_str in versions_params] - fill_in_missing_segments = request.GET.get('fill_in_missing_segments', False) + fill_in_missing_segments = bool(int(request.GET.get('fill_in_missing_segments', False))) return_format = request.GET.get('return_format', 'default') if return_format not in self.RETURN_FORMATS: return jsonResponse({'error': f'return_format should be one of those formats: {self.RETURN_FORMATS}.'}, status=400) diff --git a/reader/views.py b/reader/views.py index 59d30236db..1f8cf7386a 100644 --- a/reader/views.py +++ b/reader/views.py @@ -1645,10 +1645,10 @@ def index_api(request, title, raw=False): API for manipulating text index records (aka "Text Info") """ if request.method == "GET": - with_content_counts = bool(request.GET.get("with_content_counts", False)) + with_content_counts = bool(int(request.GET.get("with_content_counts", False))) i = library.get_index(title).contents(raw=raw, with_content_counts=with_content_counts) - if request.GET.get("with_related_topics", False): + if bool(int(request.GET.get("with_related_topics", False))): i["relatedTopics"] = get_topics_for_book(title, annotate=True) return jsonResponse(i, callback=request.GET.get("callback", None)) @@ -1862,7 +1862,7 @@ def _collapse_book_leaf_shapes(leaf_shapes): else: cat_list = title.split("/") depth = request.GET.get("depth", 2) - include_dependents = request.GET.get("dependents", False) + include_dependents = bool(int(request.GET.get("dependents", False))) indexes = [] if len(cat_list) == 1: # try as corpus @@ -2067,7 +2067,7 @@ def notes_api(request, note_id_or_ref): raise Http404 oref = Ref(note_id_or_ref) cb = request.GET.get("callback", None) - private = request.GET.get("private", False) + private = bool(int(request.GET.get("private", False))) res = get_notes(oref, uid=creds["user_id"], public=(not private)) return jsonResponse(res, cb) @@ -2141,7 +2141,7 @@ def protected_note_post(req): @catch_error_as_json def all_notes_api(request): - private = request.GET.get("private", False) + private = bool(int(request.GET.get("private", False))) if private: if not request.user.is_authenticated: res = {"error": "You must be logged in to access you notes."} @@ -2157,17 +2157,17 @@ def related_api(request, tref): """ Single API to bundle available content related to `tref`. 
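
    The `private` flag below is expected as "1"/"0". A plain bool() over the
    raw query-string value is the bug this patch fixes: any non-empty string,
    including "0" and "false", is truthy in Python. For example:

        bool("0")       # True  -- the old behavior
        bool(int("0"))  # False -- the patched coercion

    (Note that int() raises ValueError on non-numeric strings like "false".)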
""" - if request.GET.get("private", False) and request.user.is_authenticated: + if bool(int(request.GET.get("private", False))) and request.user.is_authenticated: oref = Ref(tref) response = { "sheets": get_sheets_for_ref(tref, uid=request.user.id), "notes": get_notes(oref, uid=request.user.id, public=False) } - elif request.GET.get("private", False) and not request.user.is_authenticated: + elif bool(int(request.GET.get("private", False))) and not request.user.is_authenticated: response = {"error": "You must be logged in to access private content."} else: response = { - "links": get_links(tref, with_text=False, with_sheet_links=request.GET.get("with_sheet_links", False)), + "links": get_links(tref, with_text=False, with_sheet_links=bool(int(request.GET.get("with_sheet_links", False)))), "sheets": get_sheets_for_ref(tref), "notes": [], # get_notes(oref, public=True) # Hiding public notes for now "webpages": get_webpages_for_ref(tref), @@ -2660,7 +2660,7 @@ def name_api(request, name): name = name[1:] if topic_override else name # Number of results to return. 0 indicates no limit LIMIT = int(request.GET.get("limit", 10)) - ref_only = request.GET.get("ref_only", False) + ref_only = bool(int(request.GET.get("ref_only", False))) completions_dict = get_name_completions(name, LIMIT, ref_only, topic_override) ref = completions_dict["ref"] topic = completions_dict["topic"] @@ -2764,7 +2764,7 @@ def user_stats_api(request, uid): assert request.method == "GET", "Unsupported Method" u = request.user assert (u.is_active and u.is_staff) or (int(uid) == u.id) - quick = bool(request.GET.get("quick", False)) + quick = bool(int(request.GET.get("quick", False))) if quick: return jsonResponse(public_user_data(uid)) return jsonResponse(user_stats_data(uid)) @@ -4599,7 +4599,7 @@ def isNodeJsReachable(): except Exception as e: logger.warn(f"Failed node healthcheck. Error: {e}") return False - + def is_database_reachable(): try: from sefaria.system.database import db diff --git a/sourcesheets/views.py b/sourcesheets/views.py index ea5d874d72..1340dc9111 100644 --- a/sourcesheets/views.py +++ b/sourcesheets/views.py @@ -960,7 +960,7 @@ def all_sheets_api(request, limiter, offset=0): limiter = int(limiter) offset = int(offset) lang = request.GET.get("lang") - filtered = request.GET.get("filtered", False) + filtered = bool(int(request.GET.get("filtered", False))) response = public_sheets(limit=limiter, skip=offset, lang=lang, filtered=filtered) response = jsonResponse(response, callback=request.GET.get("callback", None)) return response