diff --git a/build/ci/integration-values.yaml b/build/ci/integration-values.yaml index cbdf7b6fd0..e8619f44c7 100644 --- a/build/ci/integration-values.yaml +++ b/build/ci/integration-values.yaml @@ -57,7 +57,7 @@ localSettings: DEBUG: true DOMAIN_LANGUAGE: {} APSCHEDULER_NAME: "apscheduler-{{ .Values.deployEnv }}" - SEARCH_ADMIN: "http://elasticsearch-data:9200" + SEARCH_URL: "http://elasticsearch-data:9200" TURN_SERVER: '' USE_CLOUDFLARE: false FRONT_END_URL: "http://${NAME}.integration.sefaria.org" diff --git a/build/ci/production-values.yaml b/build/ci/production-values.yaml index 01dc60245b..e0fcd54414 100644 --- a/build/ci/production-values.yaml +++ b/build/ci/production-values.yaml @@ -163,6 +163,7 @@ instrumentation: enabled: false otelEndpoint: "http://otel-collector-collector.monitoring:4317" jaegerEndpoint: "jaeger-agent-dev.monitoring.svc.cluster.local:6831" + localSettings: DEBUG: false DOMAIN_LANGUAGE: { diff --git a/build/ci/sandbox-values.yaml b/build/ci/sandbox-values.yaml index 776d232677..12dcce6766 100644 --- a/build/ci/sandbox-values.yaml +++ b/build/ci/sandbox-values.yaml @@ -53,7 +53,7 @@ localSettings: DEBUG: false DOMAIN_LANGUAGE: {} APSCHEDULER_NAME: "apscheduler-{{ .Values.deployEnv }}" - SEARCH_ADMIN: "http://elasticsearch-data:9200" + SEARCH_URL: "http://elasticsearch-data:9200" TURN_SERVER: '' USE_CLOUDFLARE: false FRONT_END_URL: "http://${NAME}.cauldron.sefaria.org" diff --git a/helm-chart/sefaria-project/templates/_helpers.tpl b/helm-chart/sefaria-project/templates/_helpers.tpl index 0e7c9ecb57..ce020c7bb7 100644 --- a/helm-chart/sefaria-project/templates/_helpers.tpl +++ b/helm-chart/sefaria-project/templates/_helpers.tpl @@ -54,6 +54,22 @@ elastic-certificate-{{ .Values.deployEnv }} {{- end }} {{- end }} +{{- define "sefaria.secrets.elasticUser" }} +{{- if .Values.secrets.elasticUser.ref -}} +{{- .Values.secrets.elasticUser.ref }} +{{- else -}} +elastic-user-{{ .Values.deployEnv }} +{{- end }} +{{- end }} + +{{- define "sefaria.secrets.elasticAdmin" }} +{{- if .Values.secrets.elasticAdmin.ref -}} +{{- .Values.secrets.elasticAdmin.ref }} +{{- else -}} +elastic-admin-{{ .Values.deployEnv }} +{{- end }} +{{- end }} + {{- define "sefaria.secrets.originTls" }} {{- if .Values.ingress.secrets.originTls.ref -}} diff --git a/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml b/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml index 4a3a9bb6ac..3c57402e4e 100644 --- a/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml +++ b/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml @@ -136,16 +136,11 @@ data: } SERVER_EMAIL = os.getenv("SERVER_EMAIL") - SEARCH_HOST = "/api/search" - SEARCH_ADMIN = os.getenv("SEARCH_ADMIN") - SEARCH_ADMIN_USER = os.getenv("SEARCH_ADMIN_USER") - SEARCH_ADMIN_PW = os.getenv("SEARCH_ADMIN_PW") - SEARCH_ADMIN_K8S = os.getenv("SEARCH_ADMIN_K8S") + auth_str = f'{os.getenv("ELASTIC_USERNAME")}:{os.getenv("ELASTIC_PASSWORD")}@' if os.getenv("ELASTIC_USERNAME") else '' + SEARCH_URL = f'http://{auth_str}{os.getenv("SEARCH_HOST")}:9200' SEARCH_INDEX_ON_SAVE = True - SEARCH_INDEX_NAME = "sefaria" SEARCH_INDEX_NAME_TEXT = 'text' # name of the ElasticSearch index to use SEARCH_INDEX_NAME_SHEET = 'sheet' - SEARCH_INDEX_NAME_MERGED = 'merged' TURN_SERVER = os.getenv("TURN_SERVER") #coturn.cauldron.sefaria.org TURN_SECRET= os.getenv("TURN_SECRET") diff --git a/helm-chart/sefaria-project/templates/configmap/local-settings.yaml 
b/helm-chart/sefaria-project/templates/configmap/local-settings.yaml index ffe59971d5..aa39d2ddc4 100644 --- a/helm-chart/sefaria-project/templates/configmap/local-settings.yaml +++ b/helm-chart/sefaria-project/templates/configmap/local-settings.yaml @@ -9,7 +9,6 @@ data: DEBUG: "{{ .Values.localSettings.DEBUG }}" DOMAIN_LANGUAGE: {{ .Values.localSettings.DOMAIN_LANGUAGE | toJson | quote }} APSCHEDULER_NAME: {{ tpl .Values.localSettings.APSCHEDULER_NAME . | quote }} - SEARCH_ADMIN: "http://{{ .Values.nginx.SEARCH_HOST }}:9200" TURN_SERVER: {{ .Values.localSettings.TURN_SERVER | quote }} USE_CLOUDFLARE: "{{ .Values.localSettings.USE_CLOUDFLARE }}" FRONT_END_URL: {{ .Values.localSettings.FRONT_END_URL | quote }} @@ -26,3 +25,4 @@ data: SENTRY_ENVIRONMENT: {{ .Values.deployEnv | quote }} SENTRY_CODE_VERSION: {{ .Values.web.containerImage.tag }} FAIL_GRACEFULLY: "{{ .Values.localSettings.FAIL_GRACEFULLY }}" + SEARCH_HOST: {{ .Values.nginx.SEARCH_HOST | quote }} diff --git a/helm-chart/sefaria-project/templates/configmap/nginx.yaml b/helm-chart/sefaria-project/templates/configmap/nginx.yaml index 4a167845e9..0c2fba7288 100644 --- a/helm-chart/sefaria-project/templates/configmap/nginx.yaml +++ b/helm-chart/sefaria-project/templates/configmap/nginx.yaml @@ -6,7 +6,43 @@ metadata: deployEnv: "{{ .Values.deployEnv }}" {{- include "sefaria.labels" . | nindent 4 }} data: + {{- if .Values.instrumentation.enabled }} + opentracing.json: |- + { + "service_name": "nginx-{{ .Values.deployEnv }}", + "propagation_format": "jaeger", + "sampler": { + "type": "const", + "param": 1, + "samplingServerURL": "http://127.0.0.1:5778/sampling" + }, + "reporter": { + "endpoint": "", + "localAgentHostPort": "{{ .Values.instrumentation.jaegerEndpoint }}" + }, + "headers": { + "TraceContextHeaderName": "", + "jaegerDebugHeader": "", + "jaegerBaggageHeader": "", + "traceBaggageHeaderPrefix": "" + } + } + {{- end }} + entrypoint.sh: | + #!/bin/bash + + set -e + + export ELASTIC_AUTH_HEADER=$(echo -n $ELASTIC_USERNAME:$ELASTIC_PASSWORD | base64) + envsubst '${ENV_NAME},${VARNISH_HOST},${SEARCH_HOST},${RELEASE_TAG},${STRAPI_LOCATION},${ELASTIC_AUTH_HEADER}{{- if .Values.linker.enabled }},${LINKER_HOST}{{- end }}{{- if .Values.instrumentation.enabled }},${NGINX_VERSION}{{- end }}' < /conf/nginx.template.conf > /nginx.conf + + nginx -c /nginx.conf -g 'daemon off;' + nginx.template.conf: |- + {{- if .Values.instrumentation.enabled }} + load_module /etc/nginx/modules/ngx_http_opentracing_module.so; + {{- end }} + user www-data; worker_processes 8; error_log /var/log/nginx/error.log warn; @@ -18,6 +54,10 @@ data: } http { + {{- if .Values.instrumentation.enabled }} + opentracing_load_tracer /usr/local/lib/libjaegertracing_plugin.so /etc/nginx/opentracing.json; + {{- end }} + # https://nginx.org/en/docs/varindex.html log_format structured '{ "requestDuration": $request_time, "envName": "${ENV_NAME}", "stackComponent": "nginx", "host": "$hostname", "severity": "info", "httpRequest": { "requestMethod": "$request_method", "requestUrl": "$request_uri", "requestSize": $request_length, "status": $status, "responseSize": $body_bytes_sent, "userAgent": "$http_user_agent", "remoteIp": "$http_x_original_forwarded_for", "referer": "$http_referer", "latency": ${request_time}s, "protocol": "$server_protocol", "forwardedHTTP": "$http_x_forwarded_proto" }, "remoteUser": "$remote_user", "timeLocal": "$time_local" }'; access_log /dev/stdout structured; @@ -78,6 +118,7 @@ data: location /api/search/ { rewrite ^/(?:api/search)/(.*)$ /$1 break; 
proxy_set_header Content-Type application/json; # es 6.0 requires this header + proxy_set_header Authorization "Basic ${ELASTIC_AUTH_HEADER}"; add_header 'Access-Control-Allow-Origin' ''; proxy_pass http://elasticsearch_upstream/; } @@ -108,6 +149,10 @@ } location / { + {{- if .Values.instrumentation.enabled }} + opentracing on; + opentracing_propagate_context; + {{- end }} proxy_send_timeout 300; proxy_read_timeout 300; proxy_set_header Host $host; diff --git a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml new file mode 100644 index 0000000000..9d15f9fb38 --- /dev/null +++ b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml @@ -0,0 +1,77 @@ +{{- if .Values.cronJobs.reindexElasticSearch.enabled }} +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ .Values.deployEnv }}-reindex-elastic-search-es6 + labels: + {{- include "sefaria.labels" . | nindent 4 }} +spec: + schedule: "20 13 * * 0" + jobTemplate: + spec: + backoffLimit: 1 + template: + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app + operator: In + values: + - mongo + topologyKey: kubernetes.io/hostname + containers: + - name: reindex-elastic-search-es6 + image: "{{ .Values.web.containerImage.imageRegistry }}:{{ .Values.web.containerImage.tag }}" + resources: + limits: + memory: 9Gi + requests: + memory: 7Gi + env: + - name: SEARCH_HOST + value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES6 }}" + - name: REDIS_HOST + value: "redis-{{ .Values.deployEnv }}" + - name: NODEJS_HOST + value: "node-{{ .Values.deployEnv }}-{{ .Release.Revision }}" + - name: VARNISH_HOST + value: "varnish-{{ .Values.deployEnv }}-{{ .Release.Revision }}" + - name: SLACK_URL + valueFrom: + secretKeyRef: + name: {{ template "sefaria.secrets.slackWebhook" . }} + key: slack-webhook + envFrom: + - secretRef: + name: {{ .Values.secrets.localSettings.ref }} + optional: true + - configMapRef: + name: local-settings-{{ .Values.deployEnv }} + - secretRef: + name: local-settings-secrets-{{ .Values.deployEnv }} + optional: true + volumeMounts: + - mountPath: /app/sefaria/local_settings.py + name: local-settings + subPath: local_settings.py + readOnly: true + command: ["bash"] + args: [ + "-c", + "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/reindex_elasticsearch_cronjob_ES6.py" + ] + restartPolicy: Never + volumes: + - name: local-settings + configMap: + name: local-settings-file-{{ .Values.deployEnv }} + items: + - key: local_settings.py + path: local_settings.py + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 2 +{{- end }} diff --git a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml index ec017b4ca6..72f4f0efb0 100644 --- a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml +++ b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml @@ -7,7 +7,7 @@ metadata: labels: {{- include "sefaria.labels" .
| nindent 4 }} spec: - schedule: "20 13 * * 0" + schedule: "20 13 * * 2" jobTemplate: spec: backoffLimit: 1 @@ -32,6 +32,8 @@ spec: requests: memory: 7Gi env: + - name: SEARCH_HOST + value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES8 }}" - name: REDIS_HOST value: "redis-{{ .Values.deployEnv }}" - name: NODEJS_HOST @@ -44,14 +46,16 @@ spec: name: {{ template "sefaria.secrets.slackWebhook" . }} key: slack-webhook envFrom: + - secretRef: + name: {{ template "sefaria.secrets.elasticAdmin" . }} - secretRef: name: {{ .Values.secrets.localSettings.ref }} optional: true + - configMapRef: + name: local-settings-{{ .Values.deployEnv }} - secretRef: name: local-settings-secrets-{{ .Values.deployEnv }} optional: true - - configMapRef: - name: local-settings-{{ .Values.deployEnv }} volumeMounts: - mountPath: /app/sefaria/local_settings.py name: local-settings @@ -60,7 +64,7 @@ spec: command: ["bash"] args: [ "-c", - "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/reindex_elasticsearch_cronjob.py" + "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy elasticsearch==8.8.2 git+https://github.com/Sefaria/elasticsearch-dsl-py@v8.0.0#egg=elasticsearch-dsl && /app/run /app/scripts/reindex_elasticsearch_cronjob.py" ] restartPolicy: Never volumes: diff --git a/helm-chart/sefaria-project/templates/rollout/nginx.yaml b/helm-chart/sefaria-project/templates/rollout/nginx.yaml index b2f926d5d7..e2e2334473 100644 --- a/helm-chart/sefaria-project/templates/rollout/nginx.yaml +++ b/helm-chart/sefaria-project/templates/rollout/nginx.yaml @@ -52,10 +52,7 @@ spec: - name: nginx image: "{{ .Values.nginx.containerImage.imageRegistry }}:{{ .Values.nginx.containerImage.tag }}" imagePullPolicy: Always - command: ["bash", "-c"] - # https://serverfault.com/questions/577370/how-can-i-use-environment-variables-in-nginx-conf - args: [ "envsubst '${ENV_NAME},${VARNISH_HOST},${SEARCH_HOST},${RELEASE_TAG}{{- if .Values.linker.enabled }},${LINKER_HOST}{{- end }}' < /conf/nginx.template.conf > /nginx.conf && exec nginx -c /nginx.conf -g 'daemon off;'" ] - args: [ "envsubst '${ENV_NAME},${VARNISH_HOST},${SEARCH_HOST},${RELEASE_TAG},${STRAPI_LOCATION}{{- if .Values.linker.enabled }},${LINKER_HOST}{{- end }}{{- if .Values.instrumentation.enabled }},${NGINX_VERSION}{{- end }}' < /conf/nginx.template.conf > /nginx.conf && exec nginx -c /nginx.conf -g 'daemon off;'" ] + command: ["bash", "-c", "/usr/src/entrypoint.sh"] ports: - containerPort: 80 - containerPort: 443 @@ -73,9 +70,18 @@ spec: periodSeconds: 30 resources: {{ toYaml .Values.nginx.resources | nindent 10 }} volumeMounts: - - mountPath: /conf + - mountPath: /conf/nginx.template.conf name: nginx-conf + subPath: nginx.template.conf readOnly: true + - mountPath: /usr/src/entrypoint.sh + name: nginx-conf + subPath: entrypoint.sh + {{- if .Values.instrumentation.enabled }} + - mountPath: /etc/nginx/opentracing.json + name: nginx-conf + subPath: opentracing.json + {{- end }} - mountPath: /app/robots.txt name: robots-txt readOnly: true @@ -101,6 +107,8 @@ spec: value: "linker-{{ .Values.deployEnv }}-{{ .Release.Revision }}" {{- end }} envFrom: + - secretRef: + name: {{ template "sefaria.secrets.elasticUser" . 
}} - configMapRef: name: local-settings-nginx-{{ .Values.deployEnv }} optional: true @@ -111,6 +119,7 @@ spec: - name: nginx-conf configMap: name: nginx-conf-{{ .Values.deployEnv }} + defaultMode: 0755 - name: robots-txt configMap: name: robots-txt-{{ .Values.deployEnv }} diff --git a/helm-chart/sefaria-project/templates/rollout/web.yaml b/helm-chart/sefaria-project/templates/rollout/web.yaml index 06b0a24462..094ce34bb3 100644 --- a/helm-chart/sefaria-project/templates/rollout/web.yaml +++ b/helm-chart/sefaria-project/templates/rollout/web.yaml @@ -98,6 +98,8 @@ spec: - name: HELM_REVISION value: "{{ .Release.Revision }}" envFrom: + - secretRef: + name: {{ template "sefaria.secrets.elasticUser" . }} - secretRef: name: {{ .Values.secrets.localSettings.ref }} optional: true diff --git a/helm-chart/sefaria-project/templates/secret/elastic-admin.yaml b/helm-chart/sefaria-project/templates/secret/elastic-admin.yaml new file mode 100644 index 0000000000..d6a3266af9 --- /dev/null +++ b/helm-chart/sefaria-project/templates/secret/elastic-admin.yaml @@ -0,0 +1,11 @@ +{{- if .Values.secrets.elasticAdmin.data }} +apiVersion: v1 +kind: Secret +metadata: + name: elastic-admin-{{ .Values.deployEnv }} + labels: + deployEnv: "{{ .Values.deployEnv }}" + {{- include "sefaria.labels" . | nindent 4 }} +type: Opaque +stringData: {{ .Values.secrets.elasticAdmin.data | toYaml | nindent 2 }} +{{- end }} diff --git a/helm-chart/sefaria-project/templates/secret/elastic-user.yaml b/helm-chart/sefaria-project/templates/secret/elastic-user.yaml new file mode 100644 index 0000000000..511d271a26 --- /dev/null +++ b/helm-chart/sefaria-project/templates/secret/elastic-user.yaml @@ -0,0 +1,11 @@ +{{- if .Values.secrets.elasticUser.data }} +apiVersion: v1 +kind: Secret +metadata: + name: elastic-user-{{ .Values.deployEnv }} + labels: + deployEnv: "{{ .Values.deployEnv }}" + {{- include "sefaria.labels" . | nindent 4 }} +type: Opaque +stringData: {{ .Values.secrets.elasticUser.data | toYaml | nindent 2 }} +{{- end }} diff --git a/helm-chart/sefaria-project/values.yaml b/helm-chart/sefaria-project/values.yaml index b05cda74b5..69b69f4d91 100644 --- a/helm-chart/sefaria-project/values.yaml +++ b/helm-chart/sefaria-project/values.yaml @@ -329,9 +329,7 @@ secrets: # SEFARIA_DB: # SEFARIA_DB_USER: # SEFARIA_DB_PASSWORD: - # SEARCH_ADMIN_USER: - # SEARCH_ADMIN_PW: - # SEARCH_ADMIN_K8S: + # SEARCH_URL # TURN_SECRET: # TURN_USER: # SEFARIA_BOT_API_KEY: @@ -372,6 +370,16 @@ secrets: # should be commented out and vice-versa. ref: trello-secret # data: + elasticUser: + # If you're using a reference to an existing secret then the data: section + # should be commented out and vice-versa. + ref: elastic-user + # data: + elasticAdmin: + # If you're using a reference to an existing secret then the data: section + # should be commented out and vice-versa. 
+ ref: elastic-admin + # data: # Settings for various cronjobs diff --git a/reader/views.py b/reader/views.py index ccd75c2dc4..a25c76a343 100644 --- a/reader/views.py +++ b/reader/views.py @@ -48,7 +48,7 @@ from sefaria.utils.util import text_preview, short_to_long_lang_code, epoch_time from sefaria.utils.hebrew import hebrew_term, has_hebrew from sefaria.utils.calendars import get_all_calendar_items, get_todays_calendar_items, get_keyed_calendar_items, get_parasha, get_todays_parasha -from sefaria.settings import STATIC_URL, USE_VARNISH, USE_NODE, NODE_HOST, DOMAIN_LANGUAGES, MULTISERVER_ENABLED, SEARCH_ADMIN, MULTISERVER_REDIS_SERVER, \ +from sefaria.settings import STATIC_URL, USE_VARNISH, USE_NODE, NODE_HOST, DOMAIN_LANGUAGES, MULTISERVER_ENABLED, MULTISERVER_REDIS_SERVER, \ MULTISERVER_REDIS_PORT, MULTISERVER_REDIS_DB, DISABLE_AUTOCOMPLETER, ENABLE_LINKER from sefaria.site.site_settings import SITE_SETTINGS from sefaria.system.multiserver.coordinator import server_coordinator @@ -56,7 +56,7 @@ from sefaria.system.exceptions import InputError, PartialRefInputError, BookNameError, NoVersionFoundError, DictionaryEntryNotFoundError from sefaria.system.cache import django_cache from sefaria.system.database import db -from sefaria.helper.search import get_query_obj, get_es_server_url +from sefaria.helper.search import get_query_obj from sefaria.helper.crm.crm_mediator import CrmMediator from sefaria.search import get_search_categories from sefaria.helper.topic import get_topic, get_all_topics, get_topics_for_ref, get_topics_for_book, \ @@ -4219,19 +4219,29 @@ def dummy_search_api(request): @csrf_exempt -def search_wrapper_api(request): +def search_wrapper_api(request, es6_compat=False): + """ + @param request: + @param es6_compat: True to return API response that's compatible with an Elasticsearch 6 compatible client + @return: + """ + from sefaria.helper.search import get_elasticsearch_client + if request.method == "POST": if "json" in request.POST: j = request.POST.get("json") # using form-urlencoded else: j = request.body # using content-type: application/json j = json.loads(j) - es_client = Elasticsearch(get_es_server_url(admin=True)) + es_client = get_elasticsearch_client() search_obj = Search(using=es_client, index=j.get("type")).params(request_timeout=5) search_obj = get_query_obj(search_obj=search_obj, **j) response = search_obj.execute() if response.success(): - return jsonResponse(response.to_dict(), callback=request.GET.get("callback", None)) + response_json = getattr(response.to_dict(), 'body', response.to_dict()) + if es6_compat and isinstance(response_json['hits']['total'], dict): + response_json['hits']['total'] = response_json['hits']['total']['value'] + return jsonResponse(response_json, callback=request.GET.get("callback", None)) return jsonResponse({"error": "Error with connection to Elasticsearch. 
Total shards: {}, Shards successful: {}, Timed out: {}".format(response._shards.total, response._shards.successful, response.timed_out)}, callback=request.GET.get("callback", None)) return jsonResponse({"error": "Unsupported HTTP method."}, callback=request.GET.get("callback", None)) diff --git a/scripts/reindex_elasticsearch_cronjob_ES6.py b/scripts/reindex_elasticsearch_cronjob_ES6.py new file mode 100644 index 0000000000..1a3f181eb2 --- /dev/null +++ b/scripts/reindex_elasticsearch_cronjob_ES6.py @@ -0,0 +1,49 @@ +""" +This file is meant to be temporary while we are migrating to elasticsearch 8 +""" +from datetime import datetime +import requests +import traceback +import os +import django +django.setup() +from sefaria.model import * +from sefaria.search_ES6 import index_all +from sefaria.local_settings import SEFARIA_BOT_API_KEY +from sefaria.pagesheetrank import update_pagesheetrank + +""" +Source sheets added after last_sheet_timestamp will be missing from the index process. We want to manually index all +source sheets created after this. Depending on the database being used to index the timestamp will be different. If +running against a production database, last_sheet_timestamp will be the time this script began running. Otherwise, this +value will need to be set to the time at which the last mongo dump was created (assuming the database is using the most +up-to-date mongo dump). +""" +# last_sheet_timestamp = datetime.fromtimestamp(os.path.getmtime("/var/data/sefaria_public/dump/sefaria")).isoformat() +try: + last_sheet_timestamp = datetime.now().isoformat() + update_pagesheetrank() + index_all() + r = requests.post("https://www.sefaria.org/admin/index-sheets-by-timestamp", data={"timestamp": last_sheet_timestamp, "apikey": SEFARIA_BOT_API_KEY}) + if "error" in r.text: + raise Exception("Error when calling admin/index-sheets-by-timestamp API: " + r.text) + else: + print("SUCCESS!", r.text) +except Exception as e: + tb_str = traceback.format_exc() + print("Caught exception") + post_object = { + "icon_emoji": ":facepalm:", + "username": "Reindex ElasticSearch", + "channel": "#engineering-discuss", + "attachments": [ + { + "fallback": tb_str, + "color": "#a30200", + "pretext": "Cronjob Error", + "text": tb_str + } + ] + } + requests.post(os.environ['SLACK_URL'], json=post_object) + raise e diff --git a/sefaria/helper/search.py b/sefaria/helper/search.py index 8fad3a3676..5c37c118ac 100644 --- a/sefaria/helper/search.py +++ b/sefaria/helper/search.py @@ -146,16 +146,7 @@ def make_filter(type, agg_type, agg_key): return Term(**{agg_type: agg_key}) -def get_es_server_url(admin=False): - from sefaria.settings import SEARCH_ADMIN, SEARCH_ADMIN_PW, SEARCH_ADMIN_USER - base_url = SEARCH_ADMIN # if admin else SEARCH_NON_ADMIN # should have option for SEARCH_NON_ADMIN but need to add to local settings - if SEARCH_ADMIN_USER: - match = re.search(r'^(https?://)(.*)$', base_url) - if match: - http, base_url = match.group(1), match.group(2) - else: - http, base_url = "http://", base_url - es_url = f"{http}{SEARCH_ADMIN_USER}:{SEARCH_ADMIN_PW}@{base_url}" - else: - es_url = base_url - return es_url +def get_elasticsearch_client(): + from elasticsearch import Elasticsearch + from sefaria.settings import SEARCH_URL + return Elasticsearch(SEARCH_URL) diff --git a/sefaria/helper/tests/normalization_tests.py b/sefaria/helper/tests/normalization_tests.py index 59e4ed7767..eaff8ff116 100644 --- a/sefaria/helper/tests/normalization_tests.py +++ b/sefaria/helper/tests/normalization_tests.py @@ -102,6 
+102,21 @@ def test_nested_itag(): assert text[s:e] == """bullnestedThe.""" +def test_two_steps_normalization(): + test_string = ' This is a {{test}}' + + bracket_normalizer = RegexNormalizer(r'\{\{|}}', r'') + strip_normalizer = RegexNormalizer(r'^\s*|\s*$', r'') + normalizer = NormalizerComposer(steps=[bracket_normalizer, strip_normalizer]) + + mapping = normalizer.get_mapping_after_normalization(test_string, reverse=True) + assert mapping == {0: 1, 11: 3, 17: 5} + orig_inds = [(13, 17)] + new_start, new_end = normalizer.convert_normalized_indices_to_unnormalized_indices(orig_inds, mapping, reverse=True)[0] + normalized_string = normalizer.normalize(test_string) + assert normalized_string[new_start:new_end] == "test" + + def test_word_to_char(): test_string = 'some words go here\n\nhello world' words = ['go', 'here', 'hello'] diff --git a/sefaria/local_settings_ci.py b/sefaria/local_settings_ci.py index de5d56d849..d542f0b58a 100644 --- a/sefaria/local_settings_ci.py +++ b/sefaria/local_settings_ci.py @@ -69,7 +69,7 @@ APSCHEDULER_NAME = "apscheduler" # ElasticSearch server -SEARCH_ADMIN = "http://localhost:9200" +SEARCH_URL = "http://localhost:9200" SEARCH_INDEX_ON_SAVE = False # Whether to send texts and source sheet to Search Host for indexing after save SEARCH_INDEX_NAME_TEXT = 'text' # name of the ElasticSearch index to use SEARCH_INDEX_NAME_SHEET = 'sheet' diff --git a/sefaria/local_settings_example.py b/sefaria/local_settings_example.py index 4b94d46c9b..268f30366f 100644 --- a/sefaria/local_settings_example.py +++ b/sefaria/local_settings_example.py @@ -156,10 +156,11 @@ # ElasticSearch server -SEARCH_ADMIN = "localhost:9200" # URL to connect to internal ES server for admin access. Leave off https:// -SEARCH_CLIENT = "http://localhost:9200/api/search" # URL to connect to ES for searching. Can be /api/search Django endpoint which gets proxied to ES server. -SEARCH_ADMIN_PW = None # Optional password to connect to ES server. If no password, leave as `None` -SEARCH_ADMIN_USER = None # Optional user to connect to ES server. If no user, leave as `None` +# URL to connect to ES server. +# Set this to https://sefaria.org/api/search to connect to production search. 
+# If ElasticSearch server has a password use the following format: http(s)://{username}:{password}@{base_url} +SEARCH_URL = "http://localhost:9200" + SEARCH_INDEX_ON_SAVE = False # Whether to send texts and source sheet to Search Host for indexing after save SEARCH_INDEX_NAME_TEXT = 'text' # name of the ElasticSearch index to use SEARCH_INDEX_NAME_SHEET = 'sheet' diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 75397f1280..6926676563 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -169,6 +169,11 @@ def test_resolved_raw_ref_clone(): [crrd(['@ד"א זוטא', '@פרק השלום', '#ג']), ["Tractate Derekh Eretz Zuta, Section on Peace 3"]], [crrd(['@ספר החינוך', '@לך לך', '#ב']), ['Sefer HaChinukh 2']], [crrd(['@ספר החינוך', '#ב']), ['Sefer HaChinukh 2']], + [crrd(['@מורה נבוכים', '#ב', '#מה']), ['Guide for the Perplexed, Part 2:45']], + + # specific books - mishneh torah + [crrd(['@הלכות עבודה זרה', '#ב', '#ג']), ['Mishneh Torah, Foreign Worship and Customs of the Nations 2:3']], + [crrd(['@רמב"ם', '@הלכות רוצח', '#פרק א', '#הלכה י"ד']), ['Mishneh Torah, Murderer and the Preservation of Life 1:14']], #ben yehuda project [crrd(["@בראש'", '#א', '#ב']), ["Genesis 1:2"]], diff --git a/sefaria/search.py b/sefaria/search.py index 8844a3d6a7..805aba1dd0 100644 --- a/sefaria/search.py +++ b/sefaria/search.py @@ -19,7 +19,6 @@ import time as pytime logger = structlog.get_logger(__name__) -from elasticsearch import Elasticsearch from elasticsearch.client import IndicesClient from elasticsearch.helpers import bulk from elasticsearch.exceptions import NotFoundError @@ -30,13 +29,13 @@ from sefaria.system.database import db from sefaria.system.exceptions import InputError from sefaria.utils.util import strip_tags -from sefaria.helper.search import get_es_server_url -from .settings import SEARCH_ADMIN, SEARCH_INDEX_NAME_TEXT, SEARCH_INDEX_NAME_SHEET, STATICFILES_DIRS +from .settings import SEARCH_INDEX_NAME_TEXT, SEARCH_INDEX_NAME_SHEET +from sefaria.helper.search import get_elasticsearch_client from sefaria.site.site_settings import SITE_SETTINGS from sefaria.utils.hebrew import strip_cantillation import sefaria.model.queue as qu -es_client = Elasticsearch(get_es_server_url(admin=True)) +es_client = get_elasticsearch_client() index_client = IndicesClient(es_client) tracer = structlog.get_logger(__name__) diff --git a/sefaria/search_ES6.py b/sefaria/search_ES6.py new file mode 100644 index 0000000000..812610eb07 --- /dev/null +++ b/sefaria/search_ES6.py @@ -0,0 +1,844 @@ +# -*- coding: utf-8 -*- +""" +This file is meant to be temporary while we are migrating to elasticsearch 8 + +search.py - full-text search for Sefaria using ElasticSearch + +Writes to MongoDB Collection: index_queue +""" +import os +from datetime import datetime, timedelta +import re +import bleach +import pymongo + +# To allow these files to be run directly from command line (w/o Django shell) +os.environ['DJANGO_SETTINGS_MODULE'] = "settings" + +import structlog +import logging +from logging import NullHandler +from collections import defaultdict +import time as pytime +logger = structlog.get_logger(__name__) + +from elasticsearch import Elasticsearch +from elasticsearch.client import IndicesClient +from elasticsearch.helpers import bulk +from elasticsearch.exceptions import NotFoundError +from sefaria.model import * +from sefaria.model.text import AbstractIndex, AbstractTextRecord +from sefaria.model.user_profile import user_link, 
public_user_data +from sefaria.model.collection import CollectionSet +from sefaria.system.database import db +from sefaria.system.exceptions import InputError +from sefaria.utils.util import strip_tags +from .settings import SEARCH_URL, SEARCH_INDEX_NAME_TEXT, SEARCH_INDEX_NAME_SHEET, STATICFILES_DIRS +from sefaria.site.site_settings import SITE_SETTINGS +from sefaria.utils.hebrew import strip_cantillation +import sefaria.model.queue as qu + +es_client = Elasticsearch(SEARCH_URL) +index_client = IndicesClient(es_client) + +tracer = structlog.get_logger(__name__) +tracer.setLevel(logging.CRITICAL) +#tracer.addHandler(logging.FileHandler('/tmp/es_trace.log')) +tracer.addHandler(NullHandler()) + +doc_count = 0 + + +def delete_text(oref, version, lang): + try: + curr_index = get_new_and_current_index_names('text')['current'] + + id = make_text_doc_id(oref.normal(), version, lang) + es_client.delete(index=curr_index, doc_type='text', id=id) + except Exception as e: + logger.error("ERROR deleting {} / {} / {} : {}".format(oref.normal(), version, lang, e)) + + +def delete_version(index, version, lang): + assert isinstance(index, AbstractIndex) + + refs = [] + + if SITE_SETTINGS["TORAH_SPECIFIC"]: + all_gemara_indexes = library.get_indexes_in_category("Bavli") + davidson_indexes = all_gemara_indexes[:all_gemara_indexes.index("Horayot") + 1] + if Ref(index.title).is_bavli() and index.title not in davidson_indexes: + refs += index.all_section_refs() + + refs += index.all_segment_refs() + + for ref in refs: + delete_text(ref, version, lang) + + +def delete_sheet(index_name, id): + try: + es_client.delete(index=index_name, doc_type='sheet', id=id) + except Exception as e: + logger.error("ERROR deleting sheet {}".format(id)) + + +def make_text_doc_id(ref, version, lang): + """ + Returns a doc id string for indexing based on ref, version and lang. + + [HACK] Since Elasticsearch chokes on non-ascii ids, hebrew titles are converted + into a number using unicode_number. This mapping should be unique, but actually isn't. + (any tips welcome) + """ + if not version.isascii(): + version = str(unicode_number(version)) + + id = "%s (%s [%s])" % (ref, version, lang) + return id + + +def unicode_number(u): + """ + Returns a number corresponding to the sum value + of each unicode character in u + """ + n = 0 + for i in range(len(u)): + n += ord(u[i]) + return n + + +def index_sheet(index_name, id): + """ + Index source sheet with 'id'.
+ """ + + sheet = db.sheets.find_one({"id": id}) + if not sheet: return False + + pud = public_user_data(sheet["owner"]) + tag_terms_simple = make_sheet_tags(sheet) + tags = [t["en"] for t in tag_terms_simple] + topics = [] + for t in sheet.get('topics', []): + topic_obj = Topic.init(t['slug']) + if not topic_obj: + continue + topics += [topic_obj] + collections = CollectionSet({"sheets": id, "listed": True}) + collection_names = [c.name for c in collections] + try: + doc = { + "title": strip_tags(sheet["title"]), + "content": make_sheet_text(sheet, pud), + "owner_id": sheet["owner"], + "owner_name": pud["name"], + "owner_image": pud["imageUrl"], + "profile_url": pud["profileUrl"], + "version": "Source Sheet by " + user_link(sheet["owner"]), + "tags": tags, + "topic_slugs": [topic_obj.slug for topic_obj in topics], + "topics_en": [topic_obj.get_primary_title('en') for topic_obj in topics], + "topics_he": [topic_obj.get_primary_title('he') for topic_obj in topics], + "sheetId": id, + "summary": sheet.get("summary", None), + "collections": collection_names, + "datePublished": sheet.get("datePublished", None), + "dateCreated": sheet.get("dateCreated", None), + "dateModified": sheet.get("dateModified", None), + "views": sheet.get("views", 0) + } + es_client.create(index=index_name, doc_type='sheet', id=id, body=doc) + global doc_count + doc_count += 1 + return True + except Exception as e: + print("Error indexing sheet %d" % id) + print(e) + return False + + +def make_sheet_tags(sheet): + def get_primary_title(lang, titles): + return [t for t in titles if t.get("primary") and t.get("lang", "") == lang][0]["text"] + + tags = sheet.get('tags', []) + tag_terms = [(Term().load({'name': t}) or Term().load_by_title(t)) for t in tags] + tag_terms_simple = [ + { + 'en': tags[iterm], # save as en even if it's Hebrew + 'he': '' + } if term is None else + { + 'en': get_primary_title('en', term.titles), + 'he': get_primary_title('he', term.titles) + } for iterm, term in enumerate(tag_terms) + ] + #tags_en, tags_he = zip(*tag_terms_simple.values()) + return tag_terms_simple + +def make_sheet_text(sheet, pud): + """ + Returns a plain text representation of the content of sheet. + :param sheet: The sheet record + :param pud: Public User Database record for the author + """ + text = sheet["title"] + "\n{}".format(sheet.get("summary", '')) + if pud.get("name"): + text += "\nBy: " + pud["name"] + text += "\n" + if sheet.get("tags"): + text += " [" + ", ".join(sheet["tags"]) + "]\n" + for s in sheet["sources"]: + text += source_text(s) + " " + + text = bleach.clean(text, strip=True, tags=()) + + return text + + +def source_text(source): + """ + Recursive function to translate a source dictionary into text. 
+ """ + str_fields = ["customTitle", "ref", "comment", "outsideText"] + dict_fields = ["text", "outsideBiText"] + content = [source.get(field, "") for field in str_fields] + content += [val for field in dict_fields for val in source.get(field, {}).values()] + text = " ".join([strip_tags(c) for c in content]) + + if "subsources" in source: + for s in source["subsources"]: + text += source_text(s) + + return text + + +def get_exact_english_analyzer(): + return { + "tokenizer": "standard", + "char_filter": [ + "icu_normalizer", + ], + "filter": [ + "standard", + "lowercase", + "icu_folding", + ], + } + + +def get_stemmed_english_analyzer(): + stemmed_english_analyzer = get_exact_english_analyzer() + stemmed_english_analyzer['filter'] += ["my_snow"] + return stemmed_english_analyzer + + +def create_index(index_name, type): + """ + Clears the indexes and creates it fresh with the below settings. + """ + clear_index(index_name) + + settings = { + "index": { + "blocks": { + "read_only_allow_delete": False + }, + "analysis": { + "analyzer": { + "stemmed_english": get_stemmed_english_analyzer(), + "exact_english": get_exact_english_analyzer(), + }, + "filter": { + "my_snow": { + "type": "snowball", + "language": "English" + } + } + } + } + } + print('Creating index {}'.format(index_name)) + index_client.create(index=index_name, body=settings) + + if type == 'text': + put_text_mapping(index_name) + elif type == 'sheet': + put_sheet_mapping(index_name) + + +def put_text_mapping(index_name): + """ + Settings mapping for the text document type. + """ + text_mapping = { + 'properties' : { + 'categories': { + 'type': 'keyword', + }, + "category": { + 'type': 'keyword', + }, + "he_category": { + 'type': 'keyword', + }, + "index_title": { + 'type': 'keyword', + }, + "path": { + 'type': 'keyword', + }, + "he_index_title": { + 'type': 'keyword', + }, + "he_path": { + 'type': 'keyword', + }, + "order": { + 'type': 'keyword', + }, + "pagesheetrank": { + 'type': 'double', + 'index': False + }, + "comp_date": { + 'type': 'integer', + 'index': False + }, + "version_priority": { + 'type': 'integer', + 'index': False + }, + "exact": { + 'type': 'text', + 'analyzer': 'exact_english' + }, + "naive_lemmatizer": { + 'type': 'text', + 'analyzer': 'sefaria-naive-lemmatizer', + 'search_analyzer': 'sefaria-naive-lemmatizer-less-prefixes', + 'fields': { + 'exact': { + 'type': 'text', + 'analyzer': 'exact_english' + } + } + } + } + } + index_client.put_mapping(doc_type='text', body=text_mapping, index=index_name) + + +def put_sheet_mapping(index_name): + """ + Sets mapping for the sheets document type. 
+ """ + sheet_mapping = { + 'properties': { + 'owner_name': { + 'type': 'keyword' + }, + 'tags': { + 'type': 'keyword' + }, + "topics_en": { + "type": "keyword" + }, + "topics_he": { + "type": "keyword" + }, + "topic_slugs": { + "type": "keyword" + }, + 'owner_image': { + 'type': 'keyword' + }, + 'datePublished': { + 'type': 'date' + }, + 'dateCreated': { + 'type': 'date' + }, + 'dateModified': { + 'type': 'date' + }, + 'sheetId': { + 'type': 'integer' + }, + 'collections': { + 'type': 'keyword' + }, + 'title': { + 'type': 'keyword' + }, + 'views': { + 'type': 'integer' + }, + 'summary': { + 'type': 'keyword' + }, + 'content': { + 'type': 'text', + 'analyzer': 'stemmed_english' + }, + 'version': { + 'type': 'keyword' + }, + 'profile_url': { + 'type': 'keyword' + }, + 'owner_id': { + 'type': 'integer' + } + } + } + index_client.put_mapping(doc_type='sheet', body=sheet_mapping, index=index_name) + +def get_search_categories(oref, categories): + toc_tree = library.get_toc_tree() + cats = oref.index.categories + + indexed_categories = categories # the default + + # get the full path of every cat along the way. + # starting w/ the longest, + # check if they're root swapped. + paths = [cats[:i] for i in range(len(cats), 0, -1)] + for path in paths: + cnode = toc_tree.lookup(path) + if getattr(cnode, "searchRoot", None) is not None: + # Use the specified searchRoot, with the rest of the category path appended. + indexed_categories = [cnode.searchRoot] + cats[len(path) - 1:] + break + return indexed_categories + + +class TextIndexer(object): + + @classmethod + def clear_cache(cls): + cls.terms_dict = None + cls.version_priority_map = None + cls._bulk_actions = None + cls.best_time_period = None + + + @classmethod + def create_terms_dict(cls): + cls.terms_dict = {} + ts = TermSet() + for t in ts: + cls.terms_dict[t.name] = t.contents() + + @classmethod + def create_version_priority_map(cls): + toc = library.get_toc() + cls.version_priority_map = {} + + def traverse(mini_toc): + if type(mini_toc) == list: + for t in mini_toc: + traverse(t) + elif "contents" in mini_toc: + for t in mini_toc["contents"]: + traverse(t) + elif "title" in mini_toc and not mini_toc.get("isCollection", False): + title = mini_toc["title"] + try: + r = Ref(title) + except InputError: + print("Failed to parse ref, {}".format(title)) + return + vlist = cls.get_ref_version_list(r) + vpriorities = defaultdict(lambda: 0) + for i, v in enumerate(vlist): + lang = v.language + cls.version_priority_map[(title, v.versionTitle, lang)] = (vpriorities[lang], mini_toc["categories"]) + vpriorities[lang] += 1 + + traverse(toc) + + @staticmethod + def get_ref_version_list(oref, tries=0): + try: + return oref.index.versionSet().array() + except InputError as e: + print(f"InputError: {oref.normal()}") + return [] + except pymongo.errors.AutoReconnect as e: + if tries < 200: + pytime.sleep(5) + return TextIndexer.get_ref_version_list(oref, tries+1) + else: + print("get_ref_version_list -- Tried: {} times. 
Failed :(".format(tries)) + raise e + + @classmethod + def get_all_versions(cls, tries=0, versions=None, page=0): + versions = versions or [] + try: + version_limit = 10 + temp_versions = [] + first_run = True + while first_run or len(temp_versions) > 0: + temp_versions = VersionSet(limit=version_limit, page=page).array() + versions += temp_versions + page += 1 + first_run = False + return versions + except pymongo.errors.AutoReconnect as e: + if tries < 200: + pytime.sleep(5) + return cls.get_all_versions(tries+1, versions, page) + else: + print("Tried: {} times. Got {} versions".format(tries, len(versions))) + raise e + + @classmethod + def index_all(cls, index_name, debug=False, for_es=True, action=None): + cls.index_name = index_name + cls.create_version_priority_map() + cls.create_terms_dict() + Ref.clear_cache() # try to clear Ref cache to save RAM + + versions = sorted([x for x in cls.get_all_versions() if (x.title, x.versionTitle, x.language) in cls.version_priority_map], key=lambda x: cls.version_priority_map[(x.title, x.versionTitle, x.language)][0]) + versions_by_index = {} + # organizing by index for the merged case. There is no longer a merged case but keeping this logic b/c it seems fine + for v in versions: + key = (v.title, v.language) + if key in versions_by_index: + versions_by_index[key] += [v] + else: + versions_by_index[key] = [v] + print("Beginning index of {} versions.".format(len(versions))) + vcount = 0 + total_versions = len(versions) + versions = None # release RAM + for title, vlist in list(versions_by_index.items()): + cls.curr_index = vlist[0].get_index() if len(vlist) > 0 else None + if for_es: + cls._bulk_actions = [] + try: + cls.best_time_period = cls.curr_index.best_time_period() + except ValueError: + cls.best_time_period = None + for v in vlist: + if v.versionTitle == "Yehoyesh's Yiddish Tanakh Translation [yi]": + print("skipping yiddish. we don't like yiddish") + continue + + cls.index_version(v, action=action) + print("Indexed Version {}/{}".format(vcount, total_versions)) + vcount += 1 + if for_es: + bulk(es_client, cls._bulk_actions, stats_only=True, raise_on_error=False) + + @classmethod + def index_version(cls, version, tries=0, action=None): + if not action: + action = cls._cache_action + try: + version.walk_thru_contents(action, heTref=cls.curr_index.get_title('he'), schema=cls.curr_index.schema, terms_dict=cls.terms_dict) + except pymongo.errors.AutoReconnect as e: + # Adding this because there is a mongo call for dictionary words in walk_thru_contents() + if tries < 200: + pytime.sleep(5) + print("Retrying {}. Try {}".format(version.title, tries)) + cls.index_version(version, tries+1) + else: + print("Tried {} times to get {}. 
I have failed you...".format(tries, version.title)) + raise e + except StopIteration: + print("Could not find dictionary node in {}".format(version.title)) + + @classmethod + def index_ref(cls, index_name, oref, version_title, lang): + # slower than `cls.index_version` but useful when you don't want the overhead of loading all versions into cache + cls.index_name = index_name + cls.curr_index = oref.index + try: + cls.best_time_period = cls.curr_index.best_time_period() + except ValueError: + cls.best_time_period = None + version_priority = 0 + hebrew_version_title = None + for priority, v in enumerate(cls.get_ref_version_list(oref)): + if v.versionTitle == version_title: + version_priority = priority + hebrew_version_title = getattr(v, 'versionTitleInHebrew', None) + content = TextChunk(oref, lang, vtitle=version_title).ja().flatten_to_string() + categories = cls.curr_index.categories + tref = oref.normal() + doc = cls.make_text_index_document(tref, oref.he_normal(), version_title, lang, version_priority, content, categories, hebrew_version_title) + id = make_text_doc_id(tref, version_title, lang) + es_client.index(index_name, doc, id=id) + + @classmethod + def _cache_action(cls, segment_str, tref, heTref, version): + # Index this document as a whole + vtitle = version.versionTitle + vlang = version.language + hebrew_version_title = getattr(version, 'versionTitleInHebrew', None) + try: + version_priority, categories = cls.version_priority_map[(version.title, vtitle, vlang)] + #TODO include sgement_str in this func + doc = cls.make_text_index_document(tref, heTref, vtitle, vlang, version_priority, segment_str, categories, hebrew_version_title) + # print doc + except Exception as e: + logger.error("Error making index document {} / {} / {} : {}".format(tref, vtitle, vlang, str(e))) + return + + if doc: + try: + cls._bulk_actions += [ + { + "_index": cls.index_name, + "_type": "text", + "_id": make_text_doc_id(tref, vtitle, vlang), + "_source": doc + } + ] + except Exception as e: + logger.error("ERROR indexing {} / {} / {} : {}".format(tref, vtitle, vlang, e)) + + @classmethod + def remove_footnotes(cls, content): + ftnotes = AbstractTextRecord.find_all_itags(content, only_footnotes=True)[1] + if len(ftnotes) == 0: + return content + else: + for sup_tag in ftnotes: + i_tag = sup_tag.next_sibling + content += f" {sup_tag.text} {i_tag.text}" + content = AbstractTextRecord.strip_itags(content) + return content + + @classmethod + def modify_text_in_doc(cls, content): + content = AbstractTextRecord.strip_imgs(content) + content = cls.remove_footnotes(content) + content = strip_cantillation(content, strip_vowels=False).strip() + content = re.sub(r'<[^>]+>', ' ', content) # replace HTML tags with space so that words dont get smushed together + content = re.sub(r'\([^)]+\)', ' ', content) # remove all parens + while " " in content: # make sure there are not many spaces in a row + content = content.replace(" ", " ") + return content + + @classmethod + def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories, hebrew_version_title): + """ + Create a document for indexing from the text specified by ref/version/lang + """ + # Don't bother indexing if there's no content + if not content: + return False + content = cls.modify_text_in_doc(content) + if len(content) == 0: + return False + + oref = Ref(tref) + + indexed_categories = get_search_categories(oref, categories) + + tp = cls.best_time_period + if tp is not None: + comp_start_date = int(tp.start) + else: + 
comp_start_date = 3000 # far in the future + + ref_data = RefData().load({"ref": tref}) + pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGESHEETRANK + + return { + "ref": tref, + "heRef": heTref, + "version": version, + "lang": lang, + "version_priority": version_priority if version_priority is not None else 1000, + "titleVariants": oref.index_node.all_tree_titles("en"), + "categories": indexed_categories, + "order": oref.order_id(), + "path": "/".join(indexed_categories + [cls.curr_index.title]), + "pagesheetrank": pagesheetrank, + "comp_date": comp_start_date, + #"hebmorph_semi_exact": content, + "exact": content, + "naive_lemmatizer": content, + 'hebrew_version_title': hebrew_version_title, + } + + +def index_sheets_by_timestamp(timestamp): + """ + :param timestamp str: index all sheets modified after `timestamp` (in isoformat) + """ + + name_dict = get_new_and_current_index_names('sheet', debug=False) + curr_index_name = name_dict['current'] + try: + ids = db.sheets.find({"status": "public", "dateModified": {"$gt": timestamp}}).distinct("id") + except Exception as e: + print(e) + return str(e) + + succeeded = [] + failed = [] + + for id in ids: + did_succeed = index_sheet(curr_index_name, id) + if did_succeed: + succeeded += [id] + else: + failed += [id] + + return {"succeeded": {"num": len(succeeded), "ids": succeeded}, "failed": {"num": len(failed), "ids": failed}} + + +def index_public_sheets(index_name): + """ + Index all source sheets that are publicly listed. + """ + ids = db.sheets.find({"status": "public"}).distinct("id") + for id in ids: + index_sheet(index_name, id) + + +def index_public_notes(): + """ + Index all public notes. + + TODO + """ + pass + + +def clear_index(index_name): + """ + Delete the search index. + """ + try: + index_client.delete(index=index_name) + except Exception as e: + print("Error deleting Elasticsearch Index named %s" % index_name) + print(e) + + +def add_ref_to_index_queue(ref, version, lang): + """ + Adds a text to index queue to be indexed later. + """ + qu.IndexQueue({ + "ref": ref, + "lang": lang, + "version": version, + "type": "ref", + }).save() + + return True + + +def index_from_queue(): + """ + Index every ref/version/lang found in the index queue. + Delete queue records on success. + """ + index_name = get_new_and_current_index_names('text')['current'] + queue = db.index_queue.find() + for item in queue: + try: + TextIndexer.index_ref(index_name, Ref(item["ref"]), item["version"], item["lang"], False) + db.index_queue.remove(item) + except Exception as e: + logging.error("Error indexing from queue ({} / {} / {}) : {}".format(item["ref"], item["version"], item["lang"], e)) + + +def add_recent_to_queue(ndays): + """ + Look through the last ndays of the activitiy log, + add to the index queue any refs that had their text altered. 
+ """ + cutoff = datetime.now() - timedelta(days=ndays) + query = { + "date": {"$gt": cutoff}, + "rev_type": {"$in": ["add text", "edit text"]} + } + activity = db.history.find(query) + refs = set() + for a in activity: + refs.add((a["ref"], a["version"], a["language"])) + for ref in list(refs): + add_ref_to_index_queue(ref[0], ref[1], ref[2]) + + +def get_new_and_current_index_names(type, debug=False): + base_index_name_dict = { + 'text': SEARCH_INDEX_NAME_TEXT, + 'sheet': SEARCH_INDEX_NAME_SHEET, + } + index_name_a = "{}-a{}".format(base_index_name_dict[type], '-debug' if debug else '') + index_name_b = "{}-b{}".format(base_index_name_dict[type], '-debug' if debug else '') + alias_name = "{}{}".format(base_index_name_dict[type], '-debug' if debug else '') + aliases = index_client.get_alias() + try: + a_alias = aliases[index_name_a]['aliases'] + choose_a = alias_name not in a_alias + except KeyError: + choose_a = True + + if choose_a: + new_index_name = index_name_a + old_index_name = index_name_b + else: + new_index_name = index_name_b + old_index_name = index_name_a + return {"new": new_index_name, "current": old_index_name, "alias": alias_name} + + +def index_all(skip=0, debug=False): + """ + Fully create the search index from scratch. + """ + start = datetime.now() + index_all_of_type('text', skip=skip, debug=debug) + index_all_of_type('sheet', skip=skip, debug=debug) + end = datetime.now() + db.index_queue.delete_many({}) # index queue is now stale + print("Elapsed time: %s" % str(end-start)) + + +def index_all_of_type(type, skip=0, debug=False): + index_names_dict = get_new_and_current_index_names(type=type, debug=debug) + print('CREATING / DELETING {}'.format(index_names_dict['new'])) + print('CURRENT {}'.format(index_names_dict['current'])) + for i in range(10): + print('STARTING IN T-MINUS {}'.format(10 - i)) + pytime.sleep(1) + + index_all_of_type_by_index_name(type, index_names_dict['new'], skip, debug) + + try: + #index_client.put_settings(index=index_names_dict['current'], body={"index": { "blocks": { "read_only_allow_delete": False }}}) + index_client.delete_alias(index=index_names_dict['current'], name=index_names_dict['alias']) + print("Successfully deleted alias {} for index {}".format(index_names_dict['alias'], index_names_dict['current'])) + except NotFoundError: + print("Failed to delete alias {} for index {}".format(index_names_dict['alias'], index_names_dict['current'])) + + clear_index(index_names_dict['alias']) # make sure there are no indexes with the alias_name + + #index_client.put_settings(index=index_names_dict['new'], body={"index": { "blocks": { "read_only_allow_delete": False }}}) + index_client.put_alias(index=index_names_dict['new'], name=index_names_dict['alias']) + + if index_names_dict['new'] != index_names_dict['current']: + clear_index(index_names_dict['current']) + + +def index_all_of_type_by_index_name(type, index_name, skip=0, debug=False): + if skip == 0: + create_index(index_name, type) + if type == 'text': + TextIndexer.clear_cache() + TextIndexer.index_all(index_name, debug=debug) + elif type == 'sheet': + index_public_sheets(index_name) \ No newline at end of file diff --git a/sefaria/urls.py b/sefaria/urls.py index 8ab6a546b8..5952471ec5 100644 --- a/sefaria/urls.py +++ b/sefaria/urls.py @@ -238,7 +238,9 @@ # Search API urlpatterns += [ url(r'^api/dummy-search$', reader_views.dummy_search_api), - url(r'^api/search-wrapper$', reader_views.search_wrapper_api), + url(r'^api/search-wrapper/es6$', reader_views.search_wrapper_api, {'es6_compat': 
True}), + url(r'^api/search-wrapper/es8$', reader_views.search_wrapper_api), + url(r'^api/search-wrapper$', reader_views.search_wrapper_api, {'es6_compat': True}), url(r'^api/search-path-filter/(?P.+)$', reader_views.search_path_filter), ] diff --git a/sites/s4d/site_settings.py b/sites/s4d/site_settings.py index 61af8782b4..5005a7f15d 100644 --- a/sites/s4d/site_settings.py +++ b/sites/s4d/site_settings.py @@ -18,5 +18,6 @@ "COLLECTIONS_BUCKET": "jmc-collection-images", "PROFILES_BUCKET": 'jmc-profile-pictures', "UGC_BUCKET": 'jmc-sheet-user-uploaded-media', - "DONATION_URL": "https://jackmillercenter.org/support-us/" + "DONATION_URL": "https://jackmillercenter.org/support-us/", + "TOPICS_BUCKET": 'topicimages' } diff --git a/static/css/s2.css b/static/css/s2.css index 96bacdb519..98aeef3096 100644 --- a/static/css/s2.css +++ b/static/css/s2.css @@ -579,7 +579,7 @@ input.noselect { text-align: inherit; background: #EDEDEC; border-radius: 250px; - width: 140px; + width: 160px; height: 30px; } .header .searchBox.searchFocused { diff --git a/static/js/Header.jsx b/static/js/Header.jsx index 23ca82ff41..5b468b1a4b 100644 --- a/static/js/Header.jsx +++ b/static/js/Header.jsx @@ -381,7 +381,7 @@ class SearchBar extends Component { { const [bannerShowDelayHasElapsed, setBannerShowDelayHasElapsed] = useState(false); const [hasInteractedWithBanner, setHasInteractedWithBanner] = useState(false); - const strapi = useContext(StrapiDataContext); + let strapi = {}; + strapi.banner = { + "internalBannerName": "contextus-welcome", + "bannerText": {"en": "Welcome to ContextUS! We are in the process of upgrading and expanding our site. Please excuse our dust! If you have any questions or feedback, please contact us at jmc@gojmc.org.", + "he": "Welcome to ContextUS! We are in the process of upgrading and expanding our site. Please excuse our dust! If you have any questions or feedback, please contact us at jmc@gojmc.org."}, + "buttonText": {"en": "Contact Us", "he": "Contact Us"}, + "buttonURL": {"en": "mailto:jmc@gojmc.org", "he": "mailto:jmc@gojmc.org"}, + "showDelay": 2, + "bannerBackgroundColor": '#133059', + "locale": "en", + "localizations": { "data": [] }, + "publishedAt": "2023-12-05T23:18:42.245Z", + "shouldDeployOnMobile": true, + "showToNewVisitors": true, + "showToNonSustainers": true, + "showToReturningVisitors": true, + "showToSustainers": true, + } const markBannerAsHasBeenInteractedWith = (bannerName) => { localStorage.setItem("banner_" + bannerName, "true"); @@ -2288,7 +2305,7 @@ const Banner = ({ onClose }) => { }, strapi.banner.showDelay * 1000); return () => clearTimeout(timeoutId); // clearTimeout on component unmount } - }, [strapi.banner]); // execute useEffect when the banner changes + }, []); // execute useEffect when the banner changes if (!bannerShowDelayHasElapsed) return null; @@ -2307,9 +2324,9 @@ const Banner = ({ onClose }) => {
diff --git a/static/js/SearchPage.jsx b/static/js/SearchPage.jsx index 27445215a0..438b2bc3c9 100644 --- a/static/js/SearchPage.jsx +++ b/static/js/SearchPage.jsx @@ -46,9 +46,9 @@ class SearchPage extends Component { { this.props.query } - {this.state.totalResults ? + {this.state.totalResults?.getValue() > 0 ?
- {this.state.totalResults.toLocaleString()}  + {this.state.totalResults.asString()}  Results
: null } @@ -71,11 +71,11 @@ class SearchPage extends Component { {(Sefaria.multiPanel && !this.props.compare) || this.state.mobileFiltersOpen ?
- {this.state.totalResults ? + {this.state.totalResults?.getValue() > 0 ? this.setState({mobileFiltersOpen: false})} diff --git a/static/js/SearchResultList.jsx b/static/js/SearchResultList.jsx index 3ba7626386..adda284d77 100644 --- a/static/js/SearchResultList.jsx +++ b/static/js/SearchResultList.jsx @@ -76,6 +76,36 @@ const SearchTopic = (props) => { } +class SearchTotal { + constructor({value=0, relation="eq"} = {}) { + this._value = value; + this._relation = relation; + } + getValue = () => this._value; + add = (num) => this._value += num; + asString = () => `${this._value.addCommas()}${this._getRelationString()}`; + _getRelationString = () => this._relation === 'gte' ? '+' : ''; + combine = (other) => { + if (!(other instanceof SearchTotal)) { + throw new TypeError('Parameter must be an instance of SearchTotal.'); + } + const newValue = this.getValue() + other.getValue(); + let newRelation = this._relation; + if (other._relation === 'gte' || this._relation === 'gte') { + newRelation = 'gte'; + } + return new SearchTotal({value: newValue, relation: newRelation}); + }; +} + + +function createSearchTotal(total) { + /** + * this function ensures backwards compatibility between the way elasticsearch formats the total pre-v8 and post-v8 + */ + const totalObj = typeof(total) === 'number' ? {value: total} : {value: total.value, relation: total.relation}; + return new SearchTotal(totalObj) +} class SearchResultList extends Component { @@ -87,7 +117,7 @@ class SearchResultList extends Component { runningQueries: this._typeObjDefault(null), isQueryRunning: this._typeObjDefault(false), moreToLoad: this._typeObjDefault(true), - totals: this._typeObjDefault(0), + totals: this._typeObjDefault(new SearchTotal()), pagesLoaded: this._typeObjDefault(0), hits: this._typeObjDefault([]), error: false, @@ -104,7 +134,7 @@ class SearchResultList extends Component { //console.log("Loaded cached query for") //console.log(args); this.state.hits[t] = this.state.hits[t].concat(cachedQuery.hits.hits); - this.state.totals[t] = cachedQuery.hits.total; + this.state.totals[t] = createSearchTotal(cachedQuery.hits.total); this.state.pagesLoaded[t] += 1; args.start = this.state.pagesLoaded[t] * this.querySize[t]; if (t === "text") { @@ -127,7 +157,7 @@ class SearchResultList extends Component { componentWillReceiveProps(newProps) { if(this.props.query !== newProps.query) { this.setState({ - totals: this._typeObjDefault(0), + totals: this._typeObjDefault(new SearchTotal()), hits: this._typeObjDefault([]), moreToLoad: this._typeObjDefault(true), }); @@ -245,7 +275,7 @@ class SearchResultList extends Component { this.setState(this.state); } totalResults() { - return this.types.reduce((accum, type) => (this.state.totals[type].value + accum), 0); + return this.types.reduce((accum, type) => (this.state.totals[type].combine(accum)), new SearchTotal()); } updateTotalResults() { this.props.updateTotalResults(this.totalResults()); @@ -324,11 +354,12 @@ class SearchResultList extends Component { args.success = data => { this.updateRunningQuery(type, null); if (this.state.pagesLoaded[type] === 0) { // Skip if pages have already been loaded from cache, but let aggregation processing below occur + const currTotal = createSearchTotal(data.hits.total); let state = { hits: extend(this.state.hits, {[type]: data.hits.hits}), - totals: extend(this.state.totals, {[type]: data.hits.total}), + totals: extend(this.state.totals, {[type]: currTotal}), pagesLoaded: extend(this.state.pagesLoaded, {[type]: 1}), - moreToLoad: 
extend(this.state.moreToLoad, {[type]: data.hits.total > this.querySize[type]}) + moreToLoad: extend(this.state.moreToLoad, {[type]: currTotal.getValue() > this.querySize[type]}) }; this.setState(state, () => { this.updateTotalResults(); @@ -336,7 +367,7 @@ class SearchResultList extends Component { }); const filter_label = (request_applied && request_applied.length > 0) ? (' - ' + request_applied.join('|')) : ''; const query_label = props.query + filter_label; - Sefaria.track.event("Search", `${this.props.searchInBook? "SidebarSearch ": ""}Query: ${type}`, query_label, data.hits.total); + Sefaria.track.event("Search", `${this.props.searchInBook? "SidebarSearch ": ""}Query: ${type}`, query_label, createSearchTotal(data.hits.total).getValue()); } if (data.aggregations) { @@ -395,7 +426,7 @@ class SearchResultList extends Component { this.state.hits[type] = nextHits; this.state.pagesLoaded[type] += 1; - if (this.state.pagesLoaded[type] * this.querySize[type] >= this.state.totals[type] ) { + if (this.state.pagesLoaded[type] * this.querySize[type] >= this.state.totals[type].getValue() ) { this.state.moreToLoad[type] = false; } @@ -479,8 +510,8 @@ class SearchResultList extends Component { : null } {Sefaria.multiPanel && !this.props.compare ? @@ -522,15 +553,13 @@ const SearchTabs = ({clickTextButton, clickSheetButton, textTotal, sheetTotal, c const SearchTab = ({label, total, onClick, active}) => { - total = total ? total.toLocaleString() : 0; - const classes = classNames({"search-dropdown-button": 1, active}); return (
{e.charCode === 13 ? onClick(e) : null}} role="button" tabIndex="0">
{label}  - {`(${total})`} + {`(${total.asString()})`}
);
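For illustration only, here is a minimal Python sketch (the helper name es6_compat_total is hypothetical, not part of this change) of the hits.total flattening that search_wrapper_api performs when es6_compat is true, and that createSearchTotal mirrors on the client:

# Minimal sketch; assumes an Elasticsearch-8-style response dict.
# `es6_compat_total` is a hypothetical helper, not part of this change.
def es6_compat_total(response_json, es6_compat=True):
    """In ES8, hits.total is {"value": N, "relation": "eq"|"gte"}; flatten it to a plain int for ES6-era clients."""
    total = response_json["hits"]["total"]
    if es6_compat and isinstance(total, dict):
        response_json["hits"]["total"] = total["value"]
    return response_json

# Example: an ES8-style response body
resp = {"hits": {"total": {"value": 1234, "relation": "gte"}, "hits": []}}
assert es6_compat_total(resp)["hits"]["total"] == 1234

Note that when the relation is "gte", flattening to a plain integer drops the "at least" qualifier, which is why the frontend's new SearchTotal class keeps the relation and renders it as a trailing "+" in asString().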