Skip to content

Commit

Permalink
Merge pull request #39 from Sefaria/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
stevekaplan123 authored Feb 14, 2024
2 parents e5aab5d + df85412 commit f037f1f
Show file tree
Hide file tree
Showing 21 changed files with 80 additions and 1,049 deletions.
2 changes: 1 addition & 1 deletion api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def get(self, request, *args, **kwargs):
if not versions_params:
versions_params = ['primary']
versions_params = [self.split_piped_params(param_str) for param_str in versions_params]
fill_in_missing_segments = request.GET.get('fill_in_missing_segments', False)
fill_in_missing_segments = bool(int(request.GET.get('fill_in_missing_segments', False)))
return_format = request.GET.get('return_format', 'default')
if return_format not in self.RETURN_FORMATS:
return jsonResponse({'error': f'return_format should be one of those formats: {self.RETURN_FORMATS}.'}, status=400)
Expand Down
1 change: 0 additions & 1 deletion build/ci/production-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ cronJobs:
enabled: false
reindexElasticSearch:
enabled: true
SEARCH_HOST_ES8: "elasticsearch-8-es-default.elasticsearch.svc"
topicsIndexing:
enabled: true
trello:
Expand Down
2 changes: 1 addition & 1 deletion cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import django
django.setup() # comment for sake of PR
django.setup()

from sefaria.model import *
import sefaria.system.database as database
1 change: 0 additions & 1 deletion helm-chart/sefaria-project/templates/configmap/nginx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ data:
# allow urls which aren't caught by regex above
location /api/search/ {
rewrite ^/(?:api/search)/(.*)$ /$1 break;
proxy_set_header Content-Type application/json; # es 6.0 requires this header
proxy_set_header Authorization "Basic ${ELASTIC_AUTH_HEADER}";
add_header 'Access-Control-Allow-Origin' '';
proxy_pass http://elasticsearch_upstream/;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,77 +0,0 @@
{{- if .Values.cronJobs.reindexElasticSearch.enabled }}
---
# Weekly CronJob that rebuilds the Elasticsearch (ES6) search index by running
# scripts/scheduled/reindex_elasticsearch_cronjob_ES6.py inside the web image.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ .Values.deployEnv }}-reindex-elastic-search-es6
  labels:
    {{- include "sefaria.labels" . | nindent 4 }}
spec:
  schedule: "20 13 * * 0"  # Sundays at 13:20 (cluster timezone)
  jobTemplate:
    spec:
      backoffLimit: 1  # retry a failed reindex at most once
      template:
        spec:
          affinity:
            # Keep the memory-heavy reindex pod off any node running mongo.
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                          - mongo
                  # FIX: was "kubernetes.io.hostname" (dots) — not a real node
                  # label, so the anti-affinity rule could never be evaluated
                  # against a valid topology domain. The standard per-node
                  # topology key is "kubernetes.io/hostname".
                  topologyKey: kubernetes.io/hostname
          containers:
            - name: reindex-elastic-search-es6
              image: "{{ .Values.web.containerImage.imageRegistry }}:{{ .Values.web.containerImage.tag }}"
              resources:
                limits:
                  memory: 9Gi
                requests:
                  memory: 7Gi
              env:
                - name: SEARCH_HOST
                  value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES6 }}"
                - name: REDIS_HOST
                  value: "redis-{{ .Values.deployEnv }}"
                - name: NODEJS_HOST
                  value: "node-{{ .Values.deployEnv }}-{{ .Release.Revision }}"
                - name: VARNISH_HOST
                  value: "varnish-{{ .Values.deployEnv }}-{{ .Release.Revision }}"
                - name: SLACK_URL
                  valueFrom:
                    secretKeyRef:
                      name: {{ template "sefaria.secrets.slackWebhook" . }}
                      key: slack-webhook
              envFrom:
                - secretRef:
                    name: {{ .Values.secrets.localSettings.ref }}
                    optional: true
                - configMapRef:
                    name: local-settings-{{ .Values.deployEnv }}
                - secretRef:
                    name: local-settings-secrets-{{ .Values.deployEnv }}
                    optional: true
              volumeMounts:
                - mountPath: /app/sefaria/local_settings.py
                  name: local-settings
                  subPath: local_settings.py
                  readOnly: true
              command: ["bash"]
              args: [
                "-c",
                "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/scheduled/reindex_elasticsearch_cronjob_ES6.py"
              ]
          restartPolicy: Never  # failed runs are handled via backoffLimit, not pod restarts
          volumes:
            - name: local-settings
              configMap:
                name: local-settings-file-{{ .Values.deployEnv }}
                items:
                  - key: local_settings.py
                    path: local_settings.py
      successfulJobsHistoryLimit: 1
      failedJobsHistoryLimit: 2
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
memory: 7Gi
env:
- name: SEARCH_HOST
value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES8 }}"
value: "{{ .Values.nginx.SEARCH_HOST }}"
- name: REDIS_HOST
value: "redis-{{ .Values.deployEnv }}"
- name: NODEJS_HOST
Expand Down Expand Up @@ -64,7 +64,7 @@ spec:
command: ["bash"]
args: [
"-c",
"mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy elasticsearch==8.8.2 git+https://github.com/Sefaria/[email protected]#egg=elasticsearch-dsl && /app/run /app/scripts/scheduled/reindex_elasticsearch_cronjob.py"
"mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/scheduled/reindex_elasticsearch_cronjob.py"
]
restartPolicy: Never
volumes:
Expand Down
24 changes: 12 additions & 12 deletions reader/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -1653,10 +1653,10 @@ def index_api(request, title, raw=False):
API for manipulating text index records (aka "Text Info")
"""
if request.method == "GET":
with_content_counts = bool(request.GET.get("with_content_counts", False))
with_content_counts = bool(int(request.GET.get("with_content_counts", False)))
i = library.get_index(title).contents(raw=raw, with_content_counts=with_content_counts)

if request.GET.get("with_related_topics", False):
if bool(int(request.GET.get("with_related_topics", False))):
i["relatedTopics"] = get_topics_for_book(title, annotate=True)

return jsonResponse(i, callback=request.GET.get("callback", None))
Expand Down Expand Up @@ -1870,7 +1870,7 @@ def _collapse_book_leaf_shapes(leaf_shapes):
else:
cat_list = title.split("/")
depth = request.GET.get("depth", 2)
include_dependents = request.GET.get("dependents", False)
include_dependents = bool(int(request.GET.get("dependents", False)))
indexes = []
if len(cat_list) == 1:
# try as corpus
Expand Down Expand Up @@ -2075,7 +2075,7 @@ def notes_api(request, note_id_or_ref):
raise Http404
oref = Ref(note_id_or_ref)
cb = request.GET.get("callback", None)
private = request.GET.get("private", False)
private = bool(int(request.GET.get("private", False)))
res = get_notes(oref, uid=creds["user_id"], public=(not private))
return jsonResponse(res, cb)

Expand Down Expand Up @@ -2149,7 +2149,7 @@ def protected_note_post(req):
@catch_error_as_json
def all_notes_api(request):

private = request.GET.get("private", False)
private = bool(int(request.GET.get("private", False)))
if private:
if not request.user.is_authenticated:
res = {"error": "You must be logged in to access you notes."}
Expand All @@ -2165,17 +2165,17 @@ def related_api(request, tref):
"""
Single API to bundle available content related to `tref`.
"""
if request.GET.get("private", False) and request.user.is_authenticated:
if bool(int(request.GET.get("private", False))) and request.user.is_authenticated:
oref = Ref(tref)
response = {
"sheets": get_sheets_for_ref(tref, uid=request.user.id),
"notes": get_notes(oref, uid=request.user.id, public=False)
}
elif request.GET.get("private", False) and not request.user.is_authenticated:
elif bool(int(request.GET.get("private", False))) and not request.user.is_authenticated:
response = {"error": "You must be logged in to access private content."}
else:
response = {
"links": get_links(tref, with_text=False, with_sheet_links=request.GET.get("with_sheet_links", False)),
"links": get_links(tref, with_text=False, with_sheet_links=bool(int(request.GET.get("with_sheet_links", False)))),
"sheets": get_sheets_for_ref(tref),
"notes": [], # get_notes(oref, public=True) # Hiding public notes for now
"webpages": get_webpages_for_ref(tref),
Expand Down Expand Up @@ -2668,7 +2668,7 @@ def name_api(request, name):
name = name[1:] if topic_override else name
# Number of results to return. 0 indicates no limit
LIMIT = int(request.GET.get("limit", 10))
ref_only = request.GET.get("ref_only", False)
ref_only = bool(int(request.GET.get("ref_only", False)))
completions_dict = get_name_completions(name, LIMIT, ref_only, topic_override)
ref = completions_dict["ref"]
topic = completions_dict["topic"]
Expand Down Expand Up @@ -2772,7 +2772,7 @@ def user_stats_api(request, uid):
assert request.method == "GET", "Unsupported Method"
u = request.user
assert (u.is_active and u.is_staff) or (int(uid) == u.id)
quick = bool(request.GET.get("quick", False))
quick = bool(int(request.GET.get("quick", False)))
if quick:
return jsonResponse(public_user_data(uid))
return jsonResponse(user_stats_data(uid))
Expand Down Expand Up @@ -4272,7 +4272,7 @@ def search_wrapper_api(request, es6_compat=False):
search_obj = get_query_obj(search_obj=search_obj, **j)
response = search_obj.execute()
if response.success():
response_json = getattr(response.to_dict(), 'body', response.to_dict())
response_json = response.to_dict().body
if es6_compat and isinstance(response_json['hits']['total'], dict):
response_json['hits']['total'] = response_json['hits']['total']['value']
return jsonResponse(response_json, callback=request.GET.get("callback", None))
Expand Down Expand Up @@ -4618,7 +4618,7 @@ def isNodeJsReachable():
except Exception as e:
logger.warn(f"Failed node healthcheck. Error: {e}")
return False

def is_database_reachable():
try:
from sefaria.system.database import db
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ django==1.11.*
djangorestframework @ https://github.com/encode/django-rest-framework/archive/3.11.1.tar.gz
djangorestframework_simplejwt==3.3.0
PyJWT==1.7.1 # pinned b/c current version 2.0.0 breaks simplejwt. waiting for 2.0.1
elasticsearch==7.17.*
elasticsearch_dsl==7.4.*
elasticsearch==8.8.2
git+https://github.com/Sefaria/[email protected]#egg=elasticsearch-dsl
geojson==2.5.0
geopy==2.3.0
gevent==20.12.0; sys_platform != 'darwin'
Expand Down
49 changes: 0 additions & 49 deletions scripts/scheduled/reindex_elasticsearch_cronjob_ES6.py
Original file line number Diff line number Diff line change
@@ -1,49 +0,0 @@
"""
This file is meant to be temporary while we are migrating to elasticsearch 8
"""
from datetime import datetime
import requests
import traceback
import os
import django
django.setup()
from sefaria.model import *
from sefaria.search_ES6 import index_all
from sefaria.local_settings import SEFARIA_BOT_API_KEY
from sefaria.pagesheetrank import update_pagesheetrank

"""
Source sheets added after last_sheet_timestamp will be missing from the index process. We want to manually index all
source sheets created after this. Depending on the database being used to index the timestamp will be different. If
running against a production database, last_sheet_timestamp will be the time this script began running. Otherwise, this
value will need to be set to the time at which the last mongo dump was created (assuming the database is using the most
up-to-date mongo dump).
"""
# last_sheet_timestamp = datetime.fromtimestamp(os.path.getmtime("/var/data/sefaria_public/dump/sefaria")).isoformat()
try:
    # Capture the cutoff BEFORE indexing starts, so sheets created while
    # index_all() runs are picked up by the timestamp-based API call below.
    last_sheet_timestamp = datetime.now().isoformat()
    update_pagesheetrank()
    index_all()
    # Ask production to index any sheets created after the cutoff; SEFARIA_BOT_API_KEY authenticates the request.
    r = requests.post("https://www.sefaria.org/admin/index-sheets-by-timestamp", data={"timestamp": last_sheet_timestamp, "apikey": SEFARIA_BOT_API_KEY})
    # NOTE(review): substring check on the raw response body — presumably the
    # endpoint returns JSON containing an "error" key on failure; confirm.
    if "error" in r.text:
        raise Exception("Error when calling admin/index-sheets-by-timestamp API: " + r.text)
    else:
        print("SUCCESS!", r.text)
except Exception as e:
    # On any failure, post the traceback to Slack, then re-raise so the
    # CronJob run itself is marked as failed.
    tb_str = traceback.format_exc()
    print("Caught exception")
    post_object = {
        "icon_emoji": ":facepalm:",
        "username": "Reindex ElasticSearch",
        "channel": "#engineering-discuss",
        "attachments": [
            {
                "fallback": tb_str,
                "color": "#a30200",
                "pretext": "Cronjob Error",
                "text": tb_str
            }
        ]
    }
    # SLACK_URL is injected into the pod environment from a secret.
    requests.post(os.environ['SLACK_URL'], json=post_object)
    raise e
2 changes: 1 addition & 1 deletion sefaria/helper/linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ class _FindRefsTextOptions:
@attr version_preferences_by_corpus: dict of dicts of the form { <corpus>: { <lang>: <vtitle> }}
"""

debug: bool = False
with_text: bool = False
debug: bool = False
max_segments: int = 0
version_preferences_by_corpus: dict = None

Expand Down
4 changes: 2 additions & 2 deletions sefaria/helper/tests/linker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ def test_find_refs_text(self, mock_is_hebrew: Mock):
assert find_refs_text.lang == 'en'

def test_find_refs_text_options(self):
find_refs_text_options = linker._FindRefsTextOptions(True, True, 10, {})
assert find_refs_text_options.debug
find_refs_text_options = linker._FindRefsTextOptions(True, False, 10, {})
assert not find_refs_text_options.debug
assert find_refs_text_options.with_text
assert find_refs_text_options.max_segments == 10
assert find_refs_text_options.version_preferences_by_corpus == {}
Expand Down
Loading

0 comments on commit f037f1f

Please sign in to comment.