Skip to content

Commit

Permalink
Merge pull request #3335 from vespa-engine/update-gpt-4-model-and-fix-paragraph-escaping
Browse files Browse the repository at this point in the history

fix(feed-split): Remove unnecessary backslashes and improve title handling
  • Loading branch information
bjormel authored Aug 26, 2024
2 parents 4c7dd2a + 09cd4ed commit 1240363
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 13 deletions.
32 changes: 26 additions & 6 deletions _plugins-vespafeed/vespa_index_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,26 @@ class VespaIndexGenerator < Jekyll::Generator
def generate(site)
namespace = site.config["search"]["namespace"]
operations = []
puts "::debug::VespaIndexGenerator is processing pages"

if site.pages.empty?
# Drop out with an error
puts "::error::No pages found!"
return false
end

puts "::debug::Pages found: #{site.pages.size}"
site.pages.each do |page|
next if page.path.start_with?("css/") ||
page.url.start_with?("/redirects.json") ||
is_empty(page)
if page.data["index"]
# Skip pages that should not be indexed
next if (
page.path.start_with?("css/") ||
page.url.start_with?("/redirects.json") ||
page.url.start_with?("/search.html") ||
is_empty(page)
)

if page.data["index"] == true
puts "::debug::Processing page: #{page.url}"
url = page.url
url += 'index.html' if url[-1, 1] == '/'
text = extract_text(page)
Expand All @@ -35,11 +50,16 @@ def generate(site)
fields[:outlinks] = outlinks if !outlinks.empty?
fields[:headers] = headers if !headers.empty?
fields[:keywords] = keywords if !keywords.empty?
operations.push({:put => "id:" + namespace + ":doc::" + namespace + url,
:fields => fields})
operations.push({
:put => "id:" + namespace + ":doc::" + namespace + url,
:fields => fields
})
else
puts "::debug::Page not indexed: #{page.url}, index flag: #{page.data['index']}"
end
end
json = JSON.pretty_generate(operations)
puts "::debug::Writing index file: #{namespace}_index.json"
File.open(namespace + "_index.json", "w") { |f| f.write(json) }
end

Expand Down
15 changes: 8 additions & 7 deletions feed-split.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import tiktoken
import urllib.parse

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
note_pattern = re.compile(r"{%\s*include.*?%}", flags=re.DOTALL)
highlight_pattern = re.compile(r"{%\s*.*?\s%}", flags=re.DOTALL)

Expand Down Expand Up @@ -47,9 +47,6 @@ def is_selfhosted_doc(doc):
return True
return False

def remove_escape(text):
    """Undo markdown backslash-escaping of underscores (``\\_`` -> ``_``)."""
    escaped_underscore = "\\_"
    return text.replace(escaped_underscore, "_")

def create_text_doc(doc, paragraph, paragraph_id, header):
id = doc['put']
#id:open:doc::open/en/access-logging.html#
Expand All @@ -67,7 +64,7 @@ def create_text_doc(doc, paragraph, paragraph_id, header):
"path": fields['path'],
"doc_id": fields['path'],
"namespace": new_namespace,
"content": remove_escape(paragraph),
"content": paragraph,
"content_tokens": n_tokens,
"base_uri": sys.argv[2],
"selfhosted": is_selfhosted_doc(doc)
Expand All @@ -77,12 +74,13 @@ def create_text_doc(doc, paragraph, paragraph_id, header):
if header:
title = fields['title']
new_title = title + " - " + header
new_doc["fields"]["title"] = remove_escape(new_title)
new_doc["fields"]["title"] = new_title

if paragraph_id is None:
paragraph_id = str(random.randint(0,1000))

new_doc['fields']['path'] = remove_escape(new_doc['fields']['path'] + "#" + paragraph_id.replace("?",""))
new_doc['fields']['path'] = new_doc['fields']['path'] + \
"#" + paragraph_id.replace("?", "")
new_doc['put'] = new_doc['put'] + "-" + urllib.parse.quote(paragraph_id)

return new_doc
Expand Down Expand Up @@ -226,6 +224,9 @@ def main():
paragraph = paragraph.replace("```\nraw","```\n")
paragraph = paragraph.replace("```\njava","```java\n")

# Necessary backslashes and quotes will be added when json-serialized.
paragraph = paragraph.replace("\\", "")

paragraph = remove_jekyll(paragraph)

if paragraph:
Expand Down

0 comments on commit 1240363

Please sign in to comment.