Skip to content

Commit

Permalink
Merge pull request #3335 from vespa-engine/update-gpt-4-model-and-fix-paragraph-escaping
Browse files Browse the repository at this point in the history

fix(feed-split): Remove unnecessary backslashes and improve title handling
  • Loading branch information
bjormel authored Aug 26, 2024
2 parents 4c7dd2a + 09cd4ed commit 1240363
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 13 deletions.
32 changes: 26 additions & 6 deletions _plugins-vespafeed/vespa_index_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,26 @@ class VespaIndexGenerator < Jekyll::Generator
def generate(site)
namespace = site.config["search"]["namespace"]
operations = []
puts "::debug::VespaIndexGenerator is processing pages"

if site.pages.empty?
# Drop out with an error
puts "::error::No pages found!"
return false
end

puts "::debug::Pages found: #{site.pages.size}"
site.pages.each do |page|
next if page.path.start_with?("css/") ||
page.url.start_with?("/redirects.json") ||
is_empty(page)
if page.data["index"]
# Skip pages that should not be indexed
next if (
page.path.start_with?("css/") ||
page.url.start_with?("/redirects.json") ||
page.url.start_with?("/search.html") ||
is_empty(page)
)

if page.data["index"] == true
puts "::debug::Processing page: #{page.url}"
url = page.url
url += 'index.html' if url[-1, 1] == '/'
text = extract_text(page)
Expand All @@ -35,11 +50,16 @@ def generate(site)
fields[:outlinks] = outlinks if !outlinks.empty?
fields[:headers] = headers if !headers.empty?
fields[:keywords] = keywords if !keywords.empty?
operations.push({:put => "id:" + namespace + ":doc::" + namespace + url,
:fields => fields})
operations.push({
:put => "id:" + namespace + ":doc::" + namespace + url,
:fields => fields
})
else
puts "::debug::Page not indexed: #{page.url}, index flag: #{page.data['index']}"
end
end
json = JSON.pretty_generate(operations)
puts "::debug::Writing index file: #{namespace}_index.json"
File.open(namespace + "_index.json", "w") { |f| f.write(json) }
end

Expand Down
15 changes: 8 additions & 7 deletions feed-split.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import tiktoken
import urllib.parse

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
note_pattern = re.compile(r"{%\s*include.*?%}", flags=re.DOTALL)
highlight_pattern = re.compile(r"{%\s*.*?\s%}", flags=re.DOTALL)

Expand Down Expand Up @@ -47,9 +47,6 @@ def is_selfhosted_doc(doc):
return True
return False

def remove_escape(text):
    """Undo markdown backslash-escaping of underscores (``\\_`` -> ``_``)."""
    escaped_underscore = "\\_"
    return text.replace(escaped_underscore, "_")

def create_text_doc(doc, paragraph, paragraph_id, header):
id = doc['put']
#id:open:doc::open/en/access-logging.html#
Expand All @@ -67,7 +64,7 @@ def create_text_doc(doc, paragraph, paragraph_id, header):
"path": fields['path'],
"doc_id": fields['path'],
"namespace": new_namespace,
"content": remove_escape(paragraph),
"content": paragraph,
"content_tokens": n_tokens,
"base_uri": sys.argv[2],
"selfhosted": is_selfhosted_doc(doc)
Expand All @@ -77,12 +74,13 @@ def create_text_doc(doc, paragraph, paragraph_id, header):
if header:
title = fields['title']
new_title = title + " - " + header
new_doc["fields"]["title"] = remove_escape(new_title)
new_doc["fields"]["title"] = new_title

if paragraph_id is None:
paragraph_id = str(random.randint(0,1000))

new_doc['fields']['path'] = remove_escape(new_doc['fields']['path'] + "#" + paragraph_id.replace("?",""))
new_doc['fields']['path'] = new_doc['fields']['path'] + \
"#" + paragraph_id.replace("?", "")
new_doc['put'] = new_doc['put'] + "-" + urllib.parse.quote(paragraph_id)

return new_doc
Expand Down Expand Up @@ -226,6 +224,9 @@ def main():
paragraph = paragraph.replace("```\nraw","```\n")
paragraph = paragraph.replace("```\njava","```java\n")

# Necessary backslashes and quotes will be added when json-serialized.
paragraph = paragraph.replace("\\", "")

paragraph = remove_jekyll(paragraph)

if paragraph:
Expand Down

0 comments on commit 1240363

Please sign in to comment.