diff --git a/_plugins-vespafeed/vespa_index_generator.rb b/_plugins-vespafeed/vespa_index_generator.rb index 9f43dcae68..f5b44ddc93 100644 --- a/_plugins-vespafeed/vespa_index_generator.rb +++ b/_plugins-vespafeed/vespa_index_generator.rb @@ -12,11 +12,26 @@ class VespaIndexGenerator < Jekyll::Generator def generate(site) namespace = site.config["search"]["namespace"] operations = [] + puts "::debug::VespaIndexGenerator is processing pages" + + if site.pages.empty? + # Drop out with an error + puts "::error::No pages found!" + return false + end + + puts "::debug::Pages found: #{site.pages.size}" site.pages.each do |page| - next if page.path.start_with?("css/") || - page.url.start_with?("/redirects.json") || - is_empty(page) - if page.data["index"] + # Skip pages that should not be indexed + next if ( + page.path.start_with?("css/") || + page.url.start_with?("/redirects.json") || + page.url.start_with?("/search.html") || + is_empty(page) + ) + + if page.data["index"] == true + puts "::debug::Processing page: #{page.url}" url = page.url url += 'index.html' if url[-1, 1] == '/' text = extract_text(page) @@ -35,11 +50,16 @@ def generate(site) fields[:outlinks] = outlinks if !outlinks.empty? fields[:headers] = headers if !headers.empty? fields[:keywords] = keywords if !keywords.empty? - operations.push({:put => "id:" + namespace + ":doc::" + namespace + url, - :fields => fields}) + operations.push({ + :put => "id:" + namespace + ":doc::" + namespace + url, + :fields => fields + }) + else + puts "::debug::Page not indexed: #{page.url}, index flag: #{page.data['index']}" end end json = JSON.pretty_generate(operations) + puts "::debug::Writing index file: #{namespace}_index.json" File.open(namespace + "_index.json", "w") { |f| f.write(json) } end diff --git a/feed-split.py b/feed-split.py index 42bd9699e9..6944d1e437 100755 --- a/feed-split.py +++ b/feed-split.py @@ -11,7 +11,7 @@ import tiktoken import urllib.parse -encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") +encoding = tiktoken.encoding_for_model("gpt-4o-mini") note_pattern = re.compile(r"{%\s*include.*?%}", flags=re.DOTALL) highlight_pattern = re.compile(r"{%\s*.*?\s%}", flags=re.DOTALL) @@ -47,9 +47,6 @@ def is_selfhosted_doc(doc): return True return False -def remove_escape(text): - return text.replace("\\_","_") - def create_text_doc(doc, paragraph, paragraph_id, header): id = doc['put'] #id:open:doc::open/en/access-logging.html# @@ -67,7 +64,7 @@ def create_text_doc(doc, paragraph, paragraph_id, header): "path": fields['path'], "doc_id": fields['path'], "namespace": new_namespace, - "content": remove_escape(paragraph), + "content": paragraph, "content_tokens": n_tokens, "base_uri": sys.argv[2], "selfhosted": is_selfhosted_doc(doc) @@ -77,12 +74,13 @@ def create_text_doc(doc, paragraph, paragraph_id, header): if header: title = fields['title'] new_title = title + " - " + header - new_doc["fields"]["title"] = remove_escape(new_title) + new_doc["fields"]["title"] = new_title if paragraph_id is None: paragraph_id = str(random.randint(0,1000)) - new_doc['fields']['path'] = remove_escape(new_doc['fields']['path'] + "#" + paragraph_id.replace("?","")) + new_doc['fields']['path'] = new_doc['fields']['path'] + \ + "#" + paragraph_id.replace("?", "") new_doc['put'] = new_doc['put'] + "-" + urllib.parse.quote(paragraph_id) return new_doc @@ -226,6 +224,9 @@ def main(): paragraph = paragraph.replace("```\nraw","```\n") paragraph = paragraph.replace("```\njava","```java\n") + # Necessary backslashes and quotes will be added when json-serialized. + paragraph = paragraph.replace("\\", "") + paragraph = remove_jekyll(paragraph) if paragraph: