Skip to content

Commit

Permalink
feat(feed-split): Remove unnecessary backslashes and improve title handling
Browse files (browse the repository at this point in the history)
  • Loading branch information
esolitos authored and gitbutler-client committed Aug 26, 2024
1 parent db2b6cb commit e0515bf
Showing 1 changed file with 8 additions and 7 deletions.
15 changes: 8 additions & 7 deletions feed-split.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import tiktoken
import urllib.parse

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
note_pattern = re.compile(r"{%\s*include.*?%}", flags=re.DOTALL)
highlight_pattern = re.compile(r"{%\s*.*?\s%}", flags=re.DOTALL)

Expand Down Expand Up @@ -47,9 +47,6 @@ def is_selfhosted_doc(doc):
return True
return False

def remove_escape(text):
return text.replace("\\_","_")

def create_text_doc(doc, paragraph, paragraph_id, header):
id = doc['put']
#id:open:doc::open/en/access-logging.html#
Expand All @@ -67,7 +64,7 @@ def create_text_doc(doc, paragraph, paragraph_id, header):
"path": fields['path'],
"doc_id": fields['path'],
"namespace": new_namespace,
"content": remove_escape(paragraph),
"content": paragraph,
"content_tokens": n_tokens,
"base_uri": sys.argv[2],
"selfhosted": is_selfhosted_doc(doc)
Expand All @@ -77,12 +74,13 @@ def create_text_doc(doc, paragraph, paragraph_id, header):
if header:
title = fields['title']
new_title = title + " - " + header
new_doc["fields"]["title"] = remove_escape(new_title)
new_doc["fields"]["title"] = new_title

if paragraph_id is None:
paragraph_id = str(random.randint(0,1000))

new_doc['fields']['path'] = remove_escape(new_doc['fields']['path'] + "#" + paragraph_id.replace("?",""))
new_doc['fields']['path'] = new_doc['fields']['path'] + \
"#" + paragraph_id.replace("?", "")
new_doc['put'] = new_doc['put'] + "-" + urllib.parse.quote(paragraph_id)

return new_doc
Expand Down Expand Up @@ -226,6 +224,9 @@ def main():
paragraph = paragraph.replace("```\nraw","```\n")
paragraph = paragraph.replace("```\njava","```java\n")

# Necessary backslashes and quotes will be added when json-serialized.
paragraph = paragraph.replace("\\", "")

paragraph = remove_jekyll(paragraph)

if paragraph:
Expand Down

0 comments on commit e0515bf

Please sign in to comment.