From f16240a85c7317b9ca4112321ebd4692c91e8cb5 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Tue, 25 Jun 2024 13:17:49 -0700 Subject: [PATCH] Fixed sonnet json formatting issue (#293) * Fixed sonnet json formatting issue * PR comments - added notes and types --- paperqa/utils.py | 11 +++++++++++ pyproject.toml | 2 +- tests/test_paperqa.py | 18 ++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/paperqa/utils.py b/paperqa/utils.py index efeb0dec5..1d54df346 100644 --- a/paperqa/utils.py +++ b/paperqa/utils.py @@ -184,4 +184,15 @@ def llm_read_json(text: str) -> dict: text = "{" + text.split("{", 1)[-1] # split anything after the last } text = text.rsplit("}", 1)[0] + "}" + + # escape new lines within strings + def replace_newlines(match: re.Match) -> str: + return match.group(0).replace("\n", "\\n") + + # Match anything between double quotes + # including escaped quotes and other escaped characters. + # https://regex101.com/r/VFcDmB/1 + pattern = r'"(?:[^"\\]|\\.)*"' + text = re.sub(pattern, replace_newlines, text) + return json.loads(text) diff --git a/pyproject.toml b/pyproject.toml index edc61396d..64f7d3e37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ name = "paper-qa" readme = "README.md" requires-python = ">=3.8" urls = {repository = "https://github.com/whitead/paper-qa"} -version = "4.8.0" +version = "4.8.1" [tool.codespell] check-filenames = true diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 9b4ecc7f1..3686bc06d 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -3,6 +3,7 @@ import os import pickle import tempfile +import textwrap from io import BytesIO from pathlib import Path @@ -457,6 +458,23 @@ def test_llm_read_json(example: str): assert llm_read_json(example) == {"example": "json"} +def test_llm_read_json_newlines(): + """Make sure that newlines in json are preserved and escaped.""" + example = textwrap.dedent( + """ + { + "summary": "A line + + Another line", 
"relevance_score": 7 + }""" + ) + assert llm_read_json(example) == { + "summary": "A line\n\nAnother line", + "relevance_score": 7, + } + + @pytest.mark.asyncio() async def test_chain_completion(): client = AsyncOpenAI()