From 1f98eaddb94f11a1d7e68233ff9cb78a630c825b Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sat, 11 Jan 2025 12:45:56 +0000 Subject: [PATCH 01/19] =?UTF-8?q?reruirements=E3=81=AE=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/requirements.txt b/requirements.txt index b679205..f83ee79 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,43 +1,44 @@ aiohappyeyeballs==2.4.3 ; python_version >= "3.10" and python_version < "4.0" -aiohttp==3.10.10 ; python_version >= "3.10" and python_version < "4.0" -aiosignal==1.3.2 ; python_version >= "3.10" and python_version < "4.0" +aiohttp==3.10.11 ; python_version >= "3.10" and python_version < "4.0" +aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "4.0" annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0" anyio==4.6.0 ; python_version >= "3.10" and python_version < "4.0" async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11" attrs==24.2.0 ; python_version >= "3.10" and python_version < "4.0" azure-core==1.31.0 ; python_version >= "3.10" and python_version < "4.0" -azure-cosmos==4.7.0 ; python_version >= "3.10" and python_version < "4.0" +azure-cosmos==4.9.0 ; python_version >= "3.10" and python_version < "4.0" certifi==2024.8.30 ; python_version >= "3.10" and python_version < "4.0" -charset-normalizer==3.4.1 ; python_version >= "3.10" and python_version < "4.0" +charset-normalizer==3.4.0 ; python_version >= "3.10" and python_version < "4.0" click==8.1.7 ; python_version >= "3.10" and python_version < "4.0" colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0" and platform_system == "Windows" dataclasses-json==0.6.7 ; python_version >= "3.10" and python_version < "4.0" distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0" -duckduckgo-search==6.3.2 ; 
python_version >= "3.10" and python_version < "4.0" +duckduckgo-search==6.3.7 ; python_version >= "3.10" and python_version < "4.0" exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11" frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0" greenlet==3.1.1 ; python_version < "3.13" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version >= "3.10" h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0" httpcore==1.0.6 ; python_version >= "3.10" and python_version < "4.0" +httpx-sse==0.4.0 ; python_version >= "3.10" and python_version < "4.0" httpx==0.27.2 ; python_version >= "3.10" and python_version < "4.0" idna==3.10 ; python_version >= "3.10" and python_version < "4.0" jiter==0.6.1 ; python_version >= "3.10" and python_version < "4.0" jsonpatch==1.33 ; python_version >= "3.10" and python_version < "4.0" jsonpointer==3.0.0 ; python_version >= "3.10" and python_version < "4.0" -langchain-community==0.3.3 ; python_version >= "3.10" and python_version < "4.0" -langchain-core==0.3.12 ; python_version >= "3.10" and python_version < "4.0" -langchain-openai==0.2.3 ; python_version >= "3.10" and python_version < "4.0" -langchain-text-splitters==0.3.0 ; python_version >= "3.10" and python_version < "4.0" -langchain==0.3.4 ; python_version >= "3.10" and python_version < "4.0" -langsmith==0.1.137 ; python_version >= "3.10" and python_version < "4.0" +langchain-community==0.3.13 ; python_version >= "3.10" and python_version < "4.0" +langchain-core==0.3.28 ; python_version >= "3.10" and python_version < "4.0" +langchain-openai==0.2.14 ; python_version >= "3.10" and python_version < "4.0" +langchain-text-splitters==0.3.4 ; python_version >= "3.10" and python_version < "4.0" +langchain==0.3.13 ; python_version >= "3.10" and python_version < 
"4.0" +langsmith==0.1.147 ; python_version >= "3.10" and python_version < "4.0" marshmallow==3.22.0 ; python_version >= "3.10" and python_version < "4.0" multidict==6.1.0 ; python_version >= "3.10" and python_version < "4.0" mypy-extensions==1.0.0 ; python_version >= "3.10" and python_version < "4.0" numpy==1.26.4 ; python_version >= "3.10" and python_version < "4.0" -openai==1.52.0 ; python_version >= "3.10" and python_version < "4.0" -orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0" +openai==1.58.1 ; python_version >= "3.10" and python_version < "4.0" +orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy" packaging==24.1 ; python_version >= "3.10" and python_version < "4.0" -primp==0.6.4 ; python_version >= "3.10" and python_version < "4.0" +primp==0.8.1 ; python_version >= "3.10" and python_version < "4.0" propcache==0.2.0 ; python_version >= "3.10" and python_version < "4.0" pydantic-core==2.23.4 ; python_version >= "3.10" and python_version < "4.0" pydantic-settings==2.5.2 ; python_version >= "3.10" and python_version < "4.0" From d9e06a0c31eb32b752105f69d61e5f863974dc0c Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sat, 11 Jan 2025 13:09:20 +0000 Subject: [PATCH 02/19] =?UTF-8?q?md=5Fformatter=E9=96=A2=E6=95=B0=E3=81=AE?= =?UTF-8?q?=E5=BC=95=E6=95=B0=E3=81=ABtitle=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/document_formatter.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index 2b27712..38a7d4b 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -124,6 +124,7 @@ def add_metadata( def md_formatter( text: str, + title: str | None = None, chunk_size: int = CHUNK_SIZE, chunk_overlap: int 
= CHUNK_OVERLAP, **kwargs: Any @@ -131,6 +132,7 @@ def md_formatter( """Markdown形式のテキストをフォーマットする関数 Args: text (str): Markdown形式のテキスト + title (str, optional): タイトル. chunk_size (int, optional): 分割するサイズ. chunk_overlap (int, optional): オーバーラップのサイズ. @@ -138,7 +140,7 @@ def md_formatter( """ formatted_docs: list[Document] = [] for doc in markdown_splitter(text): - t = _find_header(doc) + t = _find_header(doc) if title is None else title if len(doc.page_content) > chunk_size: rdocs = recursive_document_splitter( [doc], @@ -219,8 +221,8 @@ def print_docs(docs: list[Document]) -> None: print() - docs = md_formatter(md_text) + docs = md_formatter(md_text, title="Sample Markdown") print_docs(docs) - docs = text_formatter(md_text) - print_docs(docs) + # docs = text_formatter(md_text) + # print_docs(docs) From 78b834ad13b08c8a8197a0923a3da364606b244b Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sat, 11 Jan 2025 13:15:45 +0000 Subject: [PATCH 03/19] =?UTF-8?q?md=5Fformatter=E9=96=A2=E6=95=B0=E3=81=A7?= =?UTF-8?q?=E3=83=A1=E3=82=BF=E3=83=87=E3=83=BC=E3=82=BF=E3=82=92kwargs?= =?UTF-8?q?=E3=81=A7=E3=81=AF=E7=84=A1=E3=81=8F=E6=98=8E=E7=A4=BA=E7=9A=84?= =?UTF-8?q?=E3=81=AB=E5=8F=97=E3=81=91=E5=8F=96=E3=82=8B=E3=82=88=E3=81=86?= =?UTF-8?q?=E3=81=AB=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/document_formatter.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index 38a7d4b..a02fb53 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -97,6 +97,7 @@ def add_metadata( source (str, optional): ソース. with_timestamp (bool, optional): タイムスタンプの有無. Defaults to True. with_section_number (bool, optional): セクション番号の有無. Defaults to False. + **kwargs: その他のメタデータ. 
""" i = 1 date = datetime.now().strftime("%Y-%m-%d") @@ -125,20 +126,23 @@ def add_metadata( def md_formatter( text: str, title: str | None = None, + metadata: dict[str, Any] | None = None, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP, - **kwargs: Any ) -> list[Document]: """Markdown形式のテキストをフォーマットする関数 Args: text (str): Markdown形式のテキスト title (str, optional): タイトル. + metadata (dict[str, Any], optional): メタデータ. chunk_size (int, optional): 分割するサイズ. chunk_overlap (int, optional): オーバーラップのサイズ. chunk_sizeを超えるテキストは再分割し、メタデータにセクション番号を付与します. """ formatted_docs: list[Document] = [] + _metadata = metadata if metadata is not None else {} + for doc in markdown_splitter(text): t = _find_header(doc) if title is None else title if len(doc.page_content) > chunk_size: @@ -151,13 +155,13 @@ def md_formatter( rdocs, title=t if t is not None else rdocs[0].page_content, with_section_number=True, - **kwargs + **_metadata ) else: formatted_docs += add_metadata( [doc], title=t if t is not None else doc.page_content, - **kwargs + **_metadata ) return formatted_docs From 25f36c0b5b5e3d3bcf1c7fb1010f9dd91edf6a37 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sat, 11 Jan 2025 13:28:23 +0000 Subject: [PATCH 04/19] =?UTF-8?q?text=5Fformatter=E3=82=92=E5=90=8C?= =?UTF-8?q?=E6=A7=98=E3=81=AB=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/document_formatter.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index a02fb53..c909346 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -169,13 +169,16 @@ def md_formatter( def text_formatter( text: str, separator: str = "\n\n", + title: str | None = None, + metadata: dict[str, Any] | None = None, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = 
CHUNK_OVERLAP, - **kwargs: Any ) -> list[Document]: """テキストをフォーマットする関数 Args: text (str): テキスト + title (str, optional): タイトル. + metadata (dict[str, Any], optional): メタデータ. separator (str, optional): 区切り文字. chunk_size (int, optional): 分割するサイズ. chunk_overlap (int, optional): オーバーラップのサイズ. @@ -190,9 +193,9 @@ def text_formatter( ) return add_metadata( docs, - title=docs[0].page_content, - with_section_number=True, - **kwargs + title=docs[0].page_content if title is None else title, + with_section_number=True if len(docs) > 1 else False, + **metadata if metadata is not None else {}, ) if __name__ == "__main__": @@ -225,8 +228,8 @@ def print_docs(docs: list[Document]) -> None: print() - docs = md_formatter(md_text, title="Sample Markdown") + docs = md_formatter(md_text, title="hogehogehoge", metadata={"fuga": "piyopiyo"}) print_docs(docs) - # docs = text_formatter(md_text) - # print_docs(docs) + docs = text_formatter(md_text, title="hogehogehoge", metadata={"fuga": "piyopiyo"}) + print_docs(docs) From 22d757d450587173fd90ddb5bdaed94634141129 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sat, 11 Jan 2025 13:32:10 +0000 Subject: [PATCH 05/19] =?UTF-8?q?create=5Fdocument=E3=81=A7=E3=81=AE?= =?UTF-8?q?=E3=83=95=E3=82=A9=E3=83=BC=E3=83=9E=E3=83=83=E3=83=88=E9=96=A2?= =?UTF-8?q?=E6=95=B0=E3=81=AE=E5=91=BC=E3=81=B3=E5=87=BA=E3=81=97=E3=82=92?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index e8be466..507590c 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -84,12 +84,15 @@ def __init__( def create_document( self, text: str, - text_type: Literal["markdown", "plain"] = "markdown" + text_type: Literal["markdown", "plain"] = "markdown", + title: str 
| None = None, + metadata: dict[str, Any] | None = None, ) -> list[str]: """データベースに新しいdocumentを作成する関数""" logger.info("新しいdocumentを作成します") texts, metadatas = self._division_document( - md_formatter(text) if text_type == "markdown" else text_formatter(text) + md_formatter(text, title, metadata) if text_type == "markdown" + else text_formatter(text, title=title, metadata=metadata) ) ids = self._insert_texts(texts, metadatas) return ids From 936d440d51b1228357b1e3f927bc411e6c7f873c Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sat, 11 Jan 2025 14:43:43 +0000 Subject: [PATCH 06/19] =?UTF-8?q?docs=E3=81=AB=E3=83=95=E3=82=A9=E3=83=BC?= =?UTF-8?q?=E3=83=9E=E3=83=83=E3=83=88=E9=96=A2=E6=95=B0=E3=81=AE=E8=AA=AC?= =?UTF-8?q?=E6=98=8E=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/document_formatter.md | 62 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 docs/document_formatter.md diff --git a/docs/document_formatter.md b/docs/document_formatter.md new file mode 100644 index 0000000..f3c816a --- /dev/null +++ b/docs/document_formatter.md @@ -0,0 +1,62 @@ +# DocumentFormatter + +`CosmosDBManager`クラスからドキュメントを追加する場合`src/sc_system_ai/template/document_formatter.py`の関数群を使用します。 + +## 基本動作 + +テキストを分割する関数は、文章を1000文字程度で分割します。この大きさは引数`chunk_size`、`chunk_overlap`から指定可能です。 + +以下のメタデータを付与します。 + +- created_at : 作成日時 +- updated_at : 更新日時 + +### `md_formatter()` + +#### 引数 + +| 引数名 | 型 | 説明 | +|----------------|-------------------|--------------------------------| +| `text` | str | Markdown形式のテキスト | +| `title` | str (optional) | タイトル | +| `metadata` | dict[str, Any] (optional) | メタデータ | +| `chunk_size` | int (optional) | 分割するサイズ | +| `chunk_overlap`| int (optional) | オーバーラップのサイズ | + +#### 動作 + +マークダウン形式のテキストを分割し、メタデータを付与します。 +`Document`オブジェクトを返却します。 +メタデータにはヘッダーが付与されています。 + +テキストの分割はヘッダー毎に行います。 +分割したテキストがチャンクサイズを超える場合また分割を行います。 +2度目の分割を行ったテキストにはセクション番号がメタデータとして付与されます。 + 
+`title`を与えず呼び出した場合、対応するヘッダーをタイトルとしてメタデータに与えます。 +ヘッダーがない場合は分割後のテキストの最初のテキストをタイトルとします。 + +### `text_formatter()` + +#### 引数 + +| 引数名 | 型 | 説明 | +|----------------|-------------------|--------------------------------| +| `text` | str | テキスト | +| `title` | str (optional) | タイトル | +| `metadata` | dict[str, Any] (optional) | メタデータ | +| `separator` | str (optional) | 区切り文字 | +| `chunk_size` | int (optional) | 分割するサイズ | +| `chunk_overlap`| int (optional) | オーバーラップのサイズ | + +#### 動作 + +セパレータとチャンクサイズで分割を行い、メタデータを付与します。 + +`title`を与えず呼び出した場合、分割後のテキストの最初のテキストをタイトルとします。 + +## `CosmosDBManager`での動作 + +`create_document`メソッドでベクターストアにドキュメントを作成します。 + +`update_document`メソッドではメタデータ`updated_at`の更新を行います。 From 782d6f89da3ee4f0c321c18cb68c7d1593035ec9 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 13 Jan 2025 18:52:25 +0000 Subject: [PATCH 07/19] =?UTF-8?q?add=5Fmetadata=E3=81=A7section=5Fnumber?= =?UTF-8?q?=E4=BB=98=E4=B8=8E=E3=81=A8=E5=90=8C=E6=99=82=E3=81=ABgroup=5Fi?= =?UTF-8?q?d=E3=82=92=E4=BB=98=E4=B8=8E=E3=81=99=E3=82=8B=E3=82=88?= =?UTF-8?q?=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../template/document_formatter.py | 56 +++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index c909346..a1a505f 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -1,6 +1,7 @@ import re from datetime import datetime from typing import Any +from uuid import uuid4 from langchain_core.documents import Document from langchain_text_splitters import ( @@ -99,29 +100,39 @@ def add_metadata( with_section_number (bool, optional): セクション番号の有無. Defaults to False. **kwargs: その他のメタデータ. 
""" - i = 1 - date = datetime.now().strftime("%Y-%m-%d") - for doc in documents: - doc.metadata["title"] = title - - if source is not None and \ - doc.metadata.get("source") is None: - doc.metadata["source"] = source - - if with_timestamp and \ - doc.metadata.get("created_at") is None: - doc.metadata["created_at"] = date - doc.metadata["updated_at"] = date - - if with_section_number and \ - doc.metadata.get("section_number") is None: - doc.metadata["section_number"] = i - i += 1 - - for key, value in kwargs.items(): - doc.metadata[key] = value + m: dict[str, Any] = { + "title": title, **kwargs + } + + if source is not None: + m["source"] = source + if with_timestamp: + date = datetime.now().strftime("%Y-%m-%d") + m["created_at"] = date + m["updated_at"] = date + doc_id = str(uuid4()) + + return [ + _add_metadata( + doc, + {**m, "section_number": i, "group_id": doc_id} + if with_section_number else m + ) + for i, doc in enumerate(documents, start=1) + ] - return documents +def _add_metadata( + document: Document, + metadata: dict[str, Any] +) -> Document: + """メタデータを追加する関数 + Args: + document (Document): ドキュメント + metadata (dict[str, Any]): メタデータ. 
+ """ + for key, value in metadata.items(): + document.metadata[key] = value + return document def md_formatter( text: str, @@ -151,6 +162,7 @@ def md_formatter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) + formatted_docs += add_metadata( rdocs, title=t if t is not None else rdocs[0].page_content, From 6ecd10ad7a205ee4b58c55c2e65275262655eea2 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 13 Jan 2025 19:38:01 +0000 Subject: [PATCH 08/19] =?UTF-8?q?id=E3=81=A7=E3=82=AF=E3=82=A8=E3=83=AA?= =?UTF-8?q?=E3=82=92=E8=A1=8C=E3=81=86=E3=83=A1=E3=82=BD=E3=83=83=E3=83=89?= =?UTF-8?q?=E3=82=92=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 51 ++++++++++++++++++----- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 507590c..7b4341a 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -81,6 +81,33 @@ def __init__( create_container=create_container, ) + def read_item_by_id( + self, + id: str, + values: list[str] | None = None, + ) -> dict[str, Any] | None: + """idを指定してdocumentを読み込む関数""" + logger.info(f"{id=}のdocumentを読み込みます") + + query = "SELECT " + if values is not None: + query += ", ".join(["c." 
+ value for value in values]) + " " + else: + query += "* " + parameters = [{"name": "@id", "value": id}] + + item = self._container.query_items( + query=query, + parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト + enable_cross_partition_query=True + ) + + if not item: + logger.error(f"{id=}のdocumentが見つかりませんでした") + return None + else: + return item.next() + def create_document( self, text: str, @@ -119,7 +146,6 @@ def update_document( # metadataのupdated_atを更新 query = "SELECT c.metadata FROM c WHERE c.id = @id" parameters = [{"name": "@id", "value": id}] - try: item = self._container.query_items( query=query, @@ -161,9 +187,12 @@ def read_all_documents(self) -> list[Document]: def get_source_by_id(self, id: str) -> str: """idを指定してsourceを取得する関数""" logger.info(f"{id=}のsourceを取得します") - query = "SELECT c.text FROM c WHERE c.id = " + f"'{id}'" + query = "SELECT c.text FROM c WHERE c.id = @id" + parameters = [{"name": "@id", "value": id}] item = self._container.query_items( - query=query, enable_cross_partition_query=True + query=query, + parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト + enable_cross_partition_query=True ).next() result = item["text"] @@ -180,9 +209,9 @@ def get_source_by_id(self, id: str) -> str: cosmos_manager = CosmosDBManager() query = "京都テック" # results = cosmos_manager.read_all_documents() - results = cosmos_manager.similarity_search(query, k=1) - print(results[0]) - print(results[0].metadata["id"]) + # results = cosmos_manager.similarity_search(query, k=1) + # print(results[0]) + # print(results[0].metadata["id"]) # # idで指定したドキュメントのsourceを取得 # ids = results[0].metadata["id"] @@ -190,8 +219,8 @@ def get_source_by_id(self, id: str) -> str: # doc = cosmos_manager.get_source_by_id(ids) # print(doc) -# # documentを更新 -# text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 -# エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" -# _id = "c55bb571-498a-4db9-9da0-e9e35d46906b" -# print(cosmos_manager.update_document(_id, text)) + # 
documentを更新 + text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 +エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" + _id = "989af836-cf9b-44c7-93d2-deff7aeae51f" + print(cosmos_manager.update_document(_id, text)) From 310954fb31bbd900247602b6a22214b7d0da2950 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 13 Jan 2025 19:44:15 +0000 Subject: [PATCH 09/19] =?UTF-8?q?=E3=83=89=E3=82=AD=E3=83=A5=E3=83=A1?= =?UTF-8?q?=E3=83=B3=E3=83=88=E3=81=8C=E8=A6=8B=E3=81=A4=E3=81=8B=E3=82=89?= =?UTF-8?q?=E3=81=AA=E3=81=84=E6=99=82=E3=81=AB=E3=82=A8=E3=83=A9=E3=83=BC?= =?UTF-8?q?=E3=82=92=E6=8A=95=E3=81=92=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB?= =?UTF-8?q?=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 7b4341a..f944477 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -85,7 +85,7 @@ def read_item_by_id( self, id: str, values: list[str] | None = None, - ) -> dict[str, Any] | None: + ) -> dict[str, Any]: """idを指定してdocumentを読み込む関数""" logger.info(f"{id=}のdocumentを読み込みます") @@ -104,9 +104,8 @@ def read_item_by_id( if not item: logger.error(f"{id=}のdocumentが見つかりませんでした") - return None - else: - return item.next() + raise ValueError("documentが見つかりませんでした") + return item.next() def create_document( self, From 1b7643f297a65421cdd72545c620e13293a95edb Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 13 Jan 2025 20:45:00 +0000 Subject: [PATCH 10/19] =?UTF-8?q?=E6=9D=A1=E4=BB=B6=E3=82=92=E5=8F=97?= =?UTF-8?q?=E3=81=91=E5=8F=96=E3=82=8A=E3=82=AF=E3=82=A8=E3=83=AA=E3=81=A7?= =?UTF-8?q?=E3=81=8D=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py 
| 51 ++++++++++++++++------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index f944477..e69527c 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -81,10 +81,10 @@ def __init__( create_container=create_container, ) - def read_item_by_id( + def read_item( self, - id: str, values: list[str] | None = None, + condition: dict[str, Any] | None = None, ) -> dict[str, Any]: """idを指定してdocumentを読み込む関数""" logger.info(f"{id=}のdocumentを読み込みます") @@ -94,18 +94,28 @@ def read_item_by_id( query += ", ".join(["c." + value for value in values]) + " " else: query += "* " - parameters = [{"name": "@id", "value": id}] - - item = self._container.query_items( - query=query, - parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト - enable_cross_partition_query=True - ) + query += "FROM c" + + parameters = [] + if condition is not None: + query += " WHERE" + for key, value in condition.items(): + name = key if "." 
not in key else key.split(".")[-1] + query += f" c.{key} = @{name}" + parameters.append({"name": f"@{name}", "value": value}) + query += " AND" + query = query[:-4] - if not item: + try: + item = self._container.query_items( + query=query, + parameters=parameters if parameters else None, + enable_cross_partition_query=True + ).next() + except StopIteration: logger.error(f"{id=}のdocumentが見つかりませんでした") - raise ValueError("documentが見つかりませんでした") - return item.next() + raise ValueError("documentが見つかりませんでした") from None + return item def create_document( self, @@ -219,7 +229,16 @@ def get_source_by_id(self, id: str) -> str: # print(doc) # documentを更新 - text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 -エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" - _id = "989af836-cf9b-44c7-93d2-deff7aeae51f" - print(cosmos_manager.update_document(_id, text)) +# text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 +# エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" +# _id = "989af836-cf9b-44c7-93d2-deff7aeae51f" +# print(cosmos_manager.update_document(_id, text)) + + item = cosmos_manager.read_item( + values=["text", "metadata"], + condition={ + "id": "989af836-cf9b-44c7-93d2-deff7aeae51f", + "metadata.updated_at": "2025-01-13", + } + ) + print(item) From 9da99ea13b30f80a2250b7824c100a832547300d Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 13 Jan 2025 20:53:31 +0000 Subject: [PATCH 11/19] =?UTF-8?q?replace=E3=82=92=E4=BD=BF=E7=94=A8?= =?UTF-8?q?=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index e69527c..46000ca 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -100,7 +100,7 @@ def read_item( if condition is not None: query += " WHERE" for key, 
value in condition.items(): - name = key if "." not in key else key.split(".")[-1] + name = key if "." not in key else key.replace(".", "_") query += f" c.{key} = @{name}" parameters.append({"name": f"@{name}", "value": value}) query += " AND" From 71c9d5e3dec593c749787d623857fc4659839dbf Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 13 Jan 2025 21:24:12 +0000 Subject: [PATCH 12/19] =?UTF-8?q?=E7=B5=90=E6=9E=9C=E3=82=92list=E3=81=A7?= =?UTF-8?q?=E8=BF=94=E5=8D=B4=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB?= =?UTF-8?q?=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 25 ++++++++++------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 46000ca..a6016a7 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -85,7 +85,7 @@ def read_item( self, values: list[str] | None = None, condition: dict[str, Any] | None = None, - ) -> dict[str, Any]: + ) -> list[dict[str, Any]]: """idを指定してdocumentを読み込む関数""" logger.info(f"{id=}のdocumentを読み込みます") @@ -106,15 +106,15 @@ def read_item( query += " AND" query = query[:-4] - try: - item = self._container.query_items( - query=query, - parameters=parameters if parameters else None, - enable_cross_partition_query=True - ).next() - except StopIteration: + item = list(self._container.query_items( + query=query, + parameters=parameters if parameters else None, + enable_cross_partition_query=True + )) + + if not item: logger.error(f"{id=}のdocumentが見つかりませんでした") - raise ValueError("documentが見つかりませんでした") from None + raise ValueError("documentが見つかりませんでした") return item def create_document( @@ -235,10 +235,7 @@ def get_source_by_id(self, id: str) -> str: # print(cosmos_manager.update_document(_id, text)) item = cosmos_manager.read_item( - values=["text", "metadata"], - 
condition={ - "id": "989af836-cf9b-44c7-93d2-deff7aeae51f", - "metadata.updated_at": "2025-01-13", - } + values=["metadata"], + condition={"id": "989af836-cf9b-44c7-93d2-deff7aeae51f"} ) print(item) From 8b731a690c3e04c4e8db24e96eebc3d7546dba97 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Tue, 14 Jan 2025 13:34:20 +0000 Subject: [PATCH 13/19] =?UTF-8?q?read=5Fitem=E3=82=92=E4=BD=BF=E7=94=A8?= =?UTF-8?q?=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 33 +++++++---------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index a6016a7..bf28808 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -86,8 +86,8 @@ def read_item( values: list[str] | None = None, condition: dict[str, Any] | None = None, ) -> list[dict[str, Any]]: - """idを指定してdocumentを読み込む関数""" - logger.info(f"{id=}のdocumentを読み込みます") + """条件を指定してdocumentを読み込む関数""" + logger.info("documentを読み込みます") query = "SELECT " if values is not None: @@ -153,15 +153,9 @@ def update_document( logger.info("documentを更新します") # metadataのupdated_atを更新 - query = "SELECT c.metadata FROM c WHERE c.id = @id" - parameters = [{"name": "@id", "value": id}] try: - item = self._container.query_items( - query=query, - parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト - enable_cross_partition_query=True - ).next() - except StopIteration: + item = self.read_item(values=["metadata"], condition={"id": id})[0] + except ValueError: logger.error(f"{id=}のdocumentが見つかりませんでした") return "documentが見つかりませんでした" @@ -196,19 +190,12 @@ def read_all_documents(self) -> list[Document]: def get_source_by_id(self, id: str) -> str: """idを指定してsourceを取得する関数""" logger.info(f"{id=}のsourceを取得します") - query = "SELECT c.text FROM c WHERE 
c.id = @id" - parameters = [{"name": "@id", "value": id}] - item = self._container.query_items( - query=query, - parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト - enable_cross_partition_query=True - ).next() - - result = item["text"] - if type(result) is str: - return result - else: - return "sourceが見つかりませんでした" + try: + item = self.read_item(values=["text"], condition={"id": id}) + except ValueError: + return "documentが見つかりませんでした" + result = item[0]["text"] + return cast(str, result) if __name__ == "__main__": From 79724a7456a8ec49b7d56a4dc3cba192ed56634b Mon Sep 17 00:00:00 2001 From: haruki26 Date: Tue, 14 Jan 2025 13:59:15 +0000 Subject: [PATCH 14/19] =?UTF-8?q?md=5Fformatter=E3=81=A7=E6=96=87=E5=AD=97?= =?UTF-8?q?=E6=95=B0=E3=81=AB=E3=82=88=E3=82=8B=E5=88=86=E5=89=B2=E3=81=AE?= =?UTF-8?q?=E6=A9=9F=E8=83=BD=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../template/document_formatter.py | 34 +++++-------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index a1a505f..d169926 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -138,8 +138,6 @@ def md_formatter( text: str, title: str | None = None, metadata: dict[str, Any] | None = None, - chunk_size: int = CHUNK_SIZE, - chunk_overlap: int = CHUNK_OVERLAP, ) -> list[Document]: """Markdown形式のテキストをフォーマットする関数 Args: @@ -151,31 +149,15 @@ def md_formatter( chunk_sizeを超えるテキストは再分割し、メタデータにセクション番号を付与します. 
""" - formatted_docs: list[Document] = [] + docs = markdown_splitter(text) _metadata = metadata if metadata is not None else {} - - for doc in markdown_splitter(text): - t = _find_header(doc) if title is None else title - if len(doc.page_content) > chunk_size: - rdocs = recursive_document_splitter( - [doc], - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - ) - - formatted_docs += add_metadata( - rdocs, - title=t if t is not None else rdocs[0].page_content, - with_section_number=True, - **_metadata - ) - else: - formatted_docs += add_metadata( - [doc], - title=t if t is not None else doc.page_content, - **_metadata - ) - + t = _find_header(docs[0]) if title is None else title + formatted_docs = add_metadata( + docs, + title=t if t is not None else docs[0].page_content, + with_section_number=True if len(docs) > 1 else False, + **_metadata, + ) return formatted_docs def text_formatter( From 2f3b48a28bd643bf2190d47947ee3cd409b17271 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Tue, 14 Jan 2025 15:03:00 +0000 Subject: [PATCH 15/19] =?UTF-8?q?title=E6=9B=B4=E6=96=B0=E5=87=A6=E7=90=86?= =?UTF-8?q?=E3=82=92=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 49 +++++++++++++---------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index bf28808..588113a 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -147,29 +147,37 @@ def _division_document( def update_document( self, id: str, - text: str, + text: str | None = None, + text_type: Literal["markdown", "plain"] | None = None, + title: str | None = None, + metadata: dict[str, Any] | None = None, ) -> str: """データベースのdocumentを更新する関数""" logger.info("documentを更新します") + item = self.read_item(values=["text", "metadata"], condition={"id": id})[0] - # 
metadataのupdated_atを更新 - try: - item = self.read_item(values=["metadata"], condition={"id": id})[0] - except ValueError: - logger.error(f"{id=}のdocumentが見つかりませんでした") - return "documentが見つかりませんでした" + if title is not None: + self._title_updater(id, title, item["metadata"].get("group_id", None)) - metadata = item["metadata"] - metadata["updated_at"] = datetime.now().strftime("%Y-%m-%d") + return "" - to_upsert = { - "id": id, - "text": text, - self._embedding_key: self._embedding.embed_documents([text])[0], - "metadata": metadata, - } - self._container.upsert_item(body=to_upsert) - return id + def _title_updater(self, id: str, title: str, group_id: str | None) -> None: + """titleを更新する関数""" + if group_id is None: + ids = [id] + else: + data = self.read_item(values=["id"], condition={"metadata.group_id": group_id}) + ids = [cast(str, d["id"]) for d in data] + + patch = [{ + "op": "replace", + "path": "/metadata/title", + "value": title + }] + for _id in ids: + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" @@ -221,8 +229,7 @@ def get_source_by_id(self, id: str) -> str: # _id = "989af836-cf9b-44c7-93d2-deff7aeae51f" # print(cosmos_manager.update_document(_id, text)) - item = cosmos_manager.read_item( - values=["metadata"], - condition={"id": "989af836-cf9b-44c7-93d2-deff7aeae51f"} + cosmos_manager.update_document( + id="a1a83722-0086-4819-be99-32d28bfb7e5a", + title="hogehogehogehoge" ) - print(item) From 1ca1e27067c4ed359cb7a04cd56080cfc988de70 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Wed, 15 Jan 2025 14:16:18 +0000 Subject: [PATCH 16/19] =?UTF-8?q?metadata=E6=9B=B4=E6=96=B0=E5=87=A6?= =?UTF-8?q?=E7=90=86=E3=82=92=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 69 ++++++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff 
--git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 588113a..68c4885 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -151,6 +151,8 @@ def update_document( text_type: Literal["markdown", "plain"] | None = None, title: str | None = None, metadata: dict[str, Any] | None = None, + del_metadata: list[str] | None = None, + is_patch: bool = False, ) -> str: """データベースのdocumentを更新する関数""" logger.info("documentを更新します") @@ -159,9 +161,14 @@ def update_document( if title is not None: self._title_updater(id, title, item["metadata"].get("group_id", None)) + if metadata is not None: + self._metadata_updater( + id, metadata, None if is_patch else item["metadata"].get("group_id", None), del_metadata + ) + return "" - def _title_updater(self, id: str, title: str, group_id: str | None) -> None: + def _title_updater(self, id: str, title: str, group_id: str | None = None) -> None: """titleを更新する関数""" if group_id is None: ids = [id] @@ -179,6 +186,62 @@ def _title_updater(self, id: str, title: str, group_id: str | None) -> None: item=_id, partition_key=_id, patch_operations=patch ) + def _metadata_updater( + self, + id: str, + metadata: dict[str, Any], + group_id: str | None = None, + del_metadata: list[str] | None = None, + ) -> None: + """metadataを更新する関数""" + if group_id is None: + data = self.read_item(values=["metadata"], condition={"id": id})[0] + prev_metadata = [cast(dict[str, Any], data["metadata"])] + ids = [id] + else: + datas = self.read_item(values=["id", "metadata"], condition={"metadata.group_id": group_id}) + prev_metadata = [cast(dict[str, Any], d["metadata"]) for d in datas] + ids = [cast(str, d["id"]) for d in datas] + + for _id, pm in zip(ids, prev_metadata, strict=True): + patch = self._create_patch(pm, metadata, [] if del_metadata is None else del_metadata) + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) + + def 
_create_patch( + self, + prev_metadata: dict[str, Any], + new_metadata: dict[str, Any], + del_metadata: list[str], + ) -> list[dict[str, Any]]: + """metadataの差分を取得しパッチ操作を定義する関数""" + patch = [] + for dm in del_metadata: + if dm in new_metadata: + raise ValueError(f"metadata:{dm}は新しいmetadataに含まれています") + if dm in prev_metadata: + patch.append({ + "op": "remove", + "path": f"/metadata/{dm}" + }) + + for key, value in new_metadata.items(): + if key not in prev_metadata: + patch.append({ + "op": "add", + "path": f"/metadata/{key}", + "value": value + }) + elif prev_metadata[key] != value: + patch.append({ + "op": "replace", + "path": f"/metadata/{key}", + "value": value + }) + return patch + + def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" logger.info("全てのdocumentsを読み込みます") @@ -231,5 +294,7 @@ def get_source_by_id(self, id: str) -> str: cosmos_manager.update_document( id="a1a83722-0086-4819-be99-32d28bfb7e5a", - title="hogehogehogehoge" + metadata={"title": "piyopiyo"}, + del_metadata=["source"], + is_patch=False, ) From e570cfde68134840f1b59884a3be02dd1fe3074a Mon Sep 17 00:00:00 2001 From: haruki26 Date: Wed, 15 Jan 2025 15:24:08 +0000 Subject: [PATCH 17/19] =?UTF-8?q?text=E6=9B=B4=E6=96=B0=E5=87=A6=E7=90=86?= =?UTF-8?q?=E3=82=92=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 56 ++++++++++++++++++----- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 68c4885..bc5678e 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -153,9 +153,10 @@ def update_document( metadata: dict[str, Any] | None = None, del_metadata: list[str] | None = None, is_patch: bool = False, - ) -> str: + ) -> list[str]: """データベースのdocumentを更新する関数""" logger.info("documentを更新します") + result = [id] item = 
self.read_item(values=["text", "metadata"], condition={"id": id})[0] if title is not None: @@ -163,10 +164,17 @@ def update_document( if metadata is not None: self._metadata_updater( - id, metadata, None if is_patch else item["metadata"].get("group_id", None), del_metadata + id, metadata, del_metadata, None if is_patch else item["metadata"].get("group_id", None) + ) + + if text is not None: + if text_type is None: + raise TypeError("textを更新する際はtext_typeを指定してください。") + result = self._update_text( + id, text, text_type, item["metadata"].get("group_id", None) ) - return "" + return result def _title_updater(self, id: str, title: str, group_id: str | None = None) -> None: """titleを更新する関数""" @@ -190,20 +198,20 @@ def _metadata_updater( self, id: str, metadata: dict[str, Any], - group_id: str | None = None, del_metadata: list[str] | None = None, + group_id: str | None = None, ) -> None: """metadataを更新する関数""" if group_id is None: data = self.read_item(values=["metadata"], condition={"id": id})[0] - prev_metadata = [cast(dict[str, Any], data["metadata"])] + prev_metadatas = [cast(dict[str, Any], data["metadata"])] ids = [id] else: datas = self.read_item(values=["id", "metadata"], condition={"metadata.group_id": group_id}) - prev_metadata = [cast(dict[str, Any], d["metadata"]) for d in datas] + prev_metadatas = [cast(dict[str, Any], d["metadata"]) for d in datas] ids = [cast(str, d["id"]) for d in datas] - for _id, pm in zip(ids, prev_metadata, strict=True): + for _id, pm in zip(ids, prev_metadatas, strict=True): patch = self._create_patch(pm, metadata, [] if del_metadata is None else del_metadata) self._container.patch_item( item=_id, partition_key=_id, patch_operations=patch @@ -241,6 +249,33 @@ def _create_patch( }) return patch + def _update_text( + self, + id: str, + text: str, + text_type: Literal["markdown", "plain"], + group_id: str | None = None, + ) -> list[str]: + """textを更新する関数""" + created_at = self.read_item(values=["metadata.created_at"], condition={"id": 
id})[0]["created_at"] + if group_id is None: + self.delete_document_by_id(id) + else: + data = self.read_item(values=["id"], condition={"metadata.group_id": group_id}) + for d in data: + self.delete_document_by_id(d["id"]) + + ids = self.create_document(text, text_type) + patch = [{ + "op": "replace", + "path": "/metadata/created_at", + "value": created_at + }] + for _id in ids: + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) + return ids def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" @@ -287,14 +322,13 @@ def get_source_by_id(self, id: str) -> str: # print(doc) # documentを更新 -# text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 + text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 # エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" # _id = "989af836-cf9b-44c7-93d2-deff7aeae51f" # print(cosmos_manager.update_document(_id, text)) cosmos_manager.update_document( id="a1a83722-0086-4819-be99-32d28bfb7e5a", - metadata={"title": "piyopiyo"}, - del_metadata=["source"], - is_patch=False, + text=text, + text_type="markdown", ) From 0e5127531f7d2245019f83c8517389c3027716e5 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Wed, 15 Jan 2025 15:45:01 +0000 Subject: [PATCH 18/19] =?UTF-8?q?update=5Fat=E3=81=AE=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E5=87=A6=E7=90=86=E3=82=92=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 28 +++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index bc5678e..f36b954 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -174,6 +174,18 @@ def update_document( id, text, text_type, item["metadata"].get("group_id", None) ) + if any([title, metadata, del_metadata]): + date = datetime.now().strftime("%Y-%m-%d") + patch = [{ + "op": 
"replace", + "path": "/metadata/updated_at", + "value": date + }] + for _id in result: + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) + return result def _title_updater(self, id: str, title: str, group_id: str | None = None) -> None: @@ -277,6 +289,17 @@ def _update_text( ) return ids + def _update_updated_at(self, id: str) -> None: + """updated_atを更新する関数""" + patch = [{ + "op": "replace", + "path": "/metadata/updated_at", + "value": datetime.now().strftime("%Y-%m-%d") + }] + self._container.patch_item( + item=id, partition_key=id, patch_operations=patch + ) + def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" logger.info("全てのdocumentsを読み込みます") @@ -323,12 +346,13 @@ def get_source_by_id(self, id: str) -> str: # documentを更新 text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 -# エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" +エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" # _id = "989af836-cf9b-44c7-93d2-deff7aeae51f" # print(cosmos_manager.update_document(_id, text)) + cosmos_manager.update_document( - id="a1a83722-0086-4819-be99-32d28bfb7e5a", + id="98941def-479c-4292-ad68-1d6dd9f4800e", text=text, text_type="markdown", ) From b461b91650cb96a4a1cb57316fa30cddbfcd79d5 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Wed, 15 Jan 2025 17:43:41 +0000 Subject: [PATCH 19/19] =?UTF-8?q?update=5Fupdated=5Fat=E3=82=92=E5=89=8A?= =?UTF-8?q?=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index f36b954..29a3111 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -289,17 +289,6 @@ def _update_text( ) return ids - def _update_updated_at(self, id: str) -> None: - """updated_atを更新する関数""" - patch = [{ - "op": 
"replace", - "path": "/metadata/updated_at", - "value": datetime.now().strftime("%Y-%m-%d") - }] - self._container.patch_item( - item=id, partition_key=id, patch_operations=patch - ) - def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" logger.info("全てのdocumentsを読み込みます") @@ -356,3 +345,8 @@ def get_source_by_id(self, id: str) -> str: text=text, text_type="markdown", ) + cosmos_manager.update_document( + id="98941def-479c-4292-ad68-1d6dd9f4800e", + text=text, + text_type="markdown", + )