diff --git a/docs/document_formatter.md b/docs/document_formatter.md new file mode 100644 index 0000000..f3c816a --- /dev/null +++ b/docs/document_formatter.md @@ -0,0 +1,62 @@ +# DocumentFormatter + +`CosmosDBManager`クラスからドキュメントを追加する場合`src/sc_system_ai/template/document_formatter.py`の関数群を使用します。 + +## 基本動作 + +テキストを分割する関数は、文章を1000文字程度で分割します。この大きさは引数`chunk_size`、`chunk_overlap`から指定可能です。 + +以下のメタデータを付与します。 + +- created_at : 作成日時 +- updated_at : 更新日時 + +### `md_formatter()` + +#### 引数 + +| 引数名 | 型 | 説明 | +|----------------|-------------------|--------------------------------| +| `text` | str | Markdown形式のテキスト | +| `title` | str (optional) | タイトル | +| `metadata` | dict[str, Any] (optional) | メタデータ | +| `chunk_size` | int (optional) | 分割するサイズ | +| `chunk_overlap`| int (optional) | オーバーラップのサイズ | + +#### 動作 + +マークダウン形式のテキストを分割し、メタデータを付与します。 +`Document`オブジェクトを返却します。 +メタデータにはヘッダーが付与されています。 + +テキストの分割はヘッダー毎に行います。 +分割したテキストがチャンクサイズを超える場合はさらに分割を行います。 +2度目の分割を行ったテキストにはセクション番号がメタデータとして付与されます。 + +`title`を与えず呼び出した場合、対応するヘッダーをタイトルとしてメタデータに与えます。 +ヘッダーがない場合は分割後のテキストの最初のテキストをタイトルとします。 + +### `text_formatter()` + +#### 引数 + +| 引数名 | 型 | 説明 | +|----------------|-------------------|--------------------------------| +| `text` | str | テキスト | +| `title` | str (optional) | タイトル | +| `metadata` | dict[str, Any] (optional) | メタデータ | +| `separator` | str (optional) | 区切り文字 | +| `chunk_size` | int (optional) | 分割するサイズ | +| `chunk_overlap`| int (optional) | オーバーラップのサイズ | + +#### 動作 + +セパレータとチャンクサイズで分割を行い、メタデータを付与します。 + +`title`を与えず呼び出した場合、分割後のテキストの最初のテキストをタイトルとします。 + +## `CosmosDBManager`での動作 + +`create_document`メソッドでベクターストアにドキュメントを作成します。 + +`update_document`メソッドではメタデータ`updated_at`の更新を行います。 diff --git a/requirements.txt b/requirements.txt index b679205..f83ee79 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,43 +1,44 @@ aiohappyeyeballs==2.4.3 ; python_version >= "3.10" and python_version < "4.0" -aiohttp==3.10.10 ; python_version >= "3.10" and python_version < "4.0" -aiosignal==1.3.2 ; 
python_version >= "3.10" and python_version < "4.0" +aiohttp==3.10.11 ; python_version >= "3.10" and python_version < "4.0" +aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "4.0" annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0" anyio==4.6.0 ; python_version >= "3.10" and python_version < "4.0" async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11" attrs==24.2.0 ; python_version >= "3.10" and python_version < "4.0" azure-core==1.31.0 ; python_version >= "3.10" and python_version < "4.0" -azure-cosmos==4.7.0 ; python_version >= "3.10" and python_version < "4.0" +azure-cosmos==4.9.0 ; python_version >= "3.10" and python_version < "4.0" certifi==2024.8.30 ; python_version >= "3.10" and python_version < "4.0" -charset-normalizer==3.4.1 ; python_version >= "3.10" and python_version < "4.0" +charset-normalizer==3.4.0 ; python_version >= "3.10" and python_version < "4.0" click==8.1.7 ; python_version >= "3.10" and python_version < "4.0" colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0" and platform_system == "Windows" dataclasses-json==0.6.7 ; python_version >= "3.10" and python_version < "4.0" distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0" -duckduckgo-search==6.3.2 ; python_version >= "3.10" and python_version < "4.0" +duckduckgo-search==6.3.7 ; python_version >= "3.10" and python_version < "4.0" exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11" frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0" greenlet==3.1.1 ; python_version < "3.13" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version >= "3.10" h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0" httpcore==1.0.6 ; python_version >= "3.10" and python_version < "4.0" 
+httpx-sse==0.4.0 ; python_version >= "3.10" and python_version < "4.0" httpx==0.27.2 ; python_version >= "3.10" and python_version < "4.0" idna==3.10 ; python_version >= "3.10" and python_version < "4.0" jiter==0.6.1 ; python_version >= "3.10" and python_version < "4.0" jsonpatch==1.33 ; python_version >= "3.10" and python_version < "4.0" jsonpointer==3.0.0 ; python_version >= "3.10" and python_version < "4.0" -langchain-community==0.3.3 ; python_version >= "3.10" and python_version < "4.0" -langchain-core==0.3.12 ; python_version >= "3.10" and python_version < "4.0" -langchain-openai==0.2.3 ; python_version >= "3.10" and python_version < "4.0" -langchain-text-splitters==0.3.0 ; python_version >= "3.10" and python_version < "4.0" -langchain==0.3.4 ; python_version >= "3.10" and python_version < "4.0" -langsmith==0.1.137 ; python_version >= "3.10" and python_version < "4.0" +langchain-community==0.3.13 ; python_version >= "3.10" and python_version < "4.0" +langchain-core==0.3.28 ; python_version >= "3.10" and python_version < "4.0" +langchain-openai==0.2.14 ; python_version >= "3.10" and python_version < "4.0" +langchain-text-splitters==0.3.4 ; python_version >= "3.10" and python_version < "4.0" +langchain==0.3.13 ; python_version >= "3.10" and python_version < "4.0" +langsmith==0.1.147 ; python_version >= "3.10" and python_version < "4.0" marshmallow==3.22.0 ; python_version >= "3.10" and python_version < "4.0" multidict==6.1.0 ; python_version >= "3.10" and python_version < "4.0" mypy-extensions==1.0.0 ; python_version >= "3.10" and python_version < "4.0" numpy==1.26.4 ; python_version >= "3.10" and python_version < "4.0" -openai==1.52.0 ; python_version >= "3.10" and python_version < "4.0" -orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0" +openai==1.58.1 ; python_version >= "3.10" and python_version < "4.0" +orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy" packaging==24.1 ; 
python_version >= "3.10" and python_version < "4.0" -primp==0.6.4 ; python_version >= "3.10" and python_version < "4.0" +primp==0.8.1 ; python_version >= "3.10" and python_version < "4.0" propcache==0.2.0 ; python_version >= "3.10" and python_version < "4.0" pydantic-core==2.23.4 ; python_version >= "3.10" and python_version < "4.0" pydantic-settings==2.5.2 ; python_version >= "3.10" and python_version < "4.0" diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index e8be466..29a3111 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -81,15 +81,54 @@ def __init__( create_container=create_container, ) + def read_item( + self, + values: list[str] | None = None, + condition: dict[str, Any] | None = None, + ) -> list[dict[str, Any]]: + """条件を指定してdocumentを読み込む関数""" + logger.info("documentを読み込みます") + + query = "SELECT " + if values is not None: + query += ", ".join(["c." + value for value in values]) + " " + else: + query += "* " + query += "FROM c" + + parameters = [] + if condition is not None: + query += " WHERE" + for key, value in condition.items(): + name = key if "." 
not in key else key.replace(".", "_") + query += f" c.{key} = @{name}" + parameters.append({"name": f"@{name}", "value": value}) + query += " AND" + query = query[:-4] + + item = list(self._container.query_items( + query=query, + parameters=parameters if parameters else None, + enable_cross_partition_query=True + )) + + if not item: + logger.error(f"{id=}のdocumentが見つかりませんでした") + raise ValueError("documentが見つかりませんでした") + return item + def create_document( self, text: str, - text_type: Literal["markdown", "plain"] = "markdown" + text_type: Literal["markdown", "plain"] = "markdown", + title: str | None = None, + metadata: dict[str, Any] | None = None, ) -> list[str]: """データベースに新しいdocumentを作成する関数""" logger.info("新しいdocumentを作成します") texts, metadatas = self._division_document( - md_formatter(text) if text_type == "markdown" else text_formatter(text) + md_formatter(text, title, metadata) if text_type == "markdown" + else text_formatter(text, title=title, metadata=metadata) ) ids = self._insert_texts(texts, metadatas) return ids @@ -108,36 +147,147 @@ def _division_document( def update_document( self, id: str, - text: str, - ) -> str: + text: str | None = None, + text_type: Literal["markdown", "plain"] | None = None, + title: str | None = None, + metadata: dict[str, Any] | None = None, + del_metadata: list[str] | None = None, + is_patch: bool = False, + ) -> list[str]: """データベースのdocumentを更新する関数""" logger.info("documentを更新します") + result = [id] + item = self.read_item(values=["text", "metadata"], condition={"id": id})[0] - # metadataのupdated_atを更新 - query = "SELECT c.metadata FROM c WHERE c.id = @id" - parameters = [{"name": "@id", "value": id}] + if title is not None: + self._title_updater(id, title, item["metadata"].get("group_id", None)) - try: - item = self._container.query_items( - query=query, - parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト - enable_cross_partition_query=True - ).next() - except StopIteration: - 
logger.error(f"{id=}のdocumentが見つかりませんでした") - return "documentが見つかりませんでした" + if metadata is not None: + self._metadata_updater( + id, metadata, del_metadata, None if is_patch else item["metadata"].get("group_id", None) + ) - metadata = item["metadata"] - metadata["updated_at"] = datetime.now().strftime("%Y-%m-%d") + if text is not None: + if text_type is None: + raise TypeError("textを更新する際はtext_typeを指定してください。") + result = self._update_text( + id, text, text_type, item["metadata"].get("group_id", None) + ) - to_upsert = { - "id": id, - "text": text, - self._embedding_key: self._embedding.embed_documents([text])[0], - "metadata": metadata, - } - self._container.upsert_item(body=to_upsert) - return id + if any([title, metadata, del_metadata]): + date = datetime.now().strftime("%Y-%m-%d") + patch = [{ + "op": "replace", + "path": "/metadata/updated_at", + "value": date + }] + for _id in result: + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) + + return result + + def _title_updater(self, id: str, title: str, group_id: str | None = None) -> None: + """titleを更新する関数""" + if group_id is None: + ids = [id] + else: + data = self.read_item(values=["id"], condition={"metadata.group_id": group_id}) + ids = [cast(str, d["id"]) for d in data] + + patch = [{ + "op": "replace", + "path": "/metadata/title", + "value": title + }] + for _id in ids: + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) + + def _metadata_updater( + self, + id: str, + metadata: dict[str, Any], + del_metadata: list[str] | None = None, + group_id: str | None = None, + ) -> None: + """metadataを更新する関数""" + if group_id is None: + data = self.read_item(values=["metadata"], condition={"id": id})[0] + prev_metadatas = [cast(dict[str, Any], data["metadata"])] + ids = [id] + else: + datas = self.read_item(values=["id", "metadata"], condition={"metadata.group_id": group_id}) + prev_metadatas = [cast(dict[str, Any], d["metadata"]) for d in 
datas] + ids = [cast(str, d["id"]) for d in datas] + + for _id, pm in zip(ids, prev_metadatas, strict=True): + patch = self._create_patch(pm, metadata, [] if del_metadata is None else del_metadata) + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) + + def _create_patch( + self, + prev_metadata: dict[str, Any], + new_metadata: dict[str, Any], + del_metadata: list[str], + ) -> list[dict[str, Any]]: + """metadataの差分を取得しパッチ操作を定義する関数""" + patch = [] + for dm in del_metadata: + if dm in new_metadata: + raise ValueError(f"metadata:{dm}は新しいmetadataに含まれています") + if dm in prev_metadata: + patch.append({ + "op": "remove", + "path": f"/metadata/{dm}" + }) + + for key, value in new_metadata.items(): + if key not in prev_metadata: + patch.append({ + "op": "add", + "path": f"/metadata/{key}", + "value": value + }) + elif prev_metadata[key] != value: + patch.append({ + "op": "replace", + "path": f"/metadata/{key}", + "value": value + }) + return patch + + def _update_text( + self, + id: str, + text: str, + text_type: Literal["markdown", "plain"], + group_id: str | None = None, + ) -> list[str]: + """textを更新する関数""" + created_at = self.read_item(values=["metadata.created_at"], condition={"id": id})[0]["created_at"] + if group_id is None: + self.delete_document_by_id(id) + else: + data = self.read_item(values=["id"], condition={"metadata.group_id": group_id}) + for d in data: + self.delete_document_by_id(d["id"]) + + ids = self.create_document(text, text_type) + patch = [{ + "op": "replace", + "path": "/metadata/created_at", + "value": created_at + }] + for _id in ids: + self._container.patch_item( + item=_id, partition_key=_id, patch_operations=patch + ) + return ids def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" @@ -158,16 +308,12 @@ def read_all_documents(self) -> list[Document]: def get_source_by_id(self, id: str) -> str: """idを指定してsourceを取得する関数""" logger.info(f"{id=}のsourceを取得します") - query = "SELECT c.text FROM c 
def get_source_by_id(self, id: str) -> str:
    """Return the stored text of the document with the given id.

    Args:
        id: Id of the document to look up.

    Returns:
        The document's ``text`` value, or a fixed "not found" message when
        ``read_item`` raises ``ValueError``.
    """
    logger.info(f"{id=}のsourceを取得します")
    try:
        rows = self.read_item(values=["text"], condition={"id": id})
    except ValueError:
        # read_item signals "no match" with ValueError; this method reports
        # it in-band as a message string instead of raising.
        return "documentが見つかりませんでした"
    else:
        first_row = rows[0]
        return cast(str, first_row["text"])
def _add_metadata(
    document: Document,
    metadata: dict[str, Any]
) -> Document:
    """Merge ``metadata`` into a document's metadata in place.

    Existing keys are overwritten by the values in ``metadata``.

    Args:
        document: Document whose ``metadata`` mapping is updated.
        metadata: Entries to write.

    Returns:
        The same ``document`` object that was passed in.
    """
    document.metadata.update(metadata)
    return document
def md_formatter(
    text: str,
    title: str | None = None,
    metadata: dict[str, Any] | None = None,
    chunk_size: int | None = None,
    chunk_overlap: int | None = None,
) -> list[Document]:
    """Split Markdown text into documents and attach metadata.

    The text is first split with ``markdown_splitter``. Any resulting
    section longer than ``chunk_size`` is split again with
    ``recursive_document_splitter``. When more than one document results,
    ``add_metadata`` is asked to number the sections.

    Args:
        text: Markdown text.
        title: Title to store. When ``None``, the header found by
            ``_find_header`` in the first section is used, falling back to
            the first document's content.
        metadata: Extra metadata entries passed to ``add_metadata``.
        chunk_size: Maximum section length before re-splitting. ``None``
            uses the module constant ``CHUNK_SIZE``.
        chunk_overlap: Overlap used when re-splitting. ``None`` uses the
            module constant ``CHUNK_OVERLAP``.

    Returns:
        The formatted documents; an empty list when the text yields no
        sections.
    """
    # Resolve defaults at call time rather than in the signature.
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    sections = markdown_splitter(text)
    if not sections:
        # Nothing to format; avoid indexing into an empty list below.
        return []

    # Look up the header on the first section before it may be re-split.
    resolved_title = _find_header(sections[0]) if title is None else title

    docs: list[Document] = []
    for section in sections:
        if len(section.page_content) > size:
            docs.extend(
                recursive_document_splitter(
                    [section],
                    chunk_size=size,
                    chunk_overlap=overlap,
                )
            )
        else:
            docs.append(section)
    if not docs:
        return []

    return add_metadata(
        docs,
        title=resolved_title if resolved_title is not None else docs[0].page_content,
        with_section_number=len(docs) > 1,
        **(metadata if metadata is not None else {}),
    )
@@ -184,9 +187,9 @@ def text_formatter( ) return add_metadata( docs, - title=docs[0].page_content, - with_section_number=True, - **kwargs + title=docs[0].page_content if title is None else title, + with_section_number=True if len(docs) > 1 else False, + **metadata if metadata is not None else {}, ) if __name__ == "__main__": @@ -219,8 +222,8 @@ def print_docs(docs: list[Document]) -> None: print() - docs = md_formatter(md_text) + docs = md_formatter(md_text, title="hogehogehoge", metadata={"fuga": "piyopiyo"}) print_docs(docs) - docs = text_formatter(md_text) + docs = text_formatter(md_text, title="hogehogehoge", metadata={"fuga": "piyopiyo"}) print_docs(docs)