From d38e20dc4ead059d77998ece7c26da7d12c075d9 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Tue, 10 Dec 2024 13:56:11 +0000 Subject: [PATCH 01/14] =?UTF-8?q?=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=82=92=E3=83=95=E3=82=A9=E3=83=BC=E3=83=9E=E3=83=83=E3=83=88?= =?UTF-8?q?=E3=81=99=E3=82=8B=E9=96=A2=E6=95=B0=E3=82=92=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../template/document_formatter.py | 231 ++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 src/sc_system_ai/template/document_formatter.py diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py new file mode 100644 index 0000000..121ec18 --- /dev/null +++ b/src/sc_system_ai/template/document_formatter.py @@ -0,0 +1,231 @@ +from datetime import datetime +from typing import Any + +from langchain_core.documents import Document +from langchain_text_splitters import ( + CharacterTextSplitter, + MarkdownHeaderTextSplitter, + RecursiveCharacterTextSplitter, +) + +CHUNK_SIZE = 10 +CHUNK_OVERLAP = 5 + +def _max_level(text: str) -> int: + """Markdownのヘッダーの最大レベルを返す関数""" + level = 0 + for line in text.split("\n"): + if line.startswith("#"): + counter = 0 + while line[counter] == "#": + counter += 1 + level = max(level, counter) + return level + +def markdown_splitter( + text: str, +) -> list[Document]: + """Markdownをヘッダーで分割する関数""" + headers_to_split_on = [ + ("#" * (i + 1), f"Header {i + 1}") + for i in range(_max_level(text)) + ] + splitter = MarkdownHeaderTextSplitter( + headers_to_split_on, + return_each_line=True, + ) + return splitter.split_text(text) + +def _find_header(document: Document) -> str | None: + """ドキュメントのヘッダー名を返す関数""" + i = 0 + while True: + if document.metadata.get(f"Header {i + 1}") is None: + break + i += 1 + return document.metadata[f"Header {i}"] if i != 0 else None + +def recursive_document_splitter( + documents: list[Document], + chunk_size: int, + chunk_overlap: int, +) -> list[Document]: + """再帰的に分割する関数""" + splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + return splitter.split_documents(documents) + +def document_splitter( + documents: Document | list[Document], + separator: str = "\n\n", + chunk_size: int = CHUNK_SIZE, + chunk_overlap: int = CHUNK_OVERLAP, + ) -> list[Document]: + """Documentを分割する関数""" + _documents = documents if isinstance(documents, list) else [documents] + splitter = CharacterTextSplitter( + separator=separator, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + return splitter.split_documents(_documents) + +def character_splitter( + text: str, + separator: str = "\n\n", + chunk_size: int = CHUNK_SIZE, + chunk_overlap: int = CHUNK_OVERLAP, + ) -> list[Document]: + """文字列を分割する関数""" + character_splitter = CharacterTextSplitter( + separator=separator, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + splitted_text = character_splitter.split_text(text) + return character_splitter.create_documents(splitted_text) + +def add_metadata( + documents: list[Document], + title: str, + source: str | None = None, + with_timestamp: bool = True, + with_section_number: bool = False, + **kwargs: Any +) -> list[Document]: + """メタデータを追加する関数 + Args: + documents (list[Document]): ドキュメントのリスト + title (str): タイトル + source (str, optional): ソース. + with_timestamp (bool, optional): タイムスタンプの有無. Defaults to True. + with_section_number (bool, optional): セクション番号の有無. Defaults to False. + """ + i = 1 + date = datetime.now().strftime("%Y-%m-%d") + for doc in documents: + doc.metadata["title"] = title + + if source is not None and \ + doc.metadata.get("source") is None: + doc.metadata["source"] = source + + if with_timestamp and \ + doc.metadata.get("created_at") is None: + doc.metadata["created_at"] = date + doc.metadata["updated_at"] = date + + if with_section_number and \ + doc.metadata.get("section_number") is None: + doc.metadata["section_number"] = i + i += 1 + + for key, value in kwargs.items(): + doc.metadata[key] = value + + return documents + +def md_formatter( + text: str, + chunk_size: int = CHUNK_SIZE, + chunk_overlap: int = CHUNK_OVERLAP, + **kwargs: Any +) -> list[Document]: + """Markdown形式のテキストをフォーマットする関数 + Args: + text (str): Markdown形式のテキスト + chunk_size (int, optional): 分割するサイズ. + chunk_overlap (int, optional): オーバーラップのサイズ. + + chunk_sizeを超えるテキストは再分割し、メタデータにセクション番号を付与します. + """ + formatted_docs: list[Document] = [] + for doc in markdown_splitter(text): + t = _find_header(doc) + if len(doc.page_content) > chunk_size: + rdocs = recursive_document_splitter( + [doc], + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + formatted_docs += add_metadata( + rdocs, + title=t if t is not None else rdocs[0].page_content, + with_section_number=True, + **kwargs + ) + else: + formatted_docs += add_metadata( + [doc], + title=t if t is not None else doc.page_content, + **kwargs + ) + + return formatted_docs + +def text_formatter( + text: str, + separator: str = "\n\n", + chunk_size: int = CHUNK_SIZE, + chunk_overlap: int = CHUNK_OVERLAP, + **kwargs: Any +) -> list[Document]: + """テキストをフォーマットする関数 + Args: + text (str): テキスト + separator (str, optional): 区切り文字. + chunk_size (int, optional): 分割するサイズ. + chunk_overlap (int, optional): オーバーラップのサイズ. + + セパレータとチャンクサイズでテキストを分割し、メタデータにセクション番号を付与します. + """ + docs = character_splitter( + text, + separator=separator, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + return add_metadata( + docs, + title=docs[0].page_content, + with_section_number=True, + **kwargs + ) + +if __name__ == "__main__": + md_text = """ +# Sample Markdown +This is a sample markdown text. + + +## piyo +There is section 2. +### fuga +but, there is section 3. + + +## Are you ...? +Are you hogehoge? + + +### negative answer +No, I'm fugafuga. + + +### positive answer +Yes, I'm hogehoge. +""" + def print_docs(docs: list[Document]) -> None: + for doc in docs: + print(doc.page_content) + print(doc.metadata) + print() + + + docs = md_formatter(md_text) + print_docs(docs) + + docs = text_formatter(md_text) + print_docs(docs) From 313a6c437deb3b7c38a2be6586969d3ddc46e559 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Wed, 18 Dec 2024 13:13:49 +0000 Subject: [PATCH 02/14] =?UTF-8?q?id=E3=82=92metadata=E3=81=AB=E5=90=AB?= =?UTF-8?q?=E3=82=81=E3=82=8B=E5=87=A6=E7=90=86=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 82 +++++++++++++++---- .../template/document_formatter.py | 4 +- 2 files changed, 70 insertions(+), 16 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 4911287..ee22cd3 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -79,21 +79,74 @@ def __init__( create_container=create_container, ) - def read_all_documents(self) -> list[Document]: - """全てのdocumentsを読み込む関数""" + def _similarity_search_with_score( + self, + embeddings: list[float], + k: int = 1, + pre_filter: dict | None = None, + with_embedding: bool = False + ) -> list[tuple[Document, float]]: + query = "SELECT " + + # If limit_offset_clause is not specified, add TOP clause + if pre_filter is None or pre_filter.get("limit_offset_clause") is None: + query += "TOP @limit " + + query += ( + "c.id, c[@embeddingKey], c.text, c.metadata, " + "VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c" + ) + + # Add where_clause if specified + if pre_filter is not None and pre_filter.get("where_clause") is not None: + query += " {}".format(pre_filter["where_clause"]) + + query += " ORDER BY VectorDistance(c[@embeddingKey], @embeddings)" + + # Add limit_offset_clause if specified + if pre_filter is not None and pre_filter.get("limit_offset_clause") is not None: + query += " {}".format(pre_filter["limit_offset_clause"]) + parameters = [ + {"name": "@limit", "value": k}, + {"name": "@embeddingKey", "value": self._embedding_key}, + {"name": "@embeddings", "value": embeddings}, + ] + + docs_and_scores = [] + + items = list( + self._container.query_items( + query=query, parameters=parameters, enable_cross_partition_query=True + ) + ) + for item in items: + text = item["text"] + metadata = item["metadata"] + + # idをmetadataに追加 + metadata["id"] = item["id"] + + score = item["SimilarityScore"] + if with_embedding: + metadata[self._embedding_key] = item[self._embedding_key] + docs_and_scores.append( + (Document(page_content=text, metadata=metadata), score) + ) + return docs_and_scores + + def read_all_documents(self) -> list[dict[str, Any]]: + """全てのdocumentsとIDを読み込む関数""" logger.info("全てのdocumentsを読み込みます") query = "SELECT c.id, c.text FROM c" items = list(self._container.query_items( - query=query, enable_cross_partition_query=True)) - docs = [] - i = 1 + query=query, enable_cross_partition_query=True) + ) + docs: list[dict] = [] for item in items: + id = item["id"] text = item["text"] - item["number"] = i - i += 1 - docs.append( - Document(page_content=text, metadata=item)) - logger.debug(f"{docs[0].page_content=}, \n\nlength: {len(docs)}") + docs.append({"id": id, "texts": text}) + logger.debug(f"{docs[0]['id']}, \n\nlength: {len(docs)}") return docs def get_source_by_id(self, id: str) -> str: @@ -118,9 +171,10 @@ def get_source_by_id(self, id: str) -> str: # results = cosmos_manager.read_all_documents() results = cosmos_manager.similarity_search(query, k=1) print(results[0]) + print(results[0].metadata["id"]) # idで指定したドキュメントのsourceを取得 - ids = results[0].metadata["id"] - print(f"{ids=}") - doc = cosmos_manager.get_source_by_id(ids) - print(doc) + # ids = results[0].metadata["id"] + # print(f"{ids=}") + # doc = cosmos_manager.get_source_by_id(ids) + # print(doc) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index 121ec18..a714a4c 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -8,8 +8,8 @@ RecursiveCharacterTextSplitter, ) -CHUNK_SIZE = 10 -CHUNK_OVERLAP = 5 +CHUNK_SIZE = 1000 +CHUNK_OVERLAP = 50 def _max_level(text: str) -> int: """Markdownのヘッダーの最大レベルを返す関数""" From 263cb0eb5535ccf6bdbe4c0511de4acb8ceeebd6 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Wed, 18 Dec 2024 13:21:38 +0000 Subject: [PATCH 03/14] =?UTF-8?q?read=5Fall=5Fdocuments=E3=81=A7Document?= =?UTF-8?q?=E5=9E=8B=E3=82=92=E8=BF=94=E3=81=99=E3=82=88=E3=81=86=E3=81=AB?= =?UTF-8?q?=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index ee22cd3..c12a98d 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -134,19 +134,20 @@ def _similarity_search_with_score( ) return docs_and_scores - def read_all_documents(self) -> list[dict[str, Any]]: + def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" logger.info("全てのdocumentsを読み込みます") query = "SELECT c.id, c.text FROM c" items = list(self._container.query_items( query=query, enable_cross_partition_query=True) ) - docs: list[dict] = [] + docs: list[Document] = [] for item in items: - id = item["id"] text = item["text"] - docs.append({"id": id, "texts": text}) - logger.debug(f"{docs[0]['id']}, \n\nlength: {len(docs)}") + _id = item["id"] + docs.append( + Document(page_content=text, metadata={"id": _id}) + ) return docs def get_source_by_id(self, id: str) -> str: From 2fdecb47e250e4f5305d3d394cd00bf173d2910d Mon Sep 17 00:00:00 2001 From: haruki26 Date: Wed, 18 Dec 2024 13:54:37 +0000 Subject: [PATCH 04/14] =?UTF-8?q?get=5Fsource=5Fby=5Fid=E3=81=AE=E3=83=AA?= =?UTF-8?q?=E3=83=95=E3=82=A1=E3=82=AF=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index c12a98d..ed46f70 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -153,9 +153,12 @@ def read_all_documents(self) -> list[Document]: def get_source_by_id(self, id: str) -> str: """idを指定してsourceを取得する関数""" logger.info(f"{id=}のsourceを取得します") - item = self._container.read_item(item=id, partition_key=id) + query = "SELECT c.text FROM c WHERE c.id = " + f"'{id}'" + item = self._container.query_items( + query=query, enable_cross_partition_query=True + ).next() - result = item.get("source") + result = item["text"] if type(result) is str: return result else: @@ -175,7 +178,7 @@ def get_source_by_id(self, id: str) -> str: print(results[0].metadata["id"]) # idで指定したドキュメントのsourceを取得 - # ids = results[0].metadata["id"] - # print(f"{ids=}") - # doc = cosmos_manager.get_source_by_id(ids) - # print(doc) + ids = results[0].metadata["id"] + print(f"{ids=}") + doc = cosmos_manager.get_source_by_id(ids) + print(doc) From 38ad9808ef7c053fb82886dc7c15096398ded933 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Fri, 20 Dec 2024 14:35:13 +0000 Subject: [PATCH 05/14] =?UTF-8?q?=E3=83=87=E3=83=BC=E3=82=BF=E3=83=99?= =?UTF-8?q?=E3=83=BC=E3=82=B9=E3=81=AB=E3=83=89=E3=82=AD=E3=83=A5=E3=83=A1?= =?UTF-8?q?=E3=83=B3=E3=83=88=E3=82=92=E4=BD=9C=E6=88=90=E3=81=99=E3=82=8B?= =?UTF-8?q?=E3=83=A1=E3=82=BD=E3=83=83=E3=83=89=E3=81=AE=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 52 ++++++++++++++----- .../template/document_formatter.py | 2 +- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index ed46f70..87ee123 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any +from typing import Any, Literal from azure.cosmos import CosmosClient, PartitionKey from dotenv import load_dotenv @@ -11,6 +11,7 @@ from langchain_core.embeddings import Embeddings from sc_system_ai.template.ai_settings import embeddings +from sc_system_ai.template.document_formatter import md_formatter, text_formatter load_dotenv() @@ -134,6 +135,32 @@ def _similarity_search_with_score( ) return docs_and_scores + def create_document( + self, + text: str, + text_type: Literal["markdown", "plain"] = "markdown" + ) -> list[str]: + """データベースに新しいdocumentを作成する関数""" + logger.info("新しいdocumentを作成します") + texts, metadatas = self._division_document( + md_formatter(text) if text_type == "markdown" else text_formatter(text) + ) + ids = self._insert_texts(texts, metadatas) + return ids + + def _division_document( + self, + documents: list[Document] + ) -> tuple[list[str], list[dict[str, Any]]]: + """documentを分割する関数""" + logger.info("documentを分割します") + docs = [] + metadata = [] + for doc in documents: + docs.append(doc.page_content) + metadata.append(doc.metadata) + return docs, metadata + def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" logger.info("全てのdocumentsを読み込みます") @@ -165,20 +192,19 @@ def get_source_by_id(self, id: str) -> str: return "sourceが見つかりませんでした" - if __name__ == "__main__": from sc_system_ai.logging_config import setup_logging setup_logging() cosmos_manager = CosmosDBManager() - query = "京都テック" - # results = cosmos_manager.read_all_documents() - results = cosmos_manager.similarity_search(query, k=1) - print(results[0]) - print(results[0].metadata["id"]) - - # idで指定したドキュメントのsourceを取得 - ids = results[0].metadata["id"] - print(f"{ids=}") - doc = cosmos_manager.get_source_by_id(ids) - print(doc) + # query = "京都テック" + # # results = cosmos_manager.read_all_documents() + # results = cosmos_manager.similarity_search(query, k=1) + # print(results[0]) + # print(results[0].metadata["id"]) + + # # idで指定したドキュメントのsourceを取得 + # ids = results[0].metadata["id"] + # print(f"{ids=}") + # doc = cosmos_manager.get_source_by_id(ids) + # print(doc) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index a714a4c..204daa6 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -9,7 +9,7 @@ ) CHUNK_SIZE = 1000 -CHUNK_OVERLAP = 50 +CHUNK_OVERLAP = 200 def _max_level(text: str) -> int: """Markdownのヘッダーの最大レベルを返す関数""" From 113a799344c85fbe6ac2f969325842e006e38eae Mon Sep 17 00:00:00 2001 From: haruki26 Date: Fri, 20 Dec 2024 15:11:08 +0000 Subject: [PATCH 06/14] =?UTF-8?q?document=E3=82=92=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E3=81=99=E3=82=8B=E3=83=A1=E3=82=BD=E3=83=83=E3=83=89=E3=82=92?= =?UTF-8?q?=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 87ee123..b51b361 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -1,5 +1,6 @@ import logging import os +from datetime import datetime from typing import Any, Literal from azure.cosmos import CosmosClient, PartitionKey @@ -161,6 +162,31 @@ def _division_document( metadata.append(doc.metadata) return docs, metadata + def update_document( + self, + id: str, + text: str, + ) -> str: + """データベースのdocumentを更新する関数""" + logger.info("documentを更新します") + + # metadataのupdated_atを更新 + query = "SELECT c.metadata FROM c WHERE c.id = " + f"'{id}'" + item = self._container.query_items( + query=query, enable_cross_partition_query=True + ).next() + metadata = item["metadata"] + metadata["updated_at"] = datetime.now().strftime("%Y-%m-%d") + + to_upsert = { + "id": id, + "text": text, + self._embedding_key: self._embedding.embed_documents([text])[0], + "metadata": metadata, + } + self._container.upsert_item(body=to_upsert) + return id + def read_all_documents(self) -> list[Document]: """全てのdocumentsとIDを読み込む関数""" logger.info("全てのdocumentsを読み込みます") @@ -208,3 +234,9 @@ def get_source_by_id(self, id: str) -> str: # print(f"{ids=}") # doc = cosmos_manager.get_source_by_id(ids) # print(doc) + + # documentを更新 + text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 +エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" + _id = "c55bb571-498a-4db9-9da0-e9e35d46906b" + print(cosmos_manager.update_document(_id, text)) From ac9d7653f8562bc189d5fcb2923c36d6fe4ff2ee Mon Sep 17 00:00:00 2001 From: snow7y Date: Mon, 23 Dec 2024 14:11:23 +0000 Subject: [PATCH 07/14] =?UTF-8?q?azure.core=E3=81=AElog=20level=20?= =?UTF-8?q?=E3=82=92warning=E4=BB=A5=E4=B8=8A=E3=81=AB=E8=A8=AD=E5=AE=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/logging_config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sc_system_ai/logging_config.py b/src/sc_system_ai/logging_config.py index 082b2f2..52a8595 100644 --- a/src/sc_system_ai/logging_config.py +++ b/src/sc_system_ai/logging_config.py @@ -39,5 +39,9 @@ def setup_logging() -> None: package_logger = logging.getLogger("sc_system_ai") package_logger.setLevel(logging.DEBUG) + # azure.coreのログメッセージをWARNING以上で出力する + azure_logger = logging.getLogger("azure.core") + azure_logger.setLevel(logging.WARNING) + # langchainのログメッセージを出力する set_verbose(True) From af6652c227c1f494959eb47e310197d809fff877 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Tue, 24 Dec 2024 05:54:04 +0000 Subject: [PATCH 08/14] =?UTF-8?q?=5Fmax=5Flevel=E9=96=A2=E6=95=B0=E3=81=AE?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/document_formatter.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/sc_system_ai/template/document_formatter.py b/src/sc_system_ai/template/document_formatter.py index 204daa6..2b27712 100644 --- a/src/sc_system_ai/template/document_formatter.py +++ b/src/sc_system_ai/template/document_formatter.py @@ -1,3 +1,4 @@ +import re from datetime import datetime from typing import Any @@ -13,14 +14,8 @@ def _max_level(text: str) -> int: """Markdownのヘッダーの最大レベルを返す関数""" - level = 0 - for line in text.split("\n"): - if line.startswith("#"): - counter = 0 - while line[counter] == "#": - counter += 1 - level = max(level, counter) - return level + headers = re.findall(r"^#+", text, re.MULTILINE) + return max([len(h) for h in headers]) if headers else 0 def markdown_splitter( text: str, From 3e4227c0a11673a1766cd7551b561f56a7d49a99 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Tue, 24 Dec 2024 06:23:06 +0000 Subject: [PATCH 09/14] =?UTF-8?q?update=5Fdocument=E3=81=AE=E3=82=AF?= =?UTF-8?q?=E3=82=A8=E3=83=AA=E3=81=A7=E3=83=91=E3=83=A9=E3=83=A1=E3=83=BC?= =?UTF-8?q?=E3=82=BF=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=99=E3=82=8B=E3=82=88?= =?UTF-8?q?=E3=81=86=E3=81=AB=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index b51b361..60fe7fe 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -1,7 +1,7 @@ import logging import os from datetime import datetime -from typing import Any, Literal +from typing import Any, Literal, cast from azure.cosmos import CosmosClient, PartitionKey from dotenv import load_dotenv @@ -171,9 +171,12 @@ def update_document( logger.info("documentを更新します") # metadataのupdated_atを更新 - query = "SELECT c.metadata FROM c WHERE c.id = " + f"'{id}'" + query = "SELECT c.metadata FROM c WHERE c.id = @id" + parameters = [{"name": "@id", "value": id}] item = self._container.query_items( - query=query, enable_cross_partition_query=True + query=query, + parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト + enable_cross_partition_query=True ).next() metadata = item["metadata"] metadata["updated_at"] = datetime.now().strftime("%Y-%m-%d") From 81d49228ff9df5d970782e4fa7099bcf994d3f35 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Tue, 24 Dec 2024 06:31:10 +0000 Subject: [PATCH 10/14] =?UTF-8?q?document=E3=81=8C=E8=A6=8B=E3=81=A4?= =?UTF-8?q?=E3=81=8B=E3=82=89=E3=81=AA=E3=81=84=E6=99=82=E3=81=AE=E5=87=A6?= =?UTF-8?q?=E7=90=86=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 60fe7fe..7b03c94 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -173,11 +173,17 @@ def update_document( # metadataのupdated_atを更新 query = "SELECT c.metadata FROM c WHERE c.id = @id" parameters = [{"name": "@id", "value": id}] - item = self._container.query_items( - query=query, - parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト - enable_cross_partition_query=True - ).next() + + try: + item = self._container.query_items( + query=query, + parameters=cast(list[dict[str, Any]], parameters), # mypyがエラー吐くのでキャスト + enable_cross_partition_query=True + ).next() + except StopIteration: + logger.error(f"{id=}のdocumentが見つかりませんでした") + return "documentが見つかりませんでした" + metadata = item["metadata"] metadata["updated_at"] = datetime.now().strftime("%Y-%m-%d") From 0f5c59023e3265e961e926d37c3781ffce0c73b5 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sat, 28 Dec 2024 08:20:02 +0000 Subject: [PATCH 11/14] =?UTF-8?q?enumerate=E3=82=92=E4=BD=BF=E7=94=A8?= =?UTF-8?q?=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/agents/tools/search_school_data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sc_system_ai/agents/tools/search_school_data.py b/src/sc_system_ai/agents/tools/search_school_data.py index 1e98385..f02d177 100644 --- a/src/sc_system_ai/agents/tools/search_school_data.py +++ b/src/sc_system_ai/agents/tools/search_school_data.py @@ -54,13 +54,12 @@ def _run( """use the tool.""" logger.info(f"Search School Data Toolが次の値で呼び出されました: {search_word}") result = search_school_database_cosmos(search_word) - i = 1 search_result = [] - for doc in result: + for i, doc in enumerate(result): if hasattr(doc, 'page_content'): search_result.append( - f'・検索結果{i}は以下の通りです。\n{doc.page_content}\n参考URL: "{doc.metadata["source"]}"\n\n') - i += 1 + f'・検索結果{i + 1}は以下の通りです。\n{doc.page_content}\n参考URL: "{doc.metadata["id"]}"\n\n' + ) return search_result From 5e7d95cdfe0dd27cf9d8fbea2a5e8083af2d3cd1 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Sun, 5 Jan 2025 23:23:38 +0000 Subject: [PATCH 12/14] =?UTF-8?q?=E6=A4=9C=E7=B4=A2=E7=94=A8=E3=81=AE?= =?UTF-8?q?=E9=96=A2=E6=95=B0=E3=82=92=E3=82=B7=E3=83=B3=E3=83=97=E3=83=AB?= =?UTF-8?q?=E3=81=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/agents/tools/search_school_data.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/sc_system_ai/agents/tools/search_school_data.py b/src/sc_system_ai/agents/tools/search_school_data.py index f02d177..fdea2e1 100644 --- a/src/sc_system_ai/agents/tools/search_school_data.py +++ b/src/sc_system_ai/agents/tools/search_school_data.py @@ -31,10 +31,6 @@ def search_school_database_cosmos(search_word: str, top_k: int = 2) -> list[Docu """学校に関する情報を検索する関数(現在のデータベースを参照)""" cosmos_manager = CosmosDBManager() docs = cosmos_manager.similarity_search(search_word, k=top_k) - - for doc in docs: - source = cosmos_manager.get_source_by_id(doc.metadata["id"]) - doc.metadata["source"] = source return docs From a27e69d7ab769f3549b689cf9c97f2405a240c0f Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 6 Jan 2025 00:14:38 +0000 Subject: [PATCH 13/14] =?UTF-8?q?=E3=83=98=E3=83=AB=E3=83=91=E3=83=BC?= =?UTF-8?q?=E9=96=A2=E6=95=B0=E3=81=AE=E3=83=AD=E3=82=AC=E3=83=BC=E3=82=92?= =?UTF-8?q?=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sc_system_ai/template/azure_cosmos.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 7b03c94..6910832 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -154,9 +154,7 @@ def _division_document( documents: list[Document] ) -> tuple[list[str], list[dict[str, Any]]]: """documentを分割する関数""" - logger.info("documentを分割します") - docs = [] - metadata = [] + docs, metadata = [], [] for doc in documents: docs.append(doc.page_content) metadata.append(doc.metadata) From 1c09be0f0a1787f32e11b32f4783d3225bb5a5c3 Mon Sep 17 00:00:00 2001 From: haruki26 Date: Mon, 6 Jan 2025 00:52:25 +0000 Subject: [PATCH 14/14] =?UTF-8?q?id=E5=8F=96=E5=BE=97=E3=81=AE=E5=87=A6?= =?UTF-8?q?=E7=90=86=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit langchain側でデフォルトで取得可能になったため --- src/sc_system_ai/template/azure_cosmos.py | 75 +++-------------------- 1 file changed, 10 insertions(+), 65 deletions(-) diff --git a/src/sc_system_ai/template/azure_cosmos.py b/src/sc_system_ai/template/azure_cosmos.py index 6910832..e8be466 100644 --- a/src/sc_system_ai/template/azure_cosmos.py +++ b/src/sc_system_ai/template/azure_cosmos.py @@ -81,61 +81,6 @@ def __init__( create_container=create_container, ) - def _similarity_search_with_score( - self, - embeddings: list[float], - k: int = 1, - pre_filter: dict | None = None, - with_embedding: bool = False - ) -> list[tuple[Document, float]]: - query = "SELECT " - - # If limit_offset_clause is not specified, add TOP clause - if pre_filter is None or pre_filter.get("limit_offset_clause") is None: - query += "TOP @limit " - - query += ( - "c.id, c[@embeddingKey], c.text, c.metadata, " - "VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c" - ) - - # Add where_clause if specified - if pre_filter is not None and pre_filter.get("where_clause") is not None: - query += " {}".format(pre_filter["where_clause"]) - - query += " ORDER BY VectorDistance(c[@embeddingKey], @embeddings)" - - # Add limit_offset_clause if specified - if pre_filter is not None and pre_filter.get("limit_offset_clause") is not None: - query += " {}".format(pre_filter["limit_offset_clause"]) - parameters = [ - {"name": "@limit", "value": k}, - {"name": "@embeddingKey", "value": self._embedding_key}, - {"name": "@embeddings", "value": embeddings}, - ] - - docs_and_scores = [] - - items = list( - self._container.query_items( - query=query, parameters=parameters, enable_cross_partition_query=True - ) - ) - for item in items: - text = item["text"] - metadata = item["metadata"] - - # idをmetadataに追加 - metadata["id"] = item["id"] - - score = item["SimilarityScore"] - if with_embedding: - metadata[self._embedding_key] = item[self._embedding_key] - docs_and_scores.append( - (Document(page_content=text, metadata=metadata), score) - ) - return docs_and_scores - def create_document( self, text: str, @@ -230,11 +175,11 @@ def get_source_by_id(self, id: str) -> str: setup_logging() cosmos_manager = CosmosDBManager() - # query = "京都テック" - # # results = cosmos_manager.read_all_documents() - # results = cosmos_manager.similarity_search(query, k=1) - # print(results[0]) - # print(results[0].metadata["id"]) + query = "京都テック" + # results = cosmos_manager.read_all_documents() + results = cosmos_manager.similarity_search(query, k=1) + print(results[0]) + print(results[0].metadata["id"]) # # idで指定したドキュメントのsourceを取得 # ids = results[0].metadata["id"] @@ -242,8 +187,8 @@ def get_source_by_id(self, id: str) -> str: # doc = cosmos_manager.get_source_by_id(ids) # print(doc) - # documentを更新 - text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 -エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" - _id = "c55bb571-498a-4db9-9da0-e9e35d46906b" - print(cosmos_manager.update_document(_id, text)) +# # documentを更新 +# text = """ストリーミングレスポンスに対応するためにジェネレータとして定義されています。 +# エージェントが回答の生成を終えてからレスポンスを受け取ることも可能です。""" +# _id = "c55bb571-498a-4db9-9da0-e9e35d46906b" +# print(cosmos_manager.update_document(_id, text))