# Separators handed to the splitter, in the order they are listed here.
# Every non-ASCII entry is spelled as a \u escape: a Unicode-normalising
# tool (NFKC) folds the full-width forms into their ASCII twins, which would
# silently turn the CJK entries into duplicates of the English ones.
CHUNK_SEPARATORS = (
    # Paragraph separators
    "\n\n",
    "\r\n\r\n",
    # Line breaks
    "\n",
    "\r\n",
    # Sentence-ending punctuation
    "\u3002",  # IDEOGRAPHIC FULL STOP (Chinese period)
    "\uff0e",  # FULLWIDTH FULL STOP (full-width dot)
    ".",  # English period
    "\uff01",  # FULLWIDTH EXCLAMATION MARK (Chinese exclamation mark)
    "!",  # English exclamation mark
    "\uff1f",  # FULLWIDTH QUESTION MARK (Chinese question mark)
    "?",  # English question mark
    # Whitespace characters
    " ",  # Space
    "\t",  # Tab
    "\u3000",  # IDEOGRAPHIC SPACE (full-width space)
    # Special characters
    "\u200b",  # ZERO WIDTH SPACE (used in some Asian languages)
    # Final fallback
    "",
)


def chunking_by_specific_separators(
    content: str,
    overlap_token_size=128,
    max_token_size=1024,
    tiktoken_model="gpt-4o",
):
    """Split *content* into chunks with LangChain's recursive splitter.

    The text is split with ``RecursiveCharacterTextSplitter`` built through
    ``from_tiktoken_encoder``, using the literal (non-regex) separators in
    ``CHUNK_SEPARATORS``.

    Args:
        content: The document text to split.
        overlap_token_size: Passed to the splitter as ``chunk_overlap``.
        max_token_size: Passed to the splitter as ``chunk_size``.
        tiktoken_model: Passed to the splitter as ``model_name``.

    Returns:
        A list of dicts, one per chunk in splitter order, each holding
        ``"content"`` (the chunk text with surrounding whitespace stripped)
        and ``"chunk_order_index"`` (the chunk's 0-based position).

    Raises:
        ImportError: If ``langchain_text_splitters`` is not installed.
    """
    # Local import: the splitter is a third-party package that only this
    # function uses.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
        model_name=tiktoken_model,
        is_separator_regex=False,
        # Hand the splitter its own list so the shared constant is never
        # exposed to mutation.
        separators=list(CHUNK_SEPARATORS),
    )

    return [
        {"content": piece.strip(), "chunk_order_index": index}
        for index, piece in enumerate(text_splitter.split_text(content))
    ]
clean_str(record_attributes[1].upper()) @@ -213,10 +213,7 @@ async def _merge_edges_then_upsert( src_id, tgt_id, edge_data=dict( - weight=weight, - description=description, - source_id=source_id, - order=order + weight=weight, description=description, source_id=source_id, order=order ), ) diff --git a/nano_graphrag/graphrag.py b/nano_graphrag/graphrag.py index 53d14fd..3cf938a 100644 --- a/nano_graphrag/graphrag.py +++ b/nano_graphrag/graphrag.py @@ -3,7 +3,7 @@ from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Type, cast +from typing import Callable, Dict, List, Optional, Type, Union, cast from ._llm import ( @@ -65,6 +65,7 @@ class GraphRAG: enable_naive_rag: bool = False # text chunking + chunk_func: Callable[[str, Optional[int], Optional[int], Optional[str]], List[Dict[str, Union[str, int]]]] = chunking_by_token_size chunk_token_size: int = 1200 chunk_overlap_token_size: int = 100 tiktoken_model_name: str = "gpt-4o" @@ -269,7 +270,7 @@ async def ainsert(self, string_or_strings): **dp, "full_doc_id": doc_key, } - for dp in chunking_by_token_size( + for dp in self.chunk_func( doc["content"], overlap_token_size=self.chunk_overlap_token_size, max_token_size=self.chunk_token_size, diff --git a/readme.md b/readme.md index cc7ffde..70a9ae5 100644 --- a/readme.md +++ b/readme.md @@ -343,6 +343,12 @@ See [ROADMAP.md](./docs/ROADMAP.md) +## Contribute + +`nano-graphrag` is open to any kind of contribution. Read [this](./docs/CONTRIBUTING.md) before you contribute. + + + ## Benchmark