
Commit

actions-user committed Sep 18, 2024
2 parents 0a0101f + f11e9f2 commit 4e3fb7d
Showing 5 changed files with 100 additions and 9 deletions.
19 changes: 19 additions & 0 deletions docs/CONTRIBUTING.md
@@ -0,0 +1,19 @@
# Contributing to nano-graphrag

### Submit your contribution through a PR

To make a contribution, follow these steps:

1. Fork and clone this repository
2. If you modified the core code (`./nano_graphrag`), please add tests for it
3. **Include proper documentation / docstrings or examples**
4. Ensure that all tests pass by running `pytest`
5. Submit a pull request

For more details about pull requests, please read [GitHub's guides](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).



### Only add a dependency when we have to

`nano-graphrag` needs to stay `nano` and `light`. If we want to add more features, we add them smartly: don't introduce a heavy dependency just for a simple function.
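For example, when a feature really does need a third-party package, importing it lazily inside the function that uses it keeps the package an optional extra rather than a hard dependency. A minimal sketch (the helper name `chunk_with_langchain` is hypothetical, not part of the codebase):

```python
def chunk_with_langchain(text: str, chunk_size: int = 1024, chunk_overlap: int = 128):
    # Import inside the function: users who never call this helper
    # never need langchain installed.
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter
    except ImportError as e:
        raise ImportError(
            "chunk_with_langchain requires `langchain-text-splitters`: "
            "pip install langchain-text-splitters"
        ) from e

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)
```

The custom-chunking example added in this commit follows the same pattern: `langchain_text_splitters` is imported inside `chunking_by_specific_separators`, so users of the core library never pay for it.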
68 changes: 68 additions & 0 deletions examples/using_custom_chunking_method.py
@@ -0,0 +1,68 @@


from nano_graphrag._utils import encode_string_by_tiktoken
from nano_graphrag.base import QueryParam
from nano_graphrag.graphrag import GraphRAG


def chunking_by_specific_separators(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    # Lazy import: langchain is only needed when this chunker is actually used.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
        # length_function=lambda x: len(encode_string_by_tiktoken(x)),
        model_name=tiktoken_model,
        is_separator_regex=False,
        separators=[
            # Paragraph separators
            "\n\n",
            "\r\n\r\n",
            # Line breaks
            "\n",
            "\r\n",
            # Sentence-ending punctuation
            "。",  # Chinese period
            "．",  # Full-width dot
            ".",  # English period
            "！",  # Chinese exclamation mark
            "!",  # English exclamation mark
            "？",  # Chinese question mark
            "?",  # English question mark
            # Whitespace characters
            " ",  # Space
            "\t",  # Tab
            "\u3000",  # Full-width space
            # Special characters
            "\u200b",  # Zero-width space (used in some Asian languages)
            # Final fallback
            "",
        ],
    )
    texts = text_splitter.split_text(content)

    # Return chunks in the dict shape nano-graphrag expects.
    results = []
    for index, chunk_content in enumerate(texts):
        results.append(
            {
                # "tokens": None,
                "content": chunk_content.strip(),
                "chunk_order_index": index,
            }
        )
    return results


WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"
rag = GraphRAG(
working_dir=WORKING_DIR,
chunk_func=chunking_by_specific_separators,
)

with open("../tests/mock_data.txt", encoding="utf-8-sig") as f:
FAKE_TEXT = f.read()

# rag.insert(FAKE_TEXT)  # uncomment on the first run to build the index
print(rag.query("What is the main theme of this story?", param=QueryParam(mode="local")))
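For a quick sanity check, the chunker can also be invoked directly, outside `GraphRAG`; a minimal sketch reusing the function and `FAKE_TEXT` loaded above:

```python
chunks = chunking_by_specific_separators(
    FAKE_TEXT, overlap_token_size=64, max_token_size=512
)
print(f"{len(chunks)} chunks produced")
print(chunks[0]["chunk_order_index"])   # 0
print(chunks[0]["content"][:80])        # first 80 chars of the first chunk
```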
11 changes: 4 additions & 7 deletions nano_graphrag/_op.py
@@ -14,7 +14,7 @@
list_of_list_to_csv,
pack_user_ass_to_openai_messages,
split_string_by_multi_markers,
- truncate_list_by_token_size
+ truncate_list_by_token_size,
)
from .base import (
BaseGraphStorage,
@@ -80,7 +80,7 @@ async def _handle_single_entity_extraction(
record_attributes: list[str],
chunk_key: str,
):
- if record_attributes[0] != '"entity"' or len(record_attributes) < 4:
+ if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
return None
# add this record as a node in the G
entity_name = clean_str(record_attributes[1].upper())
@@ -101,7 +101,7 @@ async def _handle_single_relationship_extraction(
record_attributes: list[str],
chunk_key: str,
):
- if record_attributes[0] != '"relationship"' or len(record_attributes) < 5:
+ if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
return None
# add this record as edge
source = clean_str(record_attributes[1].upper())
@@ -213,10 +213,7 @@ async def _merge_edges_then_upsert(
src_id,
tgt_id,
edge_data=dict(
- weight=weight,
- description=description,
- source_id=source_id,
- order=order
+ weight=weight, description=description, source_id=source_id, order=order
),
)

5 changes: 3 additions & 2 deletions nano_graphrag/graphrag.py
@@ -3,7 +3,7 @@
from dataclasses import asdict, dataclass, field
from datetime import datetime
from functools import partial
- from typing import Type, cast
+ from typing import Callable, Dict, List, Optional, Type, Union, cast


from ._llm import (
@@ -65,6 +65,7 @@ class GraphRAG:
enable_naive_rag: bool = False

# text chunking
+ chunk_func: Callable[[str, Optional[int], Optional[int], Optional[str]], List[Dict[str, Union[str, int]]]] = chunking_by_token_size
chunk_token_size: int = 1200
chunk_overlap_token_size: int = 100
tiktoken_model_name: str = "gpt-4o"
@@ -269,7 +270,7 @@ async def ainsert(self, string_or_strings):
**dp,
"full_doc_id": doc_key,
}
- for dp in chunking_by_token_size(
+ for dp in self.chunk_func(
doc["content"],
overlap_token_size=self.chunk_overlap_token_size,
max_token_size=self.chunk_token_size,
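Given the `Callable` annotation above, any function that accepts the same keyword arguments as `chunking_by_token_size` and returns a list of chunk dicts should plug in. A minimal hypothetical sketch (`chunk_by_paragraphs` is illustrative only, and like the example file it omits the optional `tokens` key):

```python
from nano_graphrag.graphrag import GraphRAG


def chunk_by_paragraphs(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    # One chunk per blank-line-separated paragraph. The token-size and
    # model arguments are accepted (so GraphRAG can pass them) but unused here.
    paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
    return [
        {"content": p, "chunk_order_index": i}
        for i, p in enumerate(paragraphs)
    ]


rag = GraphRAG(working_dir="./cache", chunk_func=chunk_by_paragraphs)
```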
6 changes: 6 additions & 0 deletions readme.md
@@ -343,6 +343,12 @@ See [ROADMAP.md](./docs/ROADMAP.md)



## Contribute

`nano-graphrag` is open to any kind of contribution. Read [this](./docs/CONTRIBUTING.md) before you contribute.




## Benchmark

