
Commit

actions-user committed Sep 18, 2024
2 parents 0a0101f + f11e9f2 commit 4e3fb7d
Showing 5 changed files with 100 additions and 9 deletions.
19 changes: 19 additions & 0 deletions docs/CONTRIBUTING.md
@@ -0,0 +1,19 @@
# Contributing to nano-graphrag

### Submit your contribution through a PR

To make a contribution, follow these steps:

1. Fork and clone this repository
2. If you modified the core code (`./nano_graphrag`), please add tests for it
3. **Include proper documentation / docstrings or examples**
4. Ensure that all tests pass by running `pytest`
5. Submit a pull request

For more details about pull requests, please read [GitHub's guides](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).



### Only add a dependency when we have to

`nano-graphrag` needs to stay `nano` and `light`. If we want to add more features, we add them smartly: don't introduce a heavy dependency just for a simple function.
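For example, when a feature really does need a third-party package, importing it lazily inside the function that uses it keeps the package an optional extra rather than a hard dependency. A minimal sketch (the helper name `chunk_with_langchain` is hypothetical, not part of the codebase):

```python
def chunk_with_langchain(text: str, chunk_size: int = 1024, chunk_overlap: int = 128):
    # Import inside the function: users who never call this helper
    # never need langchain installed.
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter
    except ImportError as e:
        raise ImportError(
            "chunk_with_langchain requires `langchain-text-splitters`: "
            "pip install langchain-text-splitters"
        ) from e

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)
```

The custom-chunking example added in this commit follows the same pattern: `langchain_text_splitters` is imported inside `chunking_by_specific_separators`, so users of the core library never pay for it.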
68 changes: 68 additions & 0 deletions examples/using_custom_chunking_method.py
@@ -0,0 +1,68 @@


from nano_graphrag._utils import encode_string_by_tiktoken
from nano_graphrag.base import QueryParam
from nano_graphrag.graphrag import GraphRAG


def chunking_by_specific_separators(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    # Lazy import: langchain is only needed when this chunker is actually used.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
        # length_function=lambda x: len(encode_string_by_tiktoken(x)),
        model_name=tiktoken_model,
        is_separator_regex=False,
        separators=[
            # Paragraph separators
            "\n\n",
            "\r\n\r\n",
            # Line breaks
            "\n",
            "\r\n",
            # Sentence-ending punctuation
            "。",  # Chinese period
            "．",  # Full-width dot
            ".",  # English period
            "！",  # Chinese exclamation mark
            "!",  # English exclamation mark
            "？",  # Chinese question mark
            "?",  # English question mark
            # Whitespace characters
            " ",  # Space
            "\t",  # Tab
            "\u3000",  # Full-width space
            # Special characters
            "\u200b",  # Zero-width space (used in some Asian languages)
            # Final fallback
            "",
        ],
    )
    texts = text_splitter.split_text(content)

    # Return chunks in the dict shape nano-graphrag expects.
    results = []
    for index, chunk_content in enumerate(texts):
        results.append(
            {
                # "tokens": None,
                "content": chunk_content.strip(),
                "chunk_order_index": index,
            }
        )
    return results


WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"
rag = GraphRAG(
working_dir=WORKING_DIR,
chunk_func=chunking_by_specific_separators,
)

with open("../tests/mock_data.txt", encoding="utf-8-sig") as f:
FAKE_TEXT = f.read()

# rag.insert(FAKE_TEXT)  # uncomment on the first run to build the index
print(rag.query("What is the main theme of this story?", param=QueryParam(mode="local")))
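For a quick sanity check, the chunker can also be invoked directly, outside `GraphRAG`; a minimal sketch reusing the function and `FAKE_TEXT` loaded above:

```python
chunks = chunking_by_specific_separators(
    FAKE_TEXT, overlap_token_size=64, max_token_size=512
)
print(f"{len(chunks)} chunks produced")
print(chunks[0]["chunk_order_index"])   # 0
print(chunks[0]["content"][:80])        # first 80 chars of the first chunk
```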
11 changes: 4 additions & 7 deletions nano_graphrag/_op.py
@@ -14,7 +14,7 @@
list_of_list_to_csv,
pack_user_ass_to_openai_messages,
split_string_by_multi_markers,
- truncate_list_by_token_size
+ truncate_list_by_token_size,
)
from .base import (
BaseGraphStorage,
@@ -80,7 +80,7 @@ async def _handle_single_entity_extraction(
record_attributes: list[str],
chunk_key: str,
):
- if record_attributes[0] != '"entity"' or len(record_attributes) < 4:
+ if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
return None
# add this record as a node in the G
entity_name = clean_str(record_attributes[1].upper())
@@ -101,7 +101,7 @@ async def _handle_single_relationship_extraction(
record_attributes: list[str],
chunk_key: str,
):
- if record_attributes[0] != '"relationship"' or len(record_attributes) < 5:
+ if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
return None
# add this record as edge
source = clean_str(record_attributes[1].upper())
@@ -213,10 +213,7 @@ async def _merge_edges_then_upsert(
src_id,
tgt_id,
edge_data=dict(
- weight=weight,
- description=description,
- source_id=source_id,
- order=order
+ weight=weight, description=description, source_id=source_id, order=order
),
)

5 changes: 3 additions & 2 deletions nano_graphrag/graphrag.py
@@ -3,7 +3,7 @@
from dataclasses import asdict, dataclass, field
from datetime import datetime
from functools import partial
- from typing import Type, cast
+ from typing import Callable, Dict, List, Optional, Type, Union, cast


from ._llm import (
@@ -65,6 +65,7 @@ class GraphRAG:
enable_naive_rag: bool = False

# text chunking
+ chunk_func: Callable[[str, Optional[int], Optional[int], Optional[str]], List[Dict[str, Union[str, int]]]] = chunking_by_token_size
chunk_token_size: int = 1200
chunk_overlap_token_size: int = 100
tiktoken_model_name: str = "gpt-4o"
@@ -269,7 +270,7 @@ async def ainsert(self, string_or_strings):
**dp,
"full_doc_id": doc_key,
}
- for dp in chunking_by_token_size(
+ for dp in self.chunk_func(
doc["content"],
overlap_token_size=self.chunk_overlap_token_size,
max_token_size=self.chunk_token_size,
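Given the `Callable` annotation above, any function that accepts the same keyword arguments as `chunking_by_token_size` and returns a list of chunk dicts should plug in. A minimal hypothetical sketch (`chunk_by_paragraphs` is illustrative only, and like the example file it omits the optional `tokens` key):

```python
from nano_graphrag.graphrag import GraphRAG


def chunk_by_paragraphs(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    # One chunk per blank-line-separated paragraph. The token-size and
    # model arguments are accepted (so GraphRAG can pass them) but unused here.
    paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
    return [
        {"content": p, "chunk_order_index": i}
        for i, p in enumerate(paragraphs)
    ]


rag = GraphRAG(working_dir="./cache", chunk_func=chunk_by_paragraphs)
```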
6 changes: 6 additions & 0 deletions readme.md
@@ -343,6 +343,12 @@ See [ROADMAP.md](./docs/ROADMAP.md)



## Contribute

`nano-graphrag` is open to any kind of contribution. Read [this](./docs/CONTRIBUTING.md) before you contribute.




## Benchmark

