forked from gusye1234/nano-graphrag
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' of https://github.com/gusye1234/nano-graphrag
- Loading branch information
Showing
5 changed files
with
100 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Contributing to nano-graphrag | ||
|
||
### Submit your Contribution through PR | ||
|
||
To make a contribution, follow these steps: | ||
|
||
1. Fork and clone this repository | ||
2. If you modified the core code (`./nano_graphrag`), please add tests for it | ||
3. **Include proper documentation / docstring or examples** | ||
4. Ensure that all tests pass by running `pytest` | ||
5. Submit a pull request | ||
|
||
For more details about pull requests, please read [GitHub's guides](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request). | ||
|
||
|
||
|
||
### Only add a dependency when we have to | ||
|
||
`nano-graphrag` needs to be `nano` and `light`. If we want to add more features, we add them smartly. Don't introduce a huge dependency just for a simple function. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
|
||
|
||
from nano_graphrag._utils import encode_string_by_tiktoken | ||
from nano_graphrag.base import QueryParam | ||
from nano_graphrag.graphrag import GraphRAG | ||
|
||
|
||
def chunking_by_specific_separators(
    content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o",
):
    """Split ``content`` into token-bounded chunks along natural text boundaries.

    The text is split with LangChain's ``RecursiveCharacterTextSplitter``,
    which tries each separator in order (paragraphs, lines, sentence
    punctuation, whitespace, and finally single characters) until the pieces
    fit into ``max_token_size`` tokens.

    Args:
        content: The raw document text to split.
        overlap_token_size: Number of tokens shared between adjacent chunks.
        max_token_size: Upper bound on the token length of a chunk.
        tiktoken_model: Model name used to select the tiktoken encoding that
            measures chunk length.

    Returns:
        A list of dicts, one per chunk, each holding the stripped chunk text
        under ``"content"`` and its position under ``"chunk_order_index"``.
        No token count is included in the result.
    """
    # Imported inside the function so the dependency is only needed when this
    # chunker is actually called.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # Non-ASCII separators are written as escapes so that editors or tooling
    # which normalize Unicode cannot silently fold the full-width forms into
    # their ASCII look-alikes (which would leave duplicate entries and stop
    # CJK sentence punctuation from being used as a split point).
    separators = [
        # Paragraph separators
        "\n\n",
        "\r\n\r\n",
        # Line breaks
        "\n",
        "\r\n",
        # Sentence ending punctuation
        "\u3002",  # Chinese period (ideographic full stop)
        "\uff0e",  # Full-width dot
        ".",  # English period
        "\uff01",  # Chinese (full-width) exclamation mark
        "!",  # English exclamation mark
        "\uff1f",  # Chinese (full-width) question mark
        "?",  # English question mark
        # Whitespace characters
        " ",  # Space
        "\t",  # Tab
        "\u3000",  # Full-width space
        # Special characters
        "\u200b",  # Zero-width space (used in some Asian languages)
        # Final fallback: split between any two characters
        "",
    ]

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=tiktoken_model,
        chunk_size=max_token_size,
        chunk_overlap=overlap_token_size,
        is_separator_regex=False,
        separators=separators,
    )

    return [
        {
            "content": chunk_content.strip(),
            "chunk_order_index": index,
        }
        for index, chunk_content in enumerate(text_splitter.split_text(content))
    ]
|
||
|
||
# Directory handed to GraphRAG as its ``working_dir``.
WORKING_DIR = "./nano_graphrag_cache_local_embedding_TEST"

# Build the GraphRAG instance with the separator-based chunker defined above
# plugged in as its chunking function.
rag = GraphRAG(
    chunk_func=chunking_by_specific_separators,
    working_dir=WORKING_DIR,
)

# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present.
# NOTE: the path is relative to the current working directory.
with open("../tests/mock_data.txt", encoding="utf-8-sig") as f:
    FAKE_TEXT = f.read()

# Insertion is left disabled here; presumably it must be enabled once so the
# working directory contains data to query - TODO confirm.
# rag.insert(FAKE_TEXT)
local_query_param = QueryParam(mode="local")
print(rag.query("What the main theme of this story?", param=local_query_param))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters