-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- pluggable modules for embedding+llm models and vectordbs - improved schema of vectordb - use base class for vectordb implementation - elegant argparser - other improvements TODO: controller using FastAPI of the app ecosystem Signed-off-by: Anupam Kumar <[email protected]>
- Loading branch information
Showing
21 changed files
with
543 additions
and
535 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,4 +10,6 @@ extend-ignore = | |
E202, | ||
# multiple blank lines | ||
; E303, | ||
# lambda functions assigned to a variable | ||
E731, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,14 @@ | ||
# example config below --- | ||
|
||
# vectordb: | ||
# name: weaviate | ||
|
||
# embedding: | ||
# model_path: codellama-7b.Q5_K_M.gguf | ||
# n_ctx: 2048, # context size of the model | ||
# n_batch: 8, # no. of tokens to process in parallel (1 <= n_batch <= n_ctx) | ||
# n_threads: None, # no. of threads to use, None for auto (n/2 if n > 1) | ||
# n_gpu_layers: 0, # no. of layers to be offloaded into gpu memory, -1 for all | ||
# seed: -1, # seed, -1 for random | ||
|
||
# llm: | ||
# model_path: codellama-7b.Q5_K_M.gguf | ||
# n_ctx: 2048, # context size of the model | ||
|
||
|
||
embedding: | ||
model_path: codellama-7b.Q5_K_M.gguf | ||
n_ctx: 2048, | ||
|
||
llama_embedder: | ||
model_path: model_files/codellama-7b.Q5_K_M.gguf | ||
n_batch: '8' | ||
n_ctx: '2048' | ||
|
||
llama_llm: | ||
model_path: model_files/codellama-7b.Q5_K_M.gguf | ||
n_batch: '10' | ||
n_ctx: '2048' | ||
|
||
hugging_face_small_embedder: | ||
model_name: distilbert-base-uncased | ||
model_kwargs: | ||
device: cpu |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
aiohttp==3.8.5 | ||
aiosignal==1.3.1 | ||
annotated-types==0.5.0 | ||
async-timeout==4.0.3 | ||
attrs==23.1.0 | ||
Authlib==1.2.1 | ||
blinker==1.6.2 | ||
certifi==2023.7.22 | ||
cffi==1.15.1 | ||
charset-normalizer==3.2.0 | ||
click==8.1.7 | ||
cryptography==41.0.4 | ||
dataclasses-json==0.5.14 | ||
diskcache==5.6.3 | ||
filelock==3.9.0 | ||
flake8==6.1.0 | ||
Flask==2.3.3 | ||
frozenlist==1.4.0 | ||
fsspec==2023.4.0 | ||
gguf==0.3.3 | ||
greenlet==2.0.2 | ||
huggingface-hub==0.17.3 | ||
idna==3.4 | ||
itsdangerous==2.1.2 | ||
Jinja2==3.1.2 | ||
joblib==1.3.2 | ||
langchain==0.0.295 | ||
langsmith==0.0.38 | ||
llama_cpp_python==0.2.6 | ||
MarkupSafe==2.1.3 | ||
marshmallow==3.20.1 | ||
mccabe==0.7.0 | ||
mpmath==1.3.0 | ||
multidict==6.0.4 | ||
mypy-extensions==1.0.0 | ||
networkx==3.0 | ||
nltk==3.8.1 | ||
numexpr==2.8.6 | ||
numpy==1.26.0 | ||
packaging==23.1 | ||
pip-autoremove==0.10.0 | ||
pycodestyle==2.11.0 | ||
pycparser==2.21 | ||
pydantic==2.3.0 | ||
pydantic_core==2.6.3 | ||
pyflakes==3.1.0 | ||
python-dotenv==1.0.0 | ||
PyYAML==6.0.1 | ||
regex==2023.10.3 | ||
requests==2.31.0 | ||
ruamel.yaml==0.17.35 | ||
ruamel.yaml.clib==0.2.8 | ||
safetensors==0.4.0 | ||
scikit-learn==1.3.1 | ||
scipy==1.11.3 | ||
sentence-transformers==2.2.2 | ||
sentencepiece==0.1.99 | ||
SQLAlchemy==2.0.21 | ||
sympy==1.12 | ||
tenacity==8.2.3 | ||
threadpoolctl==3.2.0 | ||
tokenizers==0.14.1 | ||
torch==2.1.0+cpu | ||
tqdm==4.66.1 | ||
transformers==4.34.1 | ||
typing-inspect==0.9.0 | ||
typing_extensions==4.8.0 | ||
urllib3==2.0.5 | ||
validators==0.22.0 | ||
weaviate-client==3.24.1 | ||
Werkzeug==2.3.7 | ||
yarl==1.9.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from .injest import embed_files, embed_texts | ||
from .one_shot import process_query | ||
|
||
# Public API of this package: the two ingestion entry points and the
# one-shot query entry point imported above.
__all__ = [
    "embed_files",
    "embed_texts",
    "process_query",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
from typing import List, Iterator | ||
from werkzeug.datastructures.file_storage import FileStorage | ||
from langchain.vectorstores import VectorStore | ||
from langchain.text_splitter import ( | ||
TextSplitter, | ||
RecursiveCharacterTextSplitter, | ||
MarkdownTextSplitter, | ||
) | ||
|
||
from ..utils import to_int | ||
|
||
|
||
# MIME types accepted for embedding; uploads declaring any other type are
# filtered out by _allowed_file().
_ALLOWED_MIME_TYPES = [
    'text/plain',
    'text/markdown',
    'application/json',
]
|
||
|
||
def _allowed_file(file: FileStorage) -> bool:
    """Tell whether an uploaded file declares a MIME type we can embed.

    The type is read from the upload's ``type`` header (empty string when the
    header is absent). Only the text after the last ``mimetype: `` marker is
    considered; when the marker is missing the whole header value is used.

    Returns:
        True when the resulting value is listed in ``_ALLOWED_MIME_TYPES``.
    """
    declared = file.headers.get('type', type=str, default='')
    mimetype = declared.split('mimetype: ')[-1]
    return mimetype in _ALLOWED_MIME_TYPES
|
||
|
||
def _get_splitter_for(mimetype: str = "text/plain") -> TextSplitter:
    """Return a text splitter configured for the given MIME type.

    Args:
        mimetype: Declared content type. May be ``None``/empty, and may carry
            the ``mimetype: `` prefix that ``_allowed_file`` also tolerates.

    Returns:
        A Markdown splitter for ``text/markdown``, a splitter using JSON
        punctuation for ``application/json``, and the plain-text splitter for
        everything else (including unknown or missing types), so the caller
        always receives a usable ``TextSplitter`` instead of ``None``.
    """
    common = {
        # TODO: some storage vs performance cost tests for chunk size
        "chunk_size": 2000,
        "chunk_overlap": 200,
        "add_start_index": True,
        "strip_whitespace": True,
    }

    # Normalise: a missing type becomes "", and a header of the form
    # "mimetype: <type>" is reduced to "<type>".
    kind = (mimetype or "").split("mimetype: ")[-1].strip()

    if kind == "text/markdown":
        # The Markdown splitter supplies its own separators; keep treating
        # them as regular expressions, as before.
        return MarkdownTextSplitter(is_separator_regex=True, **common)

    if kind == "application/json":
        # These separators are literal characters. Interpreted as regexes,
        # "[" is an invalid pattern, so they must not be flagged as regex.
        return RecursiveCharacterTextSplitter(
            separators=["{", "}", "[", "]", ",", ""],
            is_separator_regex=False,
            **common,
        )

    # Plain text, and the fallback for anything unrecognised. The separators
    # are literal: as a regex, "." would match every character.
    return RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " ", ""],
        is_separator_regex=False,
        **common,
    )
|
||
|
||
# TODO: vectordb is a langchain user client, init it before calling this function | ||
def embed_files(vectordb: VectorStore, filesIter: Iterator[FileStorage]) -> List[str]:
    """Split the allowed uploaded files into chunks and store them.

    Files rejected by ``_allowed_file`` are skipped. Each remaining file is
    decoded, split with the splitter chosen for its declared type, and every
    resulting chunk is added to the vector store with the file's metadata
    (``source``, ``type``, ``modified``).

    Args:
        vectordb: An initialised langchain vector store.
        filesIter: Uploaded files to embed.

    Returns:
        Whatever ``vectordb.add_documents`` returns for the stored chunks, or
        an empty list when there is nothing to store.
    """
    print("embedding files...")

    documents = []
    for file in filter(_allowed_file, filesIter):
        text = file.stream.read().decode()
        meta = {
            "source": file.name,
            "type": file.headers.get("type", type=str),
            "modified": to_int(file.headers.get("modified", 0)),
        }

        text_splitter = _get_splitter_for(meta.get("type"))
        # Keep every chunk of the file, not just the first one; otherwise
        # everything past the first chunk would never be searchable.
        documents.extend(text_splitter.create_documents([text], [meta]))

    if not documents:
        # Nothing allowed or nothing produced: skip the vector store call.
        return []

    return vectordb.add_documents(documents)
|
||
|
||
def embed_texts(vectordb: VectorStore, texts: List[dict]) -> List[str]:
    """Split in-memory texts into chunks and store them.

    Each entry of ``texts`` is a dict read through the keys ``contents``,
    ``name``, ``type`` and ``modified``. The contents are split with the
    splitter chosen for ``type`` and every resulting chunk is added to the
    vector store with metadata ``source`` (from ``name``), ``type`` and
    ``modified``.

    Args:
        vectordb: An initialised langchain vector store.
        texts: Text descriptors to embed.

    Returns:
        Whatever ``vectordb.add_documents`` returns for the stored chunks, or
        an empty list when there is nothing to store.
    """
    print("embedding texts...")

    documents = []
    for text in texts:
        meta = {
            "source": text.get("name"),
            "type": text.get("type"),
            "modified": to_int(text.get("modified", 0)),
        }

        text_splitter = _get_splitter_for(meta.get("type"))
        # Keep every chunk of the text, not just the first one; otherwise
        # everything past the first chunk would never be searchable.
        documents.extend(
            text_splitter.create_documents([text.get("contents")], [meta])
        )

    if not documents:
        # Nothing given or nothing produced: skip the vector store call.
        return []

    return vectordb.add_documents(documents)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from typing import Any | ||
from langchain.vectorstores import VectorStore | ||
from langchain.chains import LLMChain | ||
from langchain import PromptTemplate | ||
from langchain.llms import LLM | ||
|
||
# Prompt sent to the LLM by process_query(): {context} receives the joined
# text of the retrieved documents and {question} receives the user's query.
_LLM_TEMPLATE = """Answer based on this context and do not add any imaginative details:
{context}
{question}
"""
|
||
|
||
def process_query(vectordb: VectorStore, llm: LLM, query: str, limit: int = 5) -> Any | None:
    """Answer a query with the LLM, using similar stored documents as context.

    Args:
        vectordb: Vector store searched for documents similar to ``query``.
        llm: Language model that produces the answer.
        query: The user's question.
        limit: Number of documents requested from the similarity search.

    Returns:
        The output of running the LLM chain on the question and context.
    """
    context_docs = vectordb.similarity_search(query, k=limit)

    print(f"Context docs: {context_docs}")

    # The retrieved page contents, space-separated, fill the {context} slot.
    context_text = " ".join(doc.page_content for doc in context_docs)

    llm_chain = LLMChain(
        prompt=PromptTemplate(
            template=_LLM_TEMPLATE,
            input_variables=["context", "question"],
        ),
        llm=llm,
    )

    output = llm_chain.run(question=query, context=context_text)
    print(f'Output: {output}')

    return output
|
Oops, something went wrong.