Commit

refactor on the brink of completion
- pluggable modules for embedding+llm models and vectordbs
- improved schema of vectordb
- use base class for vectordb implementation (see the interface sketch below)
- elegant argparser
- other improvements

TODO: controller for the app ecosystem using FastAPI

Signed-off-by: Anupam Kumar <[email protected]>
kyteinsky committed Oct 19, 2023
1 parent 01751fe commit b7addfa
Showing 21 changed files with 543 additions and 535 deletions.
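The "base class for vectordb implementation" bullet points at a common interface that each pluggable backend (e.g. Weaviate) implements. A minimal sketch of what such a base class could look like, where the class and method names are illustrative assumptions rather than the commit's actual interface:

```python
from abc import ABC, abstractmethod
from typing import List


class BaseVectorDB(ABC):
    """Hypothetical shared interface for pluggable vector DB backends."""

    @abstractmethod
    def setup_schema(self, user_id: str) -> None:
        """Create the per-user class/collection if it does not already exist."""

    @abstractmethod
    def add_documents(self, user_id: str, documents: List[dict]) -> List[str]:
        """Store documents for a user and return their ids."""

    @abstractmethod
    def similarity_search(self, user_id: str, query: str, k: int = 5) -> List[dict]:
        """Return the k stored documents most similar to the query."""
```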
2 changes: 2 additions & 0 deletions .flake8
@@ -10,4 +10,6 @@ extend-ignore =
E202,
# multiple blank lines
; E303,
# lambda functions assigned to a variable
E731,
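E731 is the pycodestyle check that flags a lambda expression assigned to a name; ignoring it permits shorthand like the following (a generic illustration, not code from this repo):

```python
# flake8 would normally ask for a def here; E731 in extend-ignore allows it
clamp = lambda x, lo, hi: max(lo, min(x, hi))
```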

36 changes: 14 additions & 22 deletions config.yaml
@@ -1,22 +1,14 @@
# example config below ---

# vectordb:
# name: weaviate

# embedding:
# model_path: codellama-7b.Q5_K_M.gguf
# n_ctx: 2048, # context size of the model
# n_batch: 8, # no. of tokens to process in parallel (1 <= n_batch <= n_ctx)
# n_threads: None, # no. of threads to use, None for auto (n/2 if n > 1)
# n_gpu_layers: 0, # no. of layers to be offloaded into gpu memory, -1 for all
# seed: -1, # seed, -1 for random

# llm:
# model_path: codellama-7b.Q5_K_M.gguf
# n_ctx: 2048, # context size of the model


embedding:
model_path: codellama-7b.Q5_K_M.gguf
n_ctx: 2048,

llama_embedder:
model_path: model_files/codellama-7b.Q5_K_M.gguf
n_batch: '8'
n_ctx: '2048'

llama_llm:
model_path: model_files/codellama-7b.Q5_K_M.gguf
n_batch: '10'
n_ctx: '2048'

hugging_face_small_embedder:
model_name: distilbert-base-uncased
model_kwargs:
device: cpu
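The keys under `llama_embedder` and `llama_llm` line up with keyword arguments of llama-cpp-python's `Llama` constructor. A sketch of how one of these sections could be loaded and applied, assuming ruamel.yaml (pinned in the requirements) and an explicit int cast for the quoted numbers; the loader itself is an assumption, not this repo's code:

```python
from llama_cpp import Llama
from ruamel.yaml import YAML

with open("config.yaml") as f:
    config = YAML(typ="safe").load(f)

conf = config["llama_llm"]
llm = Llama(
    model_path=conf["model_path"],
    n_ctx=int(conf["n_ctx"]),      # context size of the model
    n_batch=int(conf["n_batch"]),  # no. of tokens to process in parallel
)
```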
Empty file added model_files/.gitkeep
66 changes: 2 additions & 64 deletions readme.markdown
@@ -4,70 +4,8 @@

1. `python -m venv .venv`
2. `. .venv/bin/activate`
3. `pip install -r reqs.txt`
4. `docker-compose -f weaviate/docker-compose.yml up -d`
3. `pip install -r reqs.txt --no-deps`
4. `docker-compose -f weaviate-docker-compose.yml up -d`
5. Download a gguf model from [Hugging Face](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF#provided-files) and place it in `model_files/`
6. Copy example.env to .env and fill in the variables
7. `flask run -p 9000`

## API

### Embed files into the vector database

POST `/loadFiles`

#### Body (form-data)

- [file] file1
- [string] userId


### Get all vectors for a user (hard limit: 100)

GET `/getVectors`

#### Query

- [string] userId


### Get similar vectors based on the query for a specific user (with default limit of 5)

GET `/getSimilar`

#### Query

- [string] userId
- [string] query
- [int] limit (optional)


### Answer a query using an LLM, based on the context gathered from the vector database (default limit of docs from the vector db = 5)

GET `/ask`

#### Query

- [string] userId
- [string] query
- [int] limit (optional)


### The LLM works on the query and then self-asks follow-up questions to get to the correct answer, using Google search for the intermediate results.

GET `/askWithSearch`

#### Query

- [string] query


### TODO: Delete all vectors of the given filenames from the vector database

DELETE `/deleteFiles`

#### Query

- [string] userId
- [list(string)] filenames
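
Taken together, a client session against these endpoints could look like the sketch below, assuming the Flask app from step 7 is listening on port 9000; the file name and user id are placeholders, and the form field names follow the body documentation above:

```python
import requests

BASE = "http://localhost:9000"

# embed a file for a user
with open("notes.md", "rb") as f:
    resp = requests.post(
        f"{BASE}/loadFiles",
        files={"file1": f},
        data={"userId": "alice"},
    )
print(resp.text)

# ask a question answered from that user's documents
resp = requests.get(f"{BASE}/ask", params={
    "userId": "alice",
    "query": "What does the report say about deadlines?",
    "limit": 5,
})
print(resp.text)
```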

72 changes: 72 additions & 0 deletions reqs_latest.txt
@@ -0,0 +1,72 @@
aiohttp==3.8.5
aiosignal==1.3.1
annotated-types==0.5.0
async-timeout==4.0.3
attrs==23.1.0
Authlib==1.2.1
blinker==1.6.2
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
cryptography==41.0.4
dataclasses-json==0.5.14
diskcache==5.6.3
filelock==3.9.0
flake8==6.1.0
Flask==2.3.3
frozenlist==1.4.0
fsspec==2023.4.0
gguf==0.3.3
greenlet==2.0.2
huggingface-hub==0.17.3
idna==3.4
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
langchain==0.0.295
langsmith==0.0.38
llama_cpp_python==0.2.6
MarkupSafe==2.1.3
marshmallow==3.20.1
mccabe==0.7.0
mpmath==1.3.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.0
nltk==3.8.1
numexpr==2.8.6
numpy==1.26.0
packaging==23.1
pip-autoremove==0.10.0
pycodestyle==2.11.0
pycparser==2.21
pydantic==2.3.0
pydantic_core==2.6.3
pyflakes==3.1.0
python-dotenv==1.0.0
PyYAML==6.0.1
regex==2023.10.3
requests==2.31.0
ruamel.yaml==0.17.35
ruamel.yaml.clib==0.2.8
safetensors==0.4.0
scikit-learn==1.3.1
scipy==1.11.3
sentence-transformers==2.2.2
sentencepiece==0.1.99
SQLAlchemy==2.0.21
sympy==1.12
tenacity==8.2.3
threadpoolctl==3.2.0
tokenizers==0.14.1
torch==2.1.0+cpu
tqdm==4.66.1
transformers==4.34.1
typing-inspect==0.9.0
typing_extensions==4.8.0
urllib3==2.0.5
validators==0.22.0
weaviate-client==3.24.1
Werkzeug==2.3.7
yarl==1.9.2
8 changes: 8 additions & 0 deletions src/chain/__init__.py
@@ -0,0 +1,8 @@
from .injest import embed_files, embed_texts
from .one_shot import process_query

__all__ = [
"embed_files",
"embed_texts",
"process_query",
]
100 changes: 100 additions & 0 deletions src/chain/injest.py
@@ -0,0 +1,100 @@
from typing import List, Iterator
from werkzeug.datastructures.file_storage import FileStorage
from langchain.vectorstores import VectorStore
from langchain.text_splitter import (
TextSplitter,
RecursiveCharacterTextSplitter,
MarkdownTextSplitter,
)

from ..utils import to_int


_ALLOWED_MIME_TYPES = [
'text/plain',
'text/markdown',
'application/json',
]


def _allowed_file(file: FileStorage) -> bool:
return file.headers \
.get('type', type=str, default='') \
.split('mimetype: ') \
.pop() in _ALLOWED_MIME_TYPES


def _get_splitter_for(mimetype: str = "text/plain") -> TextSplitter:
kwargs = {
# TODO: some storage vs performance cost tests for chunk size
"chunk_size": 2000,
"chunk_overlap": 200,
"add_start_index": True,
"strip_whitespace": True,
"is_separator_regex": True,
}

if mimetype == "text/plain" or mimetype == "":
return RecursiveCharacterTextSplitter(separators=["\n\n", "\n", ".", " ", ""], **kwargs)

if mimetype == "text/markdown":
return MarkdownTextSplitter(**kwargs)

if mimetype == "application/json":
return RecursiveCharacterTextSplitter(separators=["{", "}", "[", "]", ",", ""], **kwargs)


# TODO: vectordb is a user-scoped langchain client; initialise it before calling this function
def embed_files(vectordb: VectorStore, filesIter: Iterator[FileStorage]) -> List[str]:
print("embedding files...")

files = list(filter(_allowed_file, filesIter))

contents = []
metas = []
for file in files:
contents.append(file.stream.read().decode())
metas.append({
"source": file.name,
"type": file.headers.get("type", type=str),
"modified": to_int(file.headers.get("modified", 0)),
})

if len(contents) == 0:
return []

documents = []

for text, meta in zip(contents, metas):
text_splitter = _get_splitter_for(meta.get("type"))
docs = text_splitter.create_documents([text], [meta])
        if len(docs) == 0:
            continue
        # keep every chunk the splitter produced, not just the first
        documents.extend(docs)

return vectordb.add_documents(documents)


def embed_texts(vectordb: VectorStore, texts: List[dict]) -> List[str]:
print("embedding texts...")

contents = [text.get("contents") for text in texts]
metas = [{
"source": text.get("name"),
"type": text.get("type"),
"modified": to_int(text.get("modified", 0)),
} for text in texts]

if len(contents) == 0:
return []

documents = []

for text, meta in zip(contents, metas):
text_splitter = _get_splitter_for(meta.get("type"))
docs = text_splitter.create_documents([text], [meta])
        if len(docs) == 0:
            continue
        # keep every chunk the splitter produced, not just the first
        documents.extend(docs)

return vectordb.add_documents(documents)
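A sketch of wiring `embed_texts` to a langchain `Weaviate` store backed by the docker-compose instance from the readme; the index name, text key, and embedding model are assumptions:

```python
import weaviate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Weaviate

from src.chain import embed_texts

client = weaviate.Client("http://localhost:8080")
vectordb = Weaviate(
    client,
    index_name="Documents",  # assumed collection name
    text_key="text",         # assumed property holding the page content
    embedding=HuggingFaceEmbeddings(model_name="distilbert-base-uncased"),
)

ids = embed_texts(vectordb, [{
    "name": "notes.md",
    "type": "text/markdown",
    "contents": "# Deadlines\nThe report is due Friday.",
    "modified": 1697712000,
}])
print(ids)
```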
31 changes: 31 additions & 0 deletions src/chain/one_shot.py
@@ -0,0 +1,31 @@
from typing import Any
from langchain.vectorstores import VectorStore
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.llms.base import LLM

_LLM_TEMPLATE = """Answer based on this context and do not add any imaginative details:
{context}
{question}
"""


def process_query(vectordb: VectorStore, llm: LLM, query: str, limit: int = 5) -> Any | None:
context_docs = vectordb.similarity_search(query, k=limit)

print(f"Context docs: {context_docs}")

context_text = " ".join(map(lambda d: d.page_content, context_docs))

prompt = PromptTemplate(
template=_LLM_TEMPLATE,
input_variables=["context", "question"]
)
llm_chain = LLMChain(prompt=prompt, llm=llm)

output = llm_chain.run(question=query, context=context_text)
print(f'Output: {output}')

return output
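A sketch of calling `process_query` with a llama-cpp-backed LLM, reusing the `vectordb` from the previous sketch; the model path mirrors config.yaml, and the question is a placeholder:

```python
from langchain.llms import LlamaCpp

from src.chain import process_query

llm = LlamaCpp(
    model_path="model_files/codellama-7b.Q5_K_M.gguf",
    n_ctx=2048,
)
answer = process_query(vectordb, llm, "When is the report due?", limit=5)
print(answer)
```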

