Commit

refactor on the brink of completion
- pluggable modules for embedding+llm models and vectordbs
- improved schema of vectordb
- use base class for vectordb implementation (see the interface sketch below)
- elegant argparser
- other improvements

TODO: controller for the app ecosystem using FastAPI

Signed-off-by: Anupam Kumar <[email protected]>
kyteinsky committed Oct 19, 2023
1 parent 01751fe commit b7addfa
Showing 21 changed files with 543 additions and 535 deletions.
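The "base class for vectordb implementation" bullet points at a common interface that each pluggable backend (e.g. Weaviate) implements. A minimal sketch of what such a base class could look like, where the class and method names are illustrative assumptions rather than the commit's actual interface:

```python
from abc import ABC, abstractmethod
from typing import List


class BaseVectorDB(ABC):
    """Hypothetical shared interface for pluggable vector DB backends."""

    @abstractmethod
    def setup_schema(self, user_id: str) -> None:
        """Create the per-user class/collection if it does not already exist."""

    @abstractmethod
    def add_documents(self, user_id: str, documents: List[dict]) -> List[str]:
        """Store documents for a user and return their ids."""

    @abstractmethod
    def similarity_search(self, user_id: str, query: str, k: int = 5) -> List[dict]:
        """Return the k stored documents most similar to the query."""
```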
2 changes: 2 additions & 0 deletions .flake8
@@ -10,4 +10,6 @@ extend-ignore =
E202,
# multiple blank lines
; E303,
# lambda functions assigned to a variable
E731,
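E731 is the pycodestyle check that flags a lambda expression assigned to a name; ignoring it permits shorthand like the following (a generic illustration, not code from this repo):

```python
# flake8 would normally ask for a def here; E731 in extend-ignore allows it
clamp = lambda x, lo, hi: max(lo, min(x, hi))
```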

36 changes: 14 additions & 22 deletions config.yaml
@@ -1,22 +1,14 @@
# example config below ---

# vectordb:
# name: weaviate

# embedding:
# model_path: codellama-7b.Q5_K_M.gguf
# n_ctx: 2048, # context size of the model
# n_batch: 8, # no. of tokens to process in parallel (1 <= n_batch <= n_ctx)
# n_threads: None, # no. of threads to use, None for auto (n/2 if n > 1)
# n_gpu_layers: 0, # no. of layers to be offloaded into gpu memory, -1 for all
# seed: -1, # seed, -1 for random

# llm:
# model_path: codellama-7b.Q5_K_M.gguf
# n_ctx: 2048, # context size of the model


embedding:
model_path: codellama-7b.Q5_K_M.gguf
n_ctx: 2048,

llama_embedder:
model_path: model_files/codellama-7b.Q5_K_M.gguf
n_batch: '8'
n_ctx: '2048'

llama_llm:
model_path: model_files/codellama-7b.Q5_K_M.gguf
n_batch: '10'
n_ctx: '2048'

hugging_face_small_embedder:
model_name: distilbert-base-uncased
model_kwargs:
device: cpu
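The keys under `llama_embedder` and `llama_llm` line up with keyword arguments of llama-cpp-python's `Llama` constructor. A sketch of how one of these sections could be loaded and applied, assuming ruamel.yaml (pinned in the requirements) and an explicit int cast for the quoted numbers; the loader itself is an assumption, not this repo's code:

```python
from llama_cpp import Llama
from ruamel.yaml import YAML

with open("config.yaml") as f:
    config = YAML(typ="safe").load(f)

conf = config["llama_llm"]
llm = Llama(
    model_path=conf["model_path"],
    n_ctx=int(conf["n_ctx"]),      # context size of the model
    n_batch=int(conf["n_batch"]),  # no. of tokens to process in parallel
)
```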
Empty file added model_files/.gitkeep
66 changes: 2 additions & 64 deletions readme.markdown
@@ -4,70 +4,8 @@

1. `python -m venv .venv`
2. `. .venv/bin/activate`
3. `pip install -r reqs.txt`
4. `docker-compose -f weaviate/docker-compose.yml up -d`
3. `pip install -r reqs.txt --no-deps`
4. `docker-compose -f weaviate-docker-compose.yml up -d`
5. Download a gguf model from [Hugging Face](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF#provided-files) and place it in `model_files/`
6. Copy example.env to .env and fill in the variables
7. `flask run -p 9000`

## API

### Embed files into the vector database

POST `/loadFiles`

#### Body (form-data)

- [file] file1
- [string] userId


### Get all vectors for a user (hard limit: 100)

GET `/getVectors`

#### Query

- [string] userId


### Get similar vectors based on the query for a specific user (with default limit of 5)

GET `/getSimilar`

#### Query

- [string] userId
- [string] query
- [int] limit (optional)


### Answer a query using an LLM, based on the context gathered from the vector database (default limit of docs from the vector db = 5)

GET `/ask`

#### Query

- [string] userId
- [string] query
- [int] limit (optional)


### The LLM works on the query and then self-asks follow-up questions to get to the correct answer, using Google search for the intermediate results.

GET `/askWithSearch`

#### Query

- [string] query


### TODO: Delete all vectors of the given filenames from the vector database

DELETE `/deleteFiles`

#### Query

- [string] userId
- [list(string)] filenames
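
Taken together, a client session against these endpoints could look like the sketch below, assuming the Flask app from step 7 is listening on port 9000; the file name and user id are placeholders, and the form field names follow the body documentation above:

```python
import requests

BASE = "http://localhost:9000"

# embed a file for a user
with open("notes.md", "rb") as f:
    resp = requests.post(
        f"{BASE}/loadFiles",
        files={"file1": f},
        data={"userId": "alice"},
    )
print(resp.text)

# ask a question answered from that user's documents
resp = requests.get(f"{BASE}/ask", params={
    "userId": "alice",
    "query": "What does the report say about deadlines?",
    "limit": 5,
})
print(resp.text)
```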

72 changes: 72 additions & 0 deletions reqs_latest.txt
@@ -0,0 +1,72 @@
aiohttp==3.8.5
aiosignal==1.3.1
annotated-types==0.5.0
async-timeout==4.0.3
attrs==23.1.0
Authlib==1.2.1
blinker==1.6.2
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
cryptography==41.0.4
dataclasses-json==0.5.14
diskcache==5.6.3
filelock==3.9.0
flake8==6.1.0
Flask==2.3.3
frozenlist==1.4.0
fsspec==2023.4.0
gguf==0.3.3
greenlet==2.0.2
huggingface-hub==0.17.3
idna==3.4
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
langchain==0.0.295
langsmith==0.0.38
llama_cpp_python==0.2.6
MarkupSafe==2.1.3
marshmallow==3.20.1
mccabe==0.7.0
mpmath==1.3.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.0
nltk==3.8.1
numexpr==2.8.6
numpy==1.26.0
packaging==23.1
pip-autoremove==0.10.0
pycodestyle==2.11.0
pycparser==2.21
pydantic==2.3.0
pydantic_core==2.6.3
pyflakes==3.1.0
python-dotenv==1.0.0
PyYAML==6.0.1
regex==2023.10.3
requests==2.31.0
ruamel.yaml==0.17.35
ruamel.yaml.clib==0.2.8
safetensors==0.4.0
scikit-learn==1.3.1
scipy==1.11.3
sentence-transformers==2.2.2
sentencepiece==0.1.99
SQLAlchemy==2.0.21
sympy==1.12
tenacity==8.2.3
threadpoolctl==3.2.0
tokenizers==0.14.1
torch==2.1.0+cpu
tqdm==4.66.1
transformers==4.34.1
typing-inspect==0.9.0
typing_extensions==4.8.0
urllib3==2.0.5
validators==0.22.0
weaviate-client==3.24.1
Werkzeug==2.3.7
yarl==1.9.2
8 changes: 8 additions & 0 deletions src/chain/__init__.py
@@ -0,0 +1,8 @@
from .injest import embed_files, embed_texts
from .one_shot import process_query

__all__ = [
"embed_files",
"embed_texts",
"process_query",
]
100 changes: 100 additions & 0 deletions src/chain/injest.py
@@ -0,0 +1,100 @@
from typing import List, Iterator
from werkzeug.datastructures.file_storage import FileStorage
from langchain.vectorstores import VectorStore
from langchain.text_splitter import (
TextSplitter,
RecursiveCharacterTextSplitter,
MarkdownTextSplitter,
)

from ..utils import to_int


_ALLOWED_MIME_TYPES = [
'text/plain',
'text/markdown',
'application/json',
]


def _allowed_file(file: FileStorage) -> bool:
return file.headers \
.get('type', type=str, default='') \
.split('mimetype: ') \
.pop() in _ALLOWED_MIME_TYPES


def _get_splitter_for(mimetype: str = "text/plain") -> TextSplitter:
kwargs = {
# TODO: some storage vs performance cost tests for chunk size
"chunk_size": 2000,
"chunk_overlap": 200,
"add_start_index": True,
"strip_whitespace": True,
"is_separator_regex": True,
}

if mimetype == "text/plain" or mimetype == "":
return RecursiveCharacterTextSplitter(separators=["\n\n", "\n", ".", " ", ""], **kwargs)

if mimetype == "text/markdown":
return MarkdownTextSplitter(**kwargs)

if mimetype == "application/json":
return RecursiveCharacterTextSplitter(separators=["{", "}", "[", "]", ",", ""], **kwargs)


# TODO: vectordb is a user-scoped langchain client; initialise it before calling this function
def embed_files(vectordb: VectorStore, filesIter: Iterator[FileStorage]) -> List[str]:
print("embedding files...")

files = list(filter(_allowed_file, filesIter))

contents = []
metas = []
for file in files:
contents.append(file.stream.read().decode())
metas.append({
"source": file.name,
"type": file.headers.get("type", type=str),
"modified": to_int(file.headers.get("modified", 0)),
})

if len(contents) == 0:
return []

documents = []

for text, meta in zip(contents, metas):
text_splitter = _get_splitter_for(meta.get("type"))
docs = text_splitter.create_documents([text], [meta])
        if len(docs) == 0:
            continue
        # keep every chunk the splitter produced, not just the first
        documents.extend(docs)

return vectordb.add_documents(documents)


def embed_texts(vectordb: VectorStore, texts: List[dict]) -> List[str]:
print("embedding texts...")

contents = [text.get("contents") for text in texts]
metas = [{
"source": text.get("name"),
"type": text.get("type"),
"modified": to_int(text.get("modified", 0)),
} for text in texts]

if len(contents) == 0:
return []

documents = []

for text, meta in zip(contents, metas):
text_splitter = _get_splitter_for(meta.get("type"))
docs = text_splitter.create_documents([text], [meta])
        if len(docs) == 0:
            continue
        # keep every chunk the splitter produced, not just the first
        documents.extend(docs)

return vectordb.add_documents(documents)
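A sketch of wiring `embed_texts` to a langchain `Weaviate` store backed by the docker-compose instance from the readme; the index name, text key, and embedding model are assumptions:

```python
import weaviate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Weaviate

from src.chain import embed_texts

client = weaviate.Client("http://localhost:8080")
vectordb = Weaviate(
    client,
    index_name="Documents",  # assumed collection name
    text_key="text",         # assumed property holding the page content
    embedding=HuggingFaceEmbeddings(model_name="distilbert-base-uncased"),
)

ids = embed_texts(vectordb, [{
    "name": "notes.md",
    "type": "text/markdown",
    "contents": "# Deadlines\nThe report is due Friday.",
    "modified": 1697712000,
}])
print(ids)
```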
31 changes: 31 additions & 0 deletions src/chain/one_shot.py
@@ -0,0 +1,31 @@
from typing import Any
from langchain.vectorstores import VectorStore
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.llms.base import LLM

_LLM_TEMPLATE = """Answer based on this context and do not add any imaginative details:
{context}
{question}
"""


def process_query(vectordb: VectorStore, llm: LLM, query: str, limit: int = 5) -> Any | None:
context_docs = vectordb.similarity_search(query, k=limit)

print(f"Context docs: {context_docs}")

context_text = " ".join(map(lambda d: d.page_content, context_docs))

prompt = PromptTemplate(
template=_LLM_TEMPLATE,
input_variables=["context", "question"]
)
llm_chain = LLMChain(prompt=prompt, llm=llm)

output = llm_chain.run(question=query, context=context_text)
print(f'Output: {output}')

return output
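A sketch of calling `process_query` with a llama-cpp-backed LLM, reusing the `vectordb` from the previous sketch; the model path mirrors config.yaml, and the question is a placeholder:

```python
from langchain.llms import LlamaCpp

from src.chain import process_query

llm = LlamaCpp(
    model_path="model_files/codellama-7b.Q5_K_M.gguf",
    n_ctx=2048,
)
answer = process_query(vectordb, llm, "When is the report due?", limit=5)
print(answer)
```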

