Skip to content

Commit

Permalink
support new content providers
Browse files Browse the repository at this point in the history
- match containing string in source names
- add HTTPX_VERIFY_SSL to allow self-signed certs
- add /search for testing

Signed-off-by: Anupam Kumar <[email protected]>
  • Loading branch information
kyteinsky committed Feb 9, 2024
1 parent 7d8f2f7 commit aa39601
Show file tree
Hide file tree
Showing 10 changed files with 128 additions and 31 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ COPY context_chat_backend context_chat_backend
COPY main.py .
COPY config.yaml .

CMD ["python3", "main.py"]
ENTRYPOINT ["python3", "main.py"]
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ register28:
docker exec master-nextcloud-1 sudo -u www-data php occ app_api:app:unregister context_chat_backend --silent || true
docker exec master-nextcloud-1 sudo -u www-data php occ app_api:app:register context_chat_backend manual_install --json-info \
"{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"1.0.0\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" \
--force-scopes
--force-scopes --wait-finish

4 changes: 2 additions & 2 deletions appinfo/info.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Three mandatory apps are required for this app to work (can be installed from th
This app then should be installed from the "External Apps" page.
]]></description>
<version>1.0.1</version>
<version>1.0.2</version>
<licence>agpl</licence>
<author mail="[email protected]" homepage="https://github.com/kyteinsky">Anupam Kumar</author>
<namespace>Context Chat</namespace>
Expand All @@ -29,7 +29,7 @@ This app then should be installed from the "External Apps" page.
<docker-install>
<registry>ghcr.io</registry>
<image>nextcloud/context_chat_backend</image>
<image-tag>1.0.0</image-tag>
<image-tag>1.0.2</image-tag>
</docker-install>
<scopes>
<required>
Expand Down
47 changes: 46 additions & 1 deletion context_chat_backend/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,24 @@ def _(userId: str):
)


# TODO: for testing, remove later
@app.get('/search')
def _(userId: str, keyword: str):
    '''
    Testing-only endpoint: query the user's collection and return the
    matching entries' metadata as JSON.

    Args
    ----
    userId: str
        User whose collection is queried.
    keyword: str
        Keyword placed into the document filter passed to the collection.

    Returns
    -------
    JSONResponse
        The raw result of the collection's `get` call (metadatas included),
        or a 500 response if the vector DB has not been initialised.
    '''
    # imported locally so the chromadb dependency is only touched by this test route
    from chromadb import ClientAPI
    from .utils import COLLECTION_NAME

    db: BaseVectorDB = app.extra.get('VECTOR_DB')

    # same guard as the other handlers: without it `db.client` raises AttributeError
    if db is None:
        return JSONResponse('Error: VectorDB not initialised', 500)

    client: ClientAPI = db.client
    # make sure the user's collection exists before querying it
    db.setup_schema(userId)

    # NOTE(review): Chroma's `where_document` `$contains` operator presumably expects a plain
    # string and matches document text, not metadata -- confirm that this filter shape does
    # what is intended (matching `keyword` inside the 'source' metadata) before relying on it.
    return JSONResponse(
        client.get_collection(COLLECTION_NAME(userId)).get(
            where_document={'$contains': [{'source': keyword}]},
            include=['metadatas'],
        )
    )


@app.put('/enabled')
def _(enabled: bool):
print(f'{enabled:}')
Expand All @@ -108,7 +126,7 @@ def _(userId: Annotated[str, Body()], sourceNames: Annotated[list[str], Body()])
if db is None:
return JSONResponse('Error: VectorDB not initialised', 500)

source_objs = db.get_objects_from_sources(userId, sourceNames)
source_objs = db.get_objects_from_metadata(userId, 'source', sourceNames)
res = db.delete_by_ids(userId, [
source.get('id')
for source in source_objs.values()
Expand All @@ -128,6 +146,33 @@ def _(userId: Annotated[str, Body()], sourceNames: Annotated[list[str], Body()])
return JSONResponse('All valid sources deleted')


@app.post('/deleteMatchingSources')
def _(userId: Annotated[str, Body()], keyword: Annotated[str, Body()]):
    '''
    Delete every stored object of a user whose 'source' metadata contains `keyword`.

    Args
    ----
    userId: str
        User whose objects are to be deleted.
    keyword: str
        String to look for inside the 'source' metadata of the stored objects.

    Returns
    -------
    JSONResponse
        500 if the vector DB is not initialised, 400 if the deletion reported
        a failure (`False`), otherwise a success message.
    '''
    db: BaseVectorDB = app.extra.get('VECTOR_DB')

    if db is None:
        return JSONResponse('Error: VectorDB not initialised', 500)

    # last argument is `contains=True`: match objects whose 'source' contains the keyword
    objs = db.get_objects_from_metadata(userId, 'source', [keyword], True)

    # Pass the ID itself to `value_of` and compare its result with None. The previous form,
    # `value_of(obj.get('id') is not None)`, handed a boolean to `value_of`.
    # NOTE(review): `value_of` is defined outside this view; presumably it maps empty values
    # to None -- confirm.
    res = db.delete_by_ids(userId, [
        obj.get('id')
        for obj in objs.values()
        if value_of(obj.get('id')) is not None
    ])

    # NOTE: None returned in `delete_by_ids` should have meant an error but it didn't in the case of
    # weaviate maybe because of the way weaviate wrapper is implemented (langchain's api does not take
    # class name as input, which will be required in future versions of weaviate)
    if res is None:
        print(
            'Deletion query returned "None". This can happen in Weaviate even if the deletion was '
            'successful, therefore not considered an error for now.'
        )

    if res is False:
        return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400)

    return JSONResponse('All valid sources deleted')


@app.put('/loadSources')
def _(sources: list[UploadFile]):
if len(sources) == 0:
Expand Down
3 changes: 3 additions & 0 deletions context_chat_backend/ocs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,15 @@ def ocs_call(

_sign_request(headers, kwargs.get('username', ''))

verify_ssl = getenv('HTTPX_VERIFY_SSL', '1') == '1'

return httpx.request(
method=method.upper(),
url=f'{get_nc_url()}/{path.removeprefix("/")}',
params=params,
content=data_bytes,
headers=headers,
verify=verify_ssl,
**kwargs,
)

23 changes: 17 additions & 6 deletions context_chat_backend/vectordb/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,28 @@ def setup_schema(self, user_id: str) -> None:
'''

@abstractmethod
def get_objects_from_sources(self, user_id: str, source_names: List[str]) -> dict:
def get_objects_from_metadata(
self,
user_id: str,
metadata_key: str,
values: List[str],
contains: bool = False,
) -> dict:
'''
Get all objects with the given source names.
(Only gets the following fields: [id, source, modified])
Get all objects with the given metadata key and values.
(Only gets the following fields: [id, 'metadata_key', modified])
Args
----
user_id: str
User ID for whose database to get the sources.
source_names: List[str]
List of source names to get.
metadata_key: str
Metadata key to get.
values: List[str]
List of metadata names to get.
contains: bool
If True, gets all objects that contain any of the given values,
otherwise gets all objects that have the given values.
Returns
-------
Expand All @@ -71,7 +82,7 @@ def get_objects_from_sources(self, user_id: str, source_names: List[str]) -> dic
otherwise:
{
['source': str]: {
['metadata_key': str]: {
'id': str,
'modified': str,
}
Expand Down
28 changes: 22 additions & 6 deletions context_chat_backend/vectordb/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,21 +60,37 @@ def get_user_client(
embedding_function=em,
)

def get_objects_from_sources(self, user_id: str, source_names: List[str]) -> dict:
def get_objects_from_metadata(
self,
user_id: str,
metadata_key: str,
values: List[str],
contains: bool = False,
) -> dict:
# NOTE: the limit of objects returned is not known, maybe it would be better to set one manually

if not self.client:
raise Exception('Error: Chromadb client not initialised')

self.setup_schema(user_id)

sources_filter = {'$or': [{ 'source': source } for source in source_names]}
# placeholder 'or' for single source above
sources_filter['$or'].append({ '': { '$in': source_names } })
if len(values) == 0:
return {}

if len(values) == 1:
if contains:
data_filter = { metadata_key: { '$in': values[0] } }
else:
data_filter = { metadata_key: values[0] }
else:
if contains:
data_filter = {'$or': [{ metadata_key: { '$in': val } } for val in values]}
else:
data_filter = {'$or': [{ metadata_key: val } for val in values]}

try:
results = self.client.get_collection(COLLECTION_NAME(user_id)).get(
where=sources_filter,
where=data_filter,
include=['metadatas']
)
except Exception as e:
Expand All @@ -88,7 +104,7 @@ def get_objects_from_sources(self, user_id: str, source_names: List[str]) -> dic
try:
for i, _id in enumerate(results.get('ids')):
meta = results['metadatas'][i]
output[meta['source']] = {
output[meta[metadata_key]] = {
'id': _id,
'modified': meta['modified'],
}
Expand Down
43 changes: 32 additions & 11 deletions context_chat_backend/vectordb/weaviate.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,13 @@ def get_user_client(

return weaviate_obj

def get_objects_from_sources(self, user_id: str, source_names: List[str]) -> dict:
def get_objects_from_metadata(
self,
user_id: str,
metadata_key: str,
values: List[str],
contains: bool = False,
) -> dict:
# NOTE: the limit of objects returned is not known, maybe it would be better to set one manually

if not self.client:
Expand All @@ -129,35 +135,50 @@ def get_objects_from_sources(self, user_id: str, source_names: List[str]) -> dic
if not self.client.schema.exists(COLLECTION_NAME(user_id)):
self.setup_schema(user_id)

file_filter = {
'path': ['source'],
if len(values) == 0:
return {}

# todo
if len(values) == 1:
if contains:
data_filter = { metadata_key: { '$in': values[0] } }
else:
data_filter = { metadata_key: values[0] }
else:
if contains:
data_filter = {'$or': [{ metadata_key: { '$in': val } } for val in values]}
else:
data_filter = {'$or': [{ metadata_key: val } for val in values]}

data_filter = {
'path': [metadata_key],
'operator': 'ContainsAny',
'valueTextList': source_names,
'valueTextList': values,
}

results = self.client.query \
.get(COLLECTION_NAME(user_id), ['source', 'modified']) \
.get(COLLECTION_NAME(user_id), [metadata_key, 'modified']) \
.with_additional('id') \
.with_where(file_filter) \
.with_where(data_filter) \
.do()

if results.get('errors') is not None:
log_error(f'Error: Weaviate query error: {results.get("errors")}')
return {}

dsources = {}
for source in source_names:
dsources[source] = True
dmeta = {}
for val in values:
dmeta[val] = True

try:
results = results['data']['Get'][COLLECTION_NAME(user_id)]
output = {}
for result in results:
# case sensitive matching
if dsources.get(result['source']) is None:
if dmeta.get(result[metadata_key]) is None:
continue

output[result['source']] = {
output[result[metadata_key]] = {
'id': result['_additional']['id'],
'modified': result['modified'],
}
Expand Down
5 changes: 3 additions & 2 deletions example.env
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

DEBUG=1
DISABLE_AAA=0
HTTPX_VERIFY_SSL=0

# Model files directory
SENTENCE_TRANSFORMERS_HOME=./model_files
Expand All @@ -26,10 +27,10 @@ AA_VERSION=1.4.4
APP_SECRET=12345
APP_ID=context_chat_backend
APP_DISPLAY_NAME=Context Chat Backend
APP_VERSION=1.0.0
APP_VERSION=1.0.2
APP_PROTOCOL=http
APP_HOST=0.0.0.0
APP_PORT=10034
APP_PERSISTENT_STORAGE=1
APP_PERSISTENT_STORAGE=
IS_SYSTEM_APP=0
NEXTCLOUD_URL=http://nextcloud.local
2 changes: 1 addition & 1 deletion readme.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
```
occ app_api:app:register context_chat_backend manual_install --json-info \
"{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"1.0.2\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" \
--force-scopes
--force-scopes --wait-finish
```

The command to unregister is given below (force is used to also remove apps whose container has been removed)
Expand Down

0 comments on commit aa39601

Please sign in to comment.