Fix/file upload explore (#412)
gozineb authored Jun 29, 2023
1 parent ed61880 commit 4d9bd51
Showing 10 changed files with 130 additions and 79 deletions.
28 changes: 18 additions & 10 deletions backend/models/brains.py
@@ -40,7 +40,6 @@ def brain_size(self):
     def remaining_brain_size(self):
         return float(self.max_brain_size) - self.brain_size
 
-
     @classmethod
     def create(cls, *args, **kwargs):
         commons = common_dependencies()
@@ -79,18 +78,17 @@ def create_brain(self):
             self.id = response.data[0]['brain_id']
         return response.data
 
-    def create_brain_user(self, user_id : UUID, rights, default_brain):
+    def create_brain_user(self, user_id: UUID, rights, default_brain):
         commons = common_dependencies()
-        response = commons["supabase"].table("brains_users").insert({"brain_id": str(self.id), "user_id":str( user_id), "rights": rights, "default_brain": default_brain}).execute()
+        response = commons["supabase"].table("brains_users").insert({"brain_id": str(self.id), "user_id": str(user_id), "rights": rights, "default_brain": default_brain}).execute()
 
-
         return response.data
 
-    def create_brain_vector(self, vector_id):
+    def create_brain_vector(self, vector_id, file_sha1):
         response = (
             self.commons["supabase"]
             .table("brains_vectors")
-            .insert({"brain_id": str(self.id), "vector_id": str(vector_id)})
+            .insert({"brain_id": str(self.id), "vector_id": str(vector_id), "file_sha1": file_sha1})
             .execute()
         )
         return response.data
@@ -115,7 +113,7 @@ def update_brain_with_file(self, file_sha1: str):
         # not used
         vector_ids = self.get_vector_ids_from_file_sha1(file_sha1)
         for vector_id in vector_ids:
-            self.create_brain_vector(vector_id)
+            self.create_brain_vector(vector_id, file_sha1)
 
     def get_unique_brain_files(self):
         """
@@ -142,15 +140,24 @@ def get_unique_brain_files(self):
 
         return self.files
 
-    def get_unique_files_from_vector_ids(self, vectors_ids : List[int]):
+    def get_unique_files_from_vector_ids(self, vectors_ids: List[int]):
         # Move into Vectors class
         """
         Retrieve unique user data vectors.
         """
-        vectors_response = self.commons['supabase'].table("vectors").select(
-            "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
+        print('vectors_ids', vectors_ids)
+        print('tuple(vectors_ids)', tuple(vectors_ids))
+        if len(vectors_ids) == 1:
+            vectors_response = self.commons['supabase'].table("vectors").select(
+                "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
+                .filter("id", "eq", vectors_ids[0])\
+                .execute()
+        else:
+            vectors_response = self.commons['supabase'].table("vectors").select(
+                "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
+                .filter("id", "in", tuple(vectors_ids))\
+                .execute()
 
         documents = vectors_response.data  # Access the data from the response
         # Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
         unique_files = [dict(t) for t in set(tuple(d.items()) for d in documents)]
@@ -187,6 +194,7 @@ def get_default_user_brain(user: User):
         .execute()
     )
 
+    print("Default brain response:", response.data)
     default_brain_id = response.data[0]["brain_id"] if response.data else None
 
     print(f"Default brain id: {default_brain_id}")
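
Note: the new single-id branch in get_unique_files_from_vector_ids appears to work around how the Supabase/PostgREST "in" filter serializes a one-element Python tuple — str((42,)) keeps the trailing comma, which the API cannot parse. A minimal sketch of the pattern (the supabase client and the select_vectors_by_ids helper name are assumptions, not part of this commit):

    def select_vectors_by_ids(supabase, vectors_ids):
        # Base query over the vectors table, projecting file name and size.
        query = supabase.table("vectors").select(
            "name:metadata->>file_name, size:metadata->>file_size", count="exact")
        if len(vectors_ids) == 1:
            # tuple([42]) renders as "(42,)" in the "in" filter, which
            # PostgREST rejects, so fall back to an equality filter.
            return query.filter("id", "eq", vectors_ids[0]).execute()
        # Two or more ids render cleanly, e.g. "(41,42)".
        return query.filter("id", "in", tuple(vectors_ids)).execute()
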
63 changes: 41 additions & 22 deletions backend/models/files.py
@@ -6,24 +6,26 @@
 from fastapi import UploadFile
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from logger import get_logger
+from models.brains import Brain
 from models.settings import CommonsDep, common_dependencies
 from pydantic import BaseModel
 from utils.file import compute_sha1_from_file
 
 logger = get_logger(__name__)
 
 
 class File(BaseModel):
     id: Optional[UUID] = None
     file: Optional[UploadFile]
     file_name: Optional[str] = ""
     file_size: Optional[int] = ""
     file_sha1: Optional[str] = ""
-    vectors_ids: Optional[int]=[]
+    vectors_ids: Optional[int] = []
     file_extension: Optional[str] = ""
-    content: Optional[Any]= None
+    content: Optional[Any] = None
     chunk_size: int = 500
-    chunk_overlap: int= 0
-    documents: Optional[Any]= None
+    chunk_overlap: int = 0
+    documents: Optional[Any] = None
     _commons: Optional[CommonsDep] = None
 
     def __init__(self, **kwargs):
@@ -56,7 +58,6 @@ def compute_documents(self, loader_class):
 
         print("documents", documents)
 
-
         os.remove(tmp_file.name)
 
         text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
@@ -68,6 +69,11 @@ def compute_documents(self, loader_class):
         print(self.documents)
 
     def set_file_vectors_ids(self):
+        """
+        Set the vectors_ids property with the ids of the vectors
+        that are associated with the file in the vectors table
+        """
+
         commons = common_dependencies()
         response = (
             commons["supabase"].table("vectors")
@@ -78,32 +84,45 @@ def set_file_vectors_ids(self):
         self.vectors_ids = response.data
         return
 
-    def file_already_exists(self, brain_id):
-        commons = common_dependencies()
-
+    def file_already_exists(self):
+        """
+        Check if file already exists in vectors table
+        """
         self.set_file_vectors_ids()
 
+        print("file_sha1", self.file_sha1)
+        print("vectors_ids", self.vectors_ids)
+        print("len(vectors_ids)", len(self.vectors_ids))
+
         # if the file does not exist in vectors then no need to go check in brains_vectors
         if len(self.vectors_ids) == 0:
             return False
 
-        for vector in self.vectors_ids:
-            response = (
-                commons["supabase"].table("brains_vectors")
-                .select("brain_id, vector_id")
-                .filter("brain_id", "eq", brain_id)
-                .filter("vector_id", "eq", vector['id'])
-                .execute()
-            )
-            print("response.data", response.data)
-            if len(response.data) == 0:
-                return False
-
-
         return True
 
+    def file_already_exists_in_brain(self, brain_id):
+        commons = common_dependencies()
+        self.set_file_vectors_ids()
+        # Check if file exists in that brain
+        response = (
+            commons["supabase"].table("brains_vectors")
+            .select("brain_id, vector_id")
+            .filter("brain_id", "eq", brain_id)
+            .filter("file_sha1", "eq", self.file_sha1)
+            .execute()
+        )
+        print("response.data", response.data)
+        if len(response.data) == 0:
+            return False
+
+        return True
+
     def file_is_empty(self):
         return self.file.file._file.tell() < 1
 
-
+    def link_file_to_brain(self, brain: Brain):
+        self.set_file_vectors_ids()
+
+        for vector_id in self.vectors_ids:
+            brain.create_brain_vector(vector_id['id'], self.file_sha1)
+            print(f"Successfully linked file {self.file_sha1} to brain {brain.id}")
12 changes: 5 additions & 7 deletions backend/models/users.py
@@ -5,24 +5,23 @@
 from pydantic import BaseModel
 
 logger = get_logger(__name__)
 
 
 class User(BaseModel):
     id: UUID
     email: str
-    user_openai_api_key: str = None
     requests_count: int = 0
+    user_openai_api_key: str = None
 
-
     # [TODO] Rename the user table and its references to 'user_usage'
-    def create_user( self,date):
+    def create_user(self, date):
 
         commons = common_dependencies()
         logger.info(f"New user entry in db document for user {self.email}")
 
-        return(commons['supabase'].table("users").insert(
+        return (commons['supabase'].table("users").insert(
             {"user_id": self.id, "email": self.email, "date": date, "requests_count": 1}).execute())
 
-
     def get_user_request_stats(self):
         commons = common_dependencies()
         requests_stats = commons['supabase'].from_('users').select(
@@ -43,12 +42,11 @@ def fetch_user_requests_count(self, date):
 
         return userItem["requests_count"]
 
-
     def increment_user_request_count(self, date):
         commons = common_dependencies()
         requests_count = self.fetch_user_requests_count(date) + 1
         logger.info(f"User {self.email} request count updated to {requests_count}")
         commons['supabase'].table("users").update(
-            { "requests_count": requests_count}).match({"user_id": self.id, "date": date}).execute()
+            {"requests_count": requests_count}).match({"user_id": self.id, "date": date}).execute()
         self.requests_count = requests_count
 
3 changes: 2 additions & 1 deletion backend/parsers/common.py
@@ -31,14 +31,15 @@ async def process_file(
         }
         doc_with_metadata = Document(
             page_content=doc.page_content, metadata=metadata)
+
         neurons = Neurons(commons=commons)
         created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
         # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
 
         created_vector_id = created_vector[0]
 
         brain = Brain(id=brain_id)
-        brain.create_brain_vector(created_vector_id)
+        brain.create_brain_vector(created_vector_id, file.file_sha1)
 
     return

26 changes: 13 additions & 13 deletions backend/parsers/github.py
@@ -15,8 +15,8 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
     random_dir_name = os.urandom(16).hex()
     dateshort = time.strftime("%Y%m%d")
     loader = GitLoader(
-        clone_url=repo,
-        repo_path="/tmp/" + random_dir_name,
+        clone_url=repo,
+        repo_path="/tmp/" + random_dir_name,
     )
     documents = loader.load()
     os.system("rm -rf /tmp/" + random_dir_name)
@@ -44,21 +44,21 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
         doc_with_metadata = Document(
             page_content=doc.page_content, metadata=metadata)
 
-        file = File(file_sha1 = compute_sha1_from_content(doc.page_content.encode("utf-8")))
+        file = File(file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8")))
 
-        exist = file.file_already_exists(brain_id)
-        if not exist:
+        file_exists = file.file_already_exists()
+
+        if not file_exists:
+            print(f"Creating entry for file {file.file_sha1} in vectors...")
             neurons = Neurons(commons=commons)
             created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
+            print("Created vector sids ", created_vector)
+            print("Created vector for ", doc.metadata["file_name"])
 
-            created_vector_id = created_vector[0]
+        file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
 
+        if not file_exists_in_brain:
+            file.add_file_to_brain(brain_id)
             brain = Brain(id=brain_id)
-            brain.create_brain_vector(created_vector_id)
-
-            print("Created vector for ", doc.metadata["file_name"])
-    # add created_vector x brains in db
-
-
+            file.link_file_to_brain(brain)
     return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}

5 changes: 4 additions & 1 deletion backend/routes/brain_routes.py
@@ -131,10 +131,13 @@ async def brain_endpoint(
         brain.create_brain()
         default_brain = get_default_user_brain(current_user)
         if default_brain:
-            # create a brain X user entry
+            logger.info(f"Default brain already exists for user {current_user.id}")
             brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=False)
         else:
+            logger.info(f"Default brain does not exist for user {current_user.id}. It will be created.")
             brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=True)
 
+
         return {"id": brain.id, "name": brain.name}
 
     # update existing brain
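
Note: the branch above encodes the rule that a user's first brain becomes their default, while every later brain is created with default_brain=False. Condensed, the intent is roughly the following sketch (assign_ownership is an illustrative name, not part of the commit):

    def assign_ownership(brain, user):
        # First brain for this user -> mark it as the default.
        is_first_brain = get_default_user_brain(user) is None
        brain.create_brain_user(user_id=user.id, rights="Owner",
                                default_brain=is_first_brain)
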
4 changes: 2 additions & 2 deletions backend/routes/explore_routes.py
@@ -9,8 +9,8 @@
 explore_router = APIRouter()
 
 
-@explore_router.get("/explore", dependencies=[Depends(AuthBearer())], tags=["Explore"])
-async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"),current_user: User = Depends(get_current_user)):
+@explore_router.get("/explore/", dependencies=[Depends(AuthBearer())], tags=["Explore"])
+async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"), current_user: User = Depends(get_current_user)):
     """
     Retrieve and explore unique user data vectors.
     """
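
Note: the route is now registered as "/explore/" with a trailing slash, so clients should call it that way (FastAPI can redirect the slash-less form when redirect_slashes is enabled, its default). A sketch of a matching request — host, port, brain id, and token are placeholders:

    import requests

    response = requests.get(
        "http://localhost:5050/explore/",  # note the trailing slash
        params={"brain_id": "00000000-0000-0000-0000-000000000000"},
        headers={"Authorization": "Bearer <access token>"},  # AuthBearer() expects a bearer token
    )
    print(response.json())
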
31 changes: 22 additions & 9 deletions backend/utils/processors.py
@@ -1,4 +1,5 @@
 
+from models.brains import Brain
 from models.files import File
 from models.settings import CommonsDep
 from parsers.audio import process_audio
@@ -35,20 +36,32 @@
 }
 
 
+def create_response(message, type):
+    return {"message": message, "type": type}
+
+
 async def filter_file(commons: CommonsDep, file: File, enable_summarization: bool, brain_id, openai_api_key):
     await file.compute_file_sha1()
 
     print("file sha1", file.file_sha1)
-    if file.file_already_exists( brain_id):
-        return {"message": f"🤔 {file.file.filename} already exists in brain {brain_id}.", "type": "warning"}
+    file_exists = file.file_already_exists()
+    file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
+
+    if file_exists_in_brain:
+        return create_response(f"🤔 {file.file.filename} already exists in brain {brain_id}.", "warning")
     elif file.file_is_empty():
-        return {"message": f"❌ {file.file.filename} is empty.", "type": "error"}
-    else:
-        if file.file_extension in file_processors:
-            await file_processors[file.file_extension](commons,file, enable_summarization, brain_id ,openai_api_key )
-            return {"message": f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "type": "success"}
-        else:
-            return {"message": f"❌ {file.file.filename} is not supported.", "type": "error"}
+        return create_response(f"❌ {file.file.filename} is empty.", "error")
+    elif file_exists:
+        file.link_file_to_brain(brain=Brain(id=brain_id))
+        return create_response(f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "success")
+
+    if file.file_extension in file_processors:
+        try:
+            await file_processors[file.file_extension](commons, file, enable_summarization, brain_id, openai_api_key)
+            return create_response(f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "success")
+        except Exception as e:
+            # Add more specific exceptions as needed.
+            print(f"Error processing file: {e}")
+            return create_response(f"⚠️ An error occurred while processing {file.file.filename}.", "error")
+
+    return create_response(f"❌ {file.file.filename} is not supported.", "error")
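
Note: after this change filter_file resolves in a fixed order — in-brain duplicate, empty file, known-but-unlinked content, then the extension-specific processors wrapped in a broad try/except. Roughly, the expected outcomes per case (a sketch of the payload shapes, messages abbreviated):

    # In-brain duplicate  -> {"message": "🤔 ...", "type": "warning"}
    # Empty upload        -> {"message": "❌ ... is empty.", "type": "error"}
    # Known content       -> linked only, {"message": "✅ ...", "type": "success"}
    # Supported extension -> processed,   {"message": "✅ ...", "type": "success"}
    # Processor raised    -> {"message": "⚠️ ...", "type": "error"}
    # Unknown extension   -> {"message": "❌ ... is not supported.", "type": "error"}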