Skip to content

Commit

Permalink
feat: add latest danswer codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
JayGhiya committed Dec 20, 2024
1 parent 5c0c8c4 commit d4427d6
Show file tree
Hide file tree
Showing 508 changed files with 71,210 additions and 0 deletions.
11 changes: 11 additions & 0 deletions danswer/danswer_experiment/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
__pycache__/
.mypy_cache
.idea/
site_crawls/
.ipynb_checkpoints/
api_keys.py
*ipynb
.env*
vespa-app.zip
dynamic_config_storage/
celerybeat-schedule*
12 changes: 12 additions & 0 deletions danswer/danswer_experiment/.isort.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[settings]
known_third_party = aiohttp,alembic,asyncpg,atlassian_python_api,beautifulsoup4,boto3,celery,chardet,dask,ddtrace,distributed,fastapi,fastapi_users,fastapi_users_db_sqlalchemy,filelock,google_api_python_client,google_auth_httplib2,google_auth_oauthlib,httpcore,httpx,httpx_oauth,huggingface_hub,jira,jsonref,trafilatura,langchain,langchain_core,langchain_text_splitters,litellm,lxml,lxml_html_clean,llama_index,Mako,msal,nltk,Office365_REST_Python_Client,oauthlib,openai,openpyxl,playwright,psutil,psycopg2_binary,pycryptodome,pydantic,PyGithub,python_dateutil,python_gitlab,python_pptx,pypdf,pytest_mock,pytest_playwright,python_docx,python_dotenv,python_multipart,pywikibot,redis,requests,requests_oauthlib,retry,rfc3986,simple_salesforce,slack_sdk,SQLAlchemy,starlette,supervisor,tiktoken,timeago,transformers,unstructured,unstructured_client,uvicorn,zulip,hubspot_api_client,asana,dropbox,boto3_stubs,stripe,urllib3,mistune,sentry_sdk,prometheus_client
import_heading_stdlib = Standard Library
import_heading_thirdparty = Third Party
import_heading_firstparty = First Party
import_heading_localfolder = Local
py_version = 311 # For Python 3.11
multi_line_output = 5
line_length = 500
combine_as_imports = true
float_to_top = true
combine_as_imports = true
112 changes: 112 additions & 0 deletions danswer/danswer_experiment/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
FROM python:3.11.7-slim-bookworm

LABEL com.danswer.maintainer="[email protected]"
LABEL com.danswer.description="This image is the web/frontend container of Onyx which \
contains code for both the Community and Enterprise editions of Onyx. If you do not \
have a contract or agreement with DanswerAI, you are not permitted to use the Enterprise \
Edition features outside of personal development or testing purposes. Please reach out to \
[email protected] for more information. Please visit https://github.com/onyx-dot-app/onyx"

# Default ONYX_VERSION, typically overridden during builds by GitHub Actions.
ARG ONYX_VERSION=0.8-dev
ENV ONYX_VERSION=${ONYX_VERSION} \
DANSWER_RUNNING_IN_DOCKER="true"


RUN echo "ONYX_VERSION: ${ONYX_VERSION}"
# Install system dependencies
# cmake needed for psycopg (postgres)
# libpq-dev needed for psycopg (postgres)
# curl included just for users' convenience
# zip for Vespa step further down
# ca-certificates for HTTPS
RUN apt-get update && \
apt-get install -y \
cmake \
curl \
zip \
ca-certificates \
libgnutls30=3.7.9-2+deb12u3 \
libblkid1=2.38.1-5+deb12u1 \
libmount1=2.38.1-5+deb12u1 \
libsmartcols1=2.38.1-5+deb12u1 \
libuuid1=2.38.1-5+deb12u1 \
libxmlsec1-dev \
pkg-config \
gcc && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean



# Install Python dependencies
# Remove py which is pulled in by retry, py is not needed and is a CVE
COPY ./requirements/default.txt /tmp/requirements.txt
COPY ./requirements/ee.txt /tmp/ee-requirements.txt
RUN pip install --no-cache-dir --upgrade \
--retries 5 \
--timeout 30 \
-r /tmp/requirements.txt \
-r /tmp/ee-requirements.txt && \
pip uninstall -y py && \
playwright install chromium && \
playwright install-deps chromium && \
ln -s /usr/local/bin/supervisord /usr/bin/supervisord

# Cleanup for CVEs and size reduction
# https://github.com/tornadoweb/tornado/issues/3107
# xserver-common and xvfb included by playwright installation but not needed after
# perl-base is part of the base Python Debian image but not needed for Onyx functionality
# perl-base could only be removed with --allow-remove-essential
RUN apt-get update && \
apt-get remove -y --allow-remove-essential \
perl-base \
xserver-common \
xvfb \
cmake \
libldap-2.5-0 \
libxmlsec1-dev \
pkg-config \
gcc && \
apt-get install -y libxmlsec1-openssl && \
apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/* && \
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key


# Pre-downloading models for setups with limited egress
RUN python -c "from tokenizers import Tokenizer; \
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"

# Pre-downloading NLTK for setups with limited egress
RUN python -c "import nltk; \
nltk.download('stopwords', quiet=True); \
nltk.download('punkt', quiet=True);"
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed

# Set up application files
WORKDIR /app

# Enterprise Version Files
COPY ./ee /app/ee
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Set up application files
COPY ./onyx /app/onyx
COPY ./shared_configs /app/shared_configs
COPY ./alembic /app/alembic
COPY ./alembic_tenants /app/alembic_tenants
COPY ./alembic.ini /app/alembic.ini
COPY supervisord.conf /usr/etc/supervisord.conf

# Escape hatch
COPY ./scripts/force_delete_connector_by_id.py /app/scripts/force_delete_connector_by_id.py

# Put logo in assets
COPY ./assets /app/assets

ENV PYTHONPATH=/app

# Default command which does nothing
# This container is used by api server and background which specify their own CMD
CMD ["tail", "-f", "/dev/null"]
60 changes: 60 additions & 0 deletions danswer/danswer_experiment/Dockerfile.model_server
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
FROM python:3.11.7-slim-bookworm

LABEL com.danswer.maintainer="[email protected]"
LABEL com.danswer.description="This image is for the Onyx model server which runs all of the \
AI models for Onyx. This container and all the code is MIT Licensed and free for all to use. \
You can find it at https://hub.docker.com/r/onyx/onyx-model-server. For more details, \
visit https://github.com/onyx-dot-app/onyx."

# Default ONYX_VERSION, typically overridden during builds by GitHub Actions.
ARG ONYX_VERSION=0.8-dev
ENV ONYX_VERSION=${ONYX_VERSION} \
DANSWER_RUNNING_IN_DOCKER="true"


RUN echo "ONYX_VERSION: ${ONYX_VERSION}"

COPY ./requirements/model_server.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade \
--retries 5 \
--timeout 30 \
-r /tmp/requirements.txt

RUN apt-get remove -y --allow-remove-essential perl-base && \
apt-get autoremove -y

# Pre-downloading models for setups with limited egress
# Download tokenizers, distilbert for the Onyx model
# Download model weights
# Run Nomic to pull in the custom architecture and have it cached locally
RUN python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('distilbert-base-uncased'); \
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
from huggingface_hub import snapshot_download; \
snapshot_download(repo_id='danswer/hybrid-intent-token-classifier', revision='v1.0.3'); \
snapshot_download('nomic-ai/nomic-embed-text-v1'); \
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
from sentence_transformers import SentenceTransformer; \
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True);"

# In case the user has volumes mounted to /root/.cache/huggingface that they've downloaded while
# running Onyx, don't overwrite it with the built in cache folder
RUN mv /root/.cache/huggingface /root/.cache/temp_huggingface

WORKDIR /app

# Utils used by model server
COPY ./onyx/utils/logger.py /app/onyx/utils/logger.py

# Place to fetch version information
COPY ./onyx/__init__.py /app/onyx/__init__.py

# Shared between Onyx Backend and Model Server
COPY ./shared_configs /app/shared_configs

# Model Server main code
COPY ./model_server /app/model_server

ENV PYTHONPATH=/app

CMD ["uvicorn", "model_server.main:app", "--host", "0.0.0.0", "--port", "9000"]
4 changes: 4 additions & 0 deletions danswer/danswer_experiment/onyx/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Standard Library
import os

# Version string for the package: taken from the ONYX_VERSION environment
# variable, falling back to "Development" when it is unset or empty.
__version__ = os.getenv("ONYX_VERSION") or "Development"
Empty file.
109 changes: 109 additions & 0 deletions danswer/danswer_experiment/onyx/access/access.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Third Party
from sqlalchemy.orm import Session

# First Party
from onyx.access.models import DocumentAccess
from onyx.access.utils import prefix_user_email
from onyx.configs.constants import PUBLIC_DOC_PAT
from onyx.db.document import get_access_info_for_document, get_access_info_for_documents
from onyx.db.models import User
from onyx.utils.variable_functionality import fetch_versioned_implementation


def _get_access_for_document(
    document_id: str,
    db_session: Session,
) -> DocumentAccess:
    """Build the access record for a single document.

    Looks up the access info for `document_id` and feeds it into
    `DocumentAccess.build`. Element 1 of the lookup result is used as the
    user emails and element 2 as the public flag. Group and external
    permission fields are always passed as empty lists here.

    If the lookup returns nothing, the document is treated as having no
    users and not being public.
    """
    access_row = get_access_info_for_document(
        db_session=db_session,
        document_id=document_id,
    )

    if access_row:
        # A missing/empty email collection is normalised to an empty list
        emails = access_row[1] or []
        public_flag = access_row[2]
    else:
        emails = []
        public_flag = False

    return DocumentAccess.build(
        user_emails=emails,
        user_groups=[],
        external_user_emails=[],
        external_user_group_ids=[],
        is_public=public_flag,
    )


def get_access_for_document(
    document_id: str,
    db_session: Session,
) -> DocumentAccess:
    """Return the access record for one document.

    The work is delegated to whichever `_get_access_for_document`
    `fetch_versioned_implementation` resolves for "onyx.access.access".
    NOTE(review): presumably this lets another edition substitute its own
    implementation - confirm in onyx.utils.variable_functionality.
    """
    resolved_impl = fetch_versioned_implementation(
        "onyx.access.access", "_get_access_for_document"
    )
    access = resolved_impl(document_id, db_session)  # type: ignore
    return access


def get_null_document_access() -> DocumentAccess:
    """Return a `DocumentAccess` that grants nothing.

    Every user/group collection is an empty set and the document is marked
    as not public.
    """
    return DocumentAccess(
        user_emails=set(),
        user_groups=set(),
        external_user_emails=set(),
        external_user_group_ids=set(),
        is_public=False,
    )


def _get_access_for_documents(
    document_ids: list[str],
    db_session: Session,
) -> dict[str, DocumentAccess]:
    """Build access records for a batch of documents.

    Returns a mapping from document id to `DocumentAccess`. Every id in
    `document_ids` is guaranteed to have an entry: ids for which the lookup
    returned no row get the null (no access) record.
    """
    access_rows = get_access_info_for_documents(
        db_session=db_session,
        document_ids=document_ids,
    )

    access_by_doc_id: dict[str, DocumentAccess] = {}
    for found_id, emails, public_flag in access_rows:
        access_by_doc_id[found_id] = DocumentAccess(
            # Drop empty / missing email entries
            user_emails={address for address in emails if address},
            # MIT version will wipe all groups and external groups on update
            user_groups=set(),
            is_public=public_flag,
            external_user_emails=set(),
            external_user_group_ids=set(),
        )

    # Sometimes the document has not been indexed by the indexing job yet. In those
    # cases the document does not exist, so we fall back to the least permissive
    # access. Specifically, the EE version checks the MIT version permissions and
    # creates a superset. This ensures that this flow does not fail even if the
    # Document has not yet been indexed.
    for requested_id in document_ids:
        if requested_id in access_by_doc_id:
            continue
        access_by_doc_id[requested_id] = get_null_document_access()

    return access_by_doc_id


def get_access_for_documents(
    document_ids: list[str],
    db_session: Session,
) -> dict[str, DocumentAccess]:
    """Fetch all access information for the given documents.

    Delegates to whichever `_get_access_for_documents`
    `fetch_versioned_implementation` resolves for "onyx.access.access".
    """
    resolved_impl = fetch_versioned_implementation(
        "onyx.access.access", "_get_access_for_documents"
    )
    access_by_doc_id = resolved_impl(document_ids, db_session)  # type: ignore
    return access_by_doc_id


def _get_acl_for_user(user: User | None, db_session: Session) -> set[str]:
    """Return the set of ACL entries the given user matches.

    Intended for downstream filtering: a user may see a document when at
    least one entry of the document's ACL is also in the returned set.

    The public-document entry is always included. When a user is provided,
    their prefixed email is included as well. `db_session` is not used by
    this implementation.
    """
    acl_entries = {PUBLIC_DOC_PAT}
    if user:
        acl_entries.add(prefix_user_email(user.email))
    return acl_entries


def get_acl_for_user(user: User | None, db_session: Session | None = None) -> set[str]:
    """Return the ACL entries for `user` (or for an anonymous caller if None).

    Delegates to whichever `_get_acl_for_user`
    `fetch_versioned_implementation` resolves for "onyx.access.access",
    passing `user` and `db_session` through unchanged.
    """
    resolved_impl = fetch_versioned_implementation(
        "onyx.access.access", "_get_acl_for_user"
    )
    acl_entries = resolved_impl(user, db_session)  # type: ignore
    return acl_entries
Loading

0 comments on commit d4427d6

Please sign in to comment.