-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
508 changed files
with
71,210 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
__pycache__/ | ||
.mypy_cache | ||
.idea/ | ||
site_crawls/ | ||
.ipynb_checkpoints/ | ||
api_keys.py | ||
*ipynb | ||
.env* | ||
vespa-app.zip | ||
dynamic_config_storage/ | ||
celerybeat-schedule* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
[settings] | ||
known_third_party = aiohttp,alembic,asyncpg,atlassian_python_api,beautifulsoup4,boto3,celery,chardet,dask,ddtrace,distributed,fastapi,fastapi_users,fastapi_users_db_sqlalchemy,filelock,google_api_python_client,google_auth_httplib2,google_auth_oauthlib,httpcore,httpx,httpx_oauth,huggingface_hub,jira,jsonref,trafilatura,langchain,langchain_core,langchain_text_splitters,litellm,lxml,lxml_html_clean,llama_index,Mako,msal,nltk,Office365_REST_Python_Client,oauthlib,openai,openpyxl,playwright,psutil,psycopg2_binary,pycryptodome,pydantic,PyGithub,python_dateutil,python_gitlab,python_pptx,pypdf,pytest_mock,pytest_playwright,python_docx,python_dotenv,python_multipart,pywikibot,redis,requests,requests_oauthlib,retry,rfc3986,simple_salesforce,slack_sdk,SQLAlchemy,starlette,supervisor,tiktoken,timeago,transformers,unstructured,unstructured_client,uvicorn,zulip,hubspot_api_client,asana,dropbox,boto3_stubs,stripe,urllib3,mistune,sentry_sdk,prometheus_client | ||
import_heading_stdlib = Standard Library | ||
import_heading_thirdparty = Third Party | ||
import_heading_firstparty = First Party | ||
import_heading_localfolder = Local | ||
py_version = 311 # For Python 3.11 | ||
multi_line_output = 5 | ||
line_length = 500 | ||
combine_as_imports = true | ||
float_to_top = true | ||
combine_as_imports = true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
FROM python:3.11.7-slim-bookworm | ||
|
||
LABEL com.danswer.maintainer="[email protected]" | ||
LABEL com.danswer.description="This image is the web/frontend container of Onyx which \ | ||
contains code for both the Community and Enterprise editions of Onyx. If you do not \ | ||
have a contract or agreement with DanswerAI, you are not permitted to use the Enterprise \ | ||
Edition features outside of personal development or testing purposes. Please reach out to \ | ||
[email protected] for more information. Please visit https://github.com/onyx-dot-app/onyx" | ||
|
||
# Default ONYX_VERSION, typically overridden during builds by GitHub Actions. | ||
ARG ONYX_VERSION=0.8-dev | ||
ENV ONYX_VERSION=${ONYX_VERSION} \ | ||
DANSWER_RUNNING_IN_DOCKER="true" | ||
|
||
|
||
RUN echo "ONYX_VERSION: ${ONYX_VERSION}" | ||
# Install system dependencies | ||
# cmake needed for psycopg (postgres) | ||
# libpq-dev needed for psycopg (postgres) | ||
# curl included just for users' convenience | ||
# zip for Vespa step further down | ||
# ca-certificates for HTTPS | ||
RUN apt-get update && \ | ||
apt-get install -y \ | ||
cmake \ | ||
curl \ | ||
zip \ | ||
ca-certificates \ | ||
libgnutls30=3.7.9-2+deb12u3 \ | ||
libblkid1=2.38.1-5+deb12u1 \ | ||
libmount1=2.38.1-5+deb12u1 \ | ||
libsmartcols1=2.38.1-5+deb12u1 \ | ||
libuuid1=2.38.1-5+deb12u1 \ | ||
libxmlsec1-dev \ | ||
pkg-config \ | ||
gcc && \ | ||
rm -rf /var/lib/apt/lists/* && \ | ||
apt-get clean | ||
|
||
|
||
|
||
# Install Python dependencies | ||
# Remove py which is pulled in by retry, py is not needed and is a CVE | ||
COPY ./requirements/default.txt /tmp/requirements.txt | ||
COPY ./requirements/ee.txt /tmp/ee-requirements.txt | ||
RUN pip install --no-cache-dir --upgrade \ | ||
--retries 5 \ | ||
--timeout 30 \ | ||
-r /tmp/requirements.txt \ | ||
-r /tmp/ee-requirements.txt && \ | ||
pip uninstall -y py && \ | ||
playwright install chromium && \ | ||
playwright install-deps chromium && \ | ||
ln -s /usr/local/bin/supervisord /usr/bin/supervisord | ||
|
||
# Cleanup for CVEs and size reduction | ||
# https://github.com/tornadoweb/tornado/issues/3107 | ||
# xserver-common and xvfb included by playwright installation but not needed after | ||
# perl-base is part of the base Python Debian image but not needed for Onyx functionality | ||
# perl-base could only be removed with --allow-remove-essential | ||
RUN apt-get update && \ | ||
apt-get remove -y --allow-remove-essential \ | ||
perl-base \ | ||
xserver-common \ | ||
xvfb \ | ||
cmake \ | ||
libldap-2.5-0 \ | ||
libxmlsec1-dev \ | ||
pkg-config \ | ||
gcc && \ | ||
apt-get install -y libxmlsec1-openssl && \ | ||
apt-get autoremove -y && \ | ||
rm -rf /var/lib/apt/lists/* && \ | ||
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key | ||
|
||
|
||
# Pre-downloading models for setups with limited egress | ||
RUN python -c "from tokenizers import Tokenizer; \ | ||
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')" | ||
|
||
# Pre-downloading NLTK for setups with limited egress | ||
RUN python -c "import nltk; \ | ||
nltk.download('stopwords', quiet=True); \ | ||
nltk.download('punkt', quiet=True);" | ||
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed | ||
|
||
# Set up application files | ||
WORKDIR /app | ||
|
||
# Enterprise Version Files | ||
COPY ./ee /app/ee | ||
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf | ||
|
||
# Set up application files | ||
COPY ./onyx /app/onyx | ||
COPY ./shared_configs /app/shared_configs | ||
COPY ./alembic /app/alembic | ||
COPY ./alembic_tenants /app/alembic_tenants | ||
COPY ./alembic.ini /app/alembic.ini | ||
COPY supervisord.conf /usr/etc/supervisord.conf | ||
|
||
# Escape hatch | ||
COPY ./scripts/force_delete_connector_by_id.py /app/scripts/force_delete_connector_by_id.py | ||
|
||
# Put logo in assets | ||
COPY ./assets /app/assets | ||
|
||
ENV PYTHONPATH=/app | ||
|
||
# Default command which does nothing | ||
# This container is used by api server and background which specify their own CMD | ||
CMD ["tail", "-f", "/dev/null"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
FROM python:3.11.7-slim-bookworm | ||
|
||
LABEL com.danswer.maintainer="[email protected]" | ||
LABEL com.danswer.description="This image is for the Onyx model server which runs all of the \ | ||
AI models for Onyx. This container and all the code is MIT Licensed and free for all to use. \ | ||
You can find it at https://hub.docker.com/r/onyx/onyx-model-server. For more details, \ | ||
visit https://github.com/onyx-dot-app/onyx." | ||
|
||
# Default ONYX_VERSION, typically overridden during builds by GitHub Actions. | ||
ARG ONYX_VERSION=0.8-dev | ||
ENV ONYX_VERSION=${ONYX_VERSION} \ | ||
DANSWER_RUNNING_IN_DOCKER="true" | ||
|
||
|
||
RUN echo "ONYX_VERSION: ${ONYX_VERSION}" | ||
|
||
COPY ./requirements/model_server.txt /tmp/requirements.txt | ||
RUN pip install --no-cache-dir --upgrade \ | ||
--retries 5 \ | ||
--timeout 30 \ | ||
-r /tmp/requirements.txt | ||
|
||
RUN apt-get remove -y --allow-remove-essential perl-base && \ | ||
apt-get autoremove -y | ||
|
||
# Pre-downloading models for setups with limited egress | ||
# Download tokenizers, distilbert for the Onyx model | ||
# Download model weights | ||
# Run Nomic to pull in the custom architecture and have it cached locally | ||
RUN python -c "from transformers import AutoTokenizer; \ | ||
AutoTokenizer.from_pretrained('distilbert-base-uncased'); \ | ||
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \ | ||
from huggingface_hub import snapshot_download; \ | ||
snapshot_download(repo_id='danswer/hybrid-intent-token-classifier', revision='v1.0.3'); \ | ||
snapshot_download('nomic-ai/nomic-embed-text-v1'); \ | ||
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \ | ||
from sentence_transformers import SentenceTransformer; \ | ||
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True);" | ||
|
||
# In case the user has volumes mounted to /root/.cache/huggingface that they've downloaded while | ||
# running Onyx, don't overwrite it with the built in cache folder | ||
RUN mv /root/.cache/huggingface /root/.cache/temp_huggingface | ||
|
||
WORKDIR /app | ||
|
||
# Utils used by model server | ||
COPY ./onyx/utils/logger.py /app/onyx/utils/logger.py | ||
|
||
# Place to fetch version information | ||
COPY ./onyx/__init__.py /app/onyx/__init__.py | ||
|
||
# Shared between Onyx Backend and Model Server | ||
COPY ./shared_configs /app/shared_configs | ||
|
||
# Model Server main code | ||
COPY ./model_server /app/model_server | ||
|
||
ENV PYTHONPATH=/app | ||
|
||
CMD ["uvicorn", "model_server.main:app", "--host", "0.0.0.0", "--port", "9000"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Standard Library | ||
import os | ||
|
||
# Version string for the package: taken from the ONYX_VERSION environment
# variable, falling back to "Development" when it is unset or empty.
__version__ = os.getenv("ONYX_VERSION") or "Development"
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# Third Party | ||
from sqlalchemy.orm import Session | ||
|
||
# First Party | ||
from onyx.access.models import DocumentAccess | ||
from onyx.access.utils import prefix_user_email | ||
from onyx.configs.constants import PUBLIC_DOC_PAT | ||
from onyx.db.document import get_access_info_for_document, get_access_info_for_documents | ||
from onyx.db.models import User | ||
from onyx.utils.variable_functionality import fetch_versioned_implementation | ||
|
||
|
||
def _get_access_for_document(
    document_id: str,
    db_session: Session,
) -> DocumentAccess:
    """Build the `DocumentAccess` for a single document.

    The access row is fetched with `get_access_info_for_document`. Element 1
    of that row is used as the user emails and element 2 as the public flag.
    When no row is returned (or the emails element is empty/falsy), the
    emails default to an empty list; when no row is returned the document is
    treated as non-public. Group and external-permission fields are always
    passed as empty lists here.
    """
    access_row = get_access_info_for_document(
        db_session=db_session,
        document_id=document_id,
    )

    if access_row:
        emails = access_row[1] or []
        public = access_row[2]
    else:
        # No access info available for this document
        emails = []
        public = False

    return DocumentAccess.build(
        user_emails=emails,
        user_groups=[],
        external_user_emails=[],
        external_user_group_ids=[],
        is_public=public,
    )
|
||
|
||
def get_access_for_document(
    document_id: str,
    db_session: Session,
) -> DocumentAccess:
    """Return the access information for one document.

    The concrete implementation of `_get_access_for_document` is resolved at
    call time through `fetch_versioned_implementation` and then invoked with
    the given document id and session.
    """
    access_fn = fetch_versioned_implementation(
        "onyx.access.access", "_get_access_for_document"
    )
    return access_fn(document_id, db_session)  # type: ignore
|
||
|
||
def get_null_document_access() -> DocumentAccess:
    """Return a `DocumentAccess` that grants nothing.

    Every email/group collection is an empty set and `is_public` is False.
    """
    return DocumentAccess(
        is_public=False,
        user_emails=set(),
        user_groups=set(),
        external_user_emails=set(),
        external_user_group_ids=set(),
    )
|
||
|
||
def _get_access_for_documents(
    document_ids: list[str],
    db_session: Session,
) -> dict[str, DocumentAccess]:
    """Build a `DocumentAccess` for every requested document id.

    Access rows are fetched in bulk with `get_access_info_for_documents`;
    each row is unpacked as `(document_id, user_emails, is_public)`. Falsy
    email entries are dropped. Any requested id that has no row is mapped to
    the result of `get_null_document_access()`, so the returned dict always
    contains every id in `document_ids`.
    """
    document_access_info = get_access_info_for_documents(
        db_session=db_session,
        document_ids=document_ids,
    )
    doc_access = {
        document_id: DocumentAccess(
            # Set comprehension: no intermediate list is built
            user_emails={email for email in user_emails if email},
            # MIT version will wipe all groups and external groups on update
            user_groups=set(),
            is_public=is_public,
            external_user_emails=set(),
            external_user_group_ids=set(),
        )
        for document_id, user_emails, is_public in document_access_info
    }

    # Sometimes the document has not been indexed by the indexing job yet. In those
    # cases the document does not exist, so we use the least permissive access.
    # Specifically, the EE version checks the MIT version permissions and creates a
    # superset. This ensures that this flow does not fail even if the Document has
    # not yet been indexed.
    for doc_id in document_ids:
        if doc_id not in doc_access:
            doc_access[doc_id] = get_null_document_access()
    return doc_access
|
||
|
||
def get_access_for_documents(
    document_ids: list[str],
    db_session: Session,
) -> dict[str, DocumentAccess]:
    """Fetch access information for all of the given documents.

    The implementation of `_get_access_for_documents` is looked up through
    `fetch_versioned_implementation` and called with the ids and session.
    """
    bulk_access_fn = fetch_versioned_implementation(
        "onyx.access.access", "_get_access_for_documents"
    )
    return bulk_access_fn(document_ids, db_session)  # type: ignore
|
||
|
||
def _get_acl_for_user(user: User | None, db_session: Session) -> set[str]:
    """Return the set of ACL entries that the user has access to.

    This is meant to be used downstream to filter out documents the user
    cannot access: the user should have access to a document if at least one
    entry in the document's ACL matches one entry in the returned set.

    The public-document entry is always included; when a user is given, the
    prefixed form of their email is included as well. `db_session` is not
    used by this implementation.
    """
    acl_entries = {PUBLIC_DOC_PAT}
    if user:
        acl_entries.add(prefix_user_email(user.email))
    return acl_entries
|
||
|
||
def get_acl_for_user(user: User | None, db_session: Session | None = None) -> set[str]:
    """Return the ACL entries for `user`.

    The implementation of `_get_acl_for_user` is looked up through
    `fetch_versioned_implementation` and called with the user and session.
    """
    acl_fn = fetch_versioned_implementation(
        "onyx.access.access", "_get_acl_for_user"
    )
    return acl_fn(user, db_session)  # type: ignore
Oops, something went wrong.