From 2817f0e04d24fc6dfad8421b1d2cbe22fc19aa39 Mon Sep 17 00:00:00 2001 From: Michael Sekamanya <86433807+mawandm@users.noreply.github.com> Date: Fri, 5 Jul 2024 16:58:49 -0700 Subject: [PATCH] fix(api): make migration columns nullable (#139) This PR - makes new document columns nullable - upgrades rag transformers and tokenizers to fix the error https://github.com/huggingface/transformers/issues/31789 Part of #108 --- .github/workflows/test_rag.yml | 2 ++ ...822101cb5_add_datasource_rag_processing_info_to_.py | 10 +++------- nesis/api/core/models/entities.py | 5 ++--- nesis/rag/Dockerfile | 1 + nesis/rag/requirements.txt | 5 +++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_rag.yml b/.github/workflows/test_rag.yml index fdf073e..b91e2a5 100644 --- a/.github/workflows/test_rag.yml +++ b/.github/workflows/test_rag.yml @@ -66,6 +66,8 @@ jobs: sudo apt update -y sudo apt install ffmpeg tesseract-ocr poppler-utils -y pip install -r nesis/rag/requirements.txt -r nesis/rag/requirements-test.txt -r nesis/rag/requirements-huggingface.txt + pip install -U transformers + pip install -U tokenizers - name: Run unit tests env: NESIS_MEMCACHE_HOSTS: localhost:11211 diff --git a/nesis/api/alembic/versions/090822101cb5_add_datasource_rag_processing_info_to_.py b/nesis/api/alembic/versions/090822101cb5_add_datasource_rag_processing_info_to_.py index 38d1e6a..8e7853c 100644 --- a/nesis/api/alembic/versions/090822101cb5_add_datasource_rag_processing_info_to_.py +++ b/nesis/api/alembic/versions/090822101cb5_add_datasource_rag_processing_info_to_.py @@ -37,15 +37,11 @@ def upgrade() -> None: op.add_column( "document", sa.Column( - "status", - sa.Enum("SUCCESS", "PROCESSING", "ERROR", name="document_status"), - nullable=False, + "status", sa.Enum("SUCCESS", "PROCESSING", "ERROR", name="document_status") ), ) - op.add_column("document", sa.Column("last_modified", sa.DateTime(), nullable=False)) - op.add_column( - "document", sa.Column("last_processed", sa.DateTime(), nullable=False) - ) + op.add_column("document", sa.Column("last_modified", sa.DateTime())) + op.add_column("document", sa.Column("last_processed", sa.DateTime())) op.add_column( "document", sa.Column("last_processed_message", sa.Text(), nullable=True) ) diff --git a/nesis/api/core/models/entities.py b/nesis/api/core/models/entities.py index abd8277..6152741 100644 --- a/nesis/api/core/models/entities.py +++ b/nesis/api/core/models/entities.py @@ -153,11 +153,10 @@ class Document(Base): store_metadata = Column(JSONB) status = Column( Enum(objects.DocumentStatus, name="document_status"), - nullable=False, default=objects.DocumentStatus.PROCESSING, ) - last_modified = Column(DateTime, default=dt.datetime.utcnow, nullable=False) - last_processed = Column(DateTime, default=dt.datetime.utcnow, nullable=False) + last_modified = Column(DateTime, default=dt.datetime.utcnow) + last_processed = Column(DateTime, default=dt.datetime.utcnow) last_processed_message = Column(Text) __table_args__ = ( diff --git a/nesis/rag/Dockerfile b/nesis/rag/Dockerfile index 59ac945..c01d9d1 100644 --- a/nesis/rag/Dockerfile +++ b/nesis/rag/Dockerfile @@ -20,6 +20,7 @@ RUN if [ "$CORE" = "cuda" ] ; \ --default-timeout=1200 ; \ fi +RUN /app/.venv/bin/pip install -U transformers tokenizers FROM python:3.11.6-slim-bookworm diff --git a/nesis/rag/requirements.txt b/nesis/rag/requirements.txt index 27fc738..8476645 100644 --- a/nesis/rag/requirements.txt +++ b/nesis/rag/requirements.txt @@ -17,14 +17,14 @@ Werkzeug==3.0.1 pandas==2.2.1 injector==0.21.0 -llama-index==0.10.23 +llama-index==0.10.52 llama-index-llms-openai-like==0.1.3 llama-index-readers-json==0.1.5 llama-index-vector-stores-postgres==0.1.4.post1 llama-index-vector-stores-chroma==0.1.6 llama-index-vector-stores-qdrant==0.1.4 llama-index-readers-file==0.1.12 -llama-index-llms-openai==0.1.12 +#llama-index-llms-openai==0.1.12 boto3==1.34.75 @@ -52,3 +52,4 @@ pillow_heif==0.16.0 # This causes conflicts from onnxruntime, so we attempt to install it last. Do not pin to a version so pip resolves it llama-index-embeddings-fastembed +