diff --git a/Dockerfile b/Dockerfile index c13ecb9ef..e59376826 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,6 @@ FROM python:3.11-slim-bullseye # Allow statements and log messages to immediately appear in the Knative logs ENV PYTHONUNBUFFERED True -# Fixes "invalid instruction" runtime error on AMD machines (specifically HF Upgraded CPU Space). -ENV HNSWLIB_NO_NATIVE 1 # Adds GCC and other build tools so we can compile hnswlib and other native/C++ deps. RUN apt-get update --fix-missing && apt-get install -y --fix-missing build-essential && \ diff --git a/lilac/data/clustering.py b/lilac/data/clustering.py index e88f96c0c..cb8ca09f3 100644 --- a/lilac/data/clustering.py +++ b/lilac/data/clustering.py @@ -449,29 +449,43 @@ def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]: output_path=cluster_output_path, sort_by=(*cluster_output_path, CATEGORY_ID), overwrite=True, - schema=field( - fields={ - CLUSTER_ID: field('int32', categorical=True), - CLUSTER_MEMBERSHIP_PROB: 'float32', - CLUSTER_TITLE: 'string', - CATEGORY_ID: field('int32', categorical=True), - CATEGORY_MEMBERSHIP_PROB: 'float32', - CATEGORY_TITLE: 'string', - }, - cluster=ClusterInfo( - min_cluster_size=min_cluster_size, - use_garden=use_garden, - input_path=(get_callable_name(input_fn_or_path),) if callable(input_fn_or_path) else path, - input_format_selector=ClusterInputFormatSelectorInfo( - format=manifest.dataset_format.name, - selector=dataset_format_input_selector.name, - ) - if dataset_format_input_selector and manifest.dataset_format - else None, - ), - ), ) + def drop_temp_text_column(items: Iterator[Item]) -> Iterator[Item]: + for item in items: + if TEXT_COLUMN in item: + del item[TEXT_COLUMN] + yield item + + # Drop the temporary newline-concatenated text column and write the final output. + dataset.transform( + drop_temp_text_column, + input_path=cluster_output_path, + output_path=cluster_output_path, + overwrite=True, + schema=field( + fields={ + CLUSTER_ID: field('int32', categorical=True), + CLUSTER_MEMBERSHIP_PROB: 'float32', + CLUSTER_TITLE: 'string', + CATEGORY_ID: field('int32', categorical=True), + CATEGORY_MEMBERSHIP_PROB: 'float32', + CATEGORY_TITLE: 'string', + }, + cluster=ClusterInfo( + min_cluster_size=min_cluster_size, + use_garden=use_garden, + input_path=(get_callable_name(input_fn_or_path),) if callable(input_fn_or_path) else path, + input_format_selector=ClusterInputFormatSelectorInfo( + format=manifest.dataset_format.name, + selector=dataset_format_input_selector.name, + ) + if dataset_format_input_selector and manifest.dataset_format + else None, + ), + ), + ) + def _hdbscan_cluster( docs: Iterator[str], diff --git a/lilac/hf_docker/Dockerfile b/lilac/hf_docker/Dockerfile index 4dddfa927..e9b697714 100644 --- a/lilac/hf_docker/Dockerfile +++ b/lilac/hf_docker/Dockerfile @@ -2,6 +2,8 @@ FROM python:3.11-slim-bullseye # Allow statements and log messages to immediately appear in the Knative logs ENV PYTHONUNBUFFERED True +# Fixes "invalid instruction" runtime error on AMD machines (specifically HF Upgraded CPU Space). +ENV HNSWLIB_NO_NATIVE 1 # Adds GCC and other build tools so we can compile hnswlib and other native/C++ deps. RUN apt-get update --fix-missing && apt-get install -y --fix-missing build-essential && \ diff --git a/lilac/router_dataset_signals.py b/lilac/router_dataset_signals.py index d959afa30..7c90335cc 100644 --- a/lilac/router_dataset_signals.py +++ b/lilac/router_dataset_signals.py @@ -1,5 +1,4 @@ """Routing endpoints for running signals on datasets.""" -from threading import Thread from typing import Annotated, Optional from fastapi import APIRouter, HTTPException @@ -122,8 +121,7 @@ def run() -> None: task_id=task_id, ) - thread = Thread(target=run, daemon=True) - thread.start() + launch_task(task_id, run) return ClusterResponse(task_id=task_id)