Skip to content

Commit

Permalink
Couple of small bug fixes (#1109)
Browse files Browse the repository at this point in the history
- Fix the progress-completed reporting for clustering, and always re-add the
`field.cluster` info, even when the user re-runs without overwrite
- Fix the hnswlib bug on HF
dsmilkov authored Jan 22, 2024
1 parent 55adffc commit 2996b82
Showing 4 changed files with 38 additions and 26 deletions.
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -2,8 +2,6 @@ FROM python:3.11-slim-bullseye

# Allow statements and log messages to immediately appear in the Knative logs
ENV PYTHONUNBUFFERED True
# Fixes "invalid instruction" runtime error on AMD machines (specifically HF Upgraded CPU Space).
ENV HNSWLIB_NO_NATIVE 1

# Adds GCC and other build tools so we can compile hnswlib and other native/C++ deps.
RUN apt-get update --fix-missing && apt-get install -y --fix-missing build-essential && \
56 changes: 35 additions & 21 deletions lilac/data/clustering.py
Original file line number Diff line number Diff line change
@@ -449,29 +449,43 @@ def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]:
output_path=cluster_output_path,
sort_by=(*cluster_output_path, CATEGORY_ID),
overwrite=True,
schema=field(
fields={
CLUSTER_ID: field('int32', categorical=True),
CLUSTER_MEMBERSHIP_PROB: 'float32',
CLUSTER_TITLE: 'string',
CATEGORY_ID: field('int32', categorical=True),
CATEGORY_MEMBERSHIP_PROB: 'float32',
CATEGORY_TITLE: 'string',
},
cluster=ClusterInfo(
min_cluster_size=min_cluster_size,
use_garden=use_garden,
input_path=(get_callable_name(input_fn_or_path),) if callable(input_fn_or_path) else path,
input_format_selector=ClusterInputFormatSelectorInfo(
format=manifest.dataset_format.name,
selector=dataset_format_input_selector.name,
)
if dataset_format_input_selector and manifest.dataset_format
else None,
),
),
)

def drop_temp_text_column(items: Iterator[Item]) -> Iterator[Item]:
  """Yield every item after removing its temporary text column, if it has one."""
  for row in items:
    # The column may be absent on some rows; `pop` with a default handles both cases.
    row.pop(TEXT_COLUMN, None)
    yield row

# Drop the temporary newline-concatenated text column and write the final output.
# The input and output paths are the same and `overwrite=True`, so this pass rewrites
# the existing cluster output rather than creating a new column.
dataset.transform(
drop_temp_text_column,
input_path=cluster_output_path,
output_path=cluster_output_path,
overwrite=True,
# Schema of the final output: cluster/category ids (categorical int32), membership
# probabilities (float32) and titles (string).
schema=field(
fields={
CLUSTER_ID: field('int32', categorical=True),
CLUSTER_MEMBERSHIP_PROB: 'float32',
CLUSTER_TITLE: 'string',
CATEGORY_ID: field('int32', categorical=True),
CATEGORY_MEMBERSHIP_PROB: 'float32',
CATEGORY_TITLE: 'string',
},
# Record the clustering parameters on the output field alongside the schema.
cluster=ClusterInfo(
min_cluster_size=min_cluster_size,
use_garden=use_garden,
# A callable input is recorded by its name (as a 1-tuple); otherwise the path is used.
input_path=(get_callable_name(input_fn_or_path),) if callable(input_fn_or_path) else path,
# The format selector info is only recorded when both a selector and a dataset
# format are available; otherwise it is left as None.
input_format_selector=ClusterInputFormatSelectorInfo(
format=manifest.dataset_format.name,
selector=dataset_format_input_selector.name,
)
if dataset_format_input_selector and manifest.dataset_format
else None,
),
),
)


def _hdbscan_cluster(
docs: Iterator[str],
2 changes: 2 additions & 0 deletions lilac/hf_docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -2,6 +2,8 @@ FROM python:3.11-slim-bullseye

# Allow statements and log messages to immediately appear in the Knative logs
ENV PYTHONUNBUFFERED True
# Fixes "invalid instruction" runtime error on AMD machines (specifically HF Upgraded CPU Space).
ENV HNSWLIB_NO_NATIVE 1

# Adds GCC and other build tools so we can compile hnswlib and other native/C++ deps.
RUN apt-get update --fix-missing && apt-get install -y --fix-missing build-essential && \
4 changes: 1 addition & 3 deletions lilac/router_dataset_signals.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Routing endpoints for running signals on datasets."""
from threading import Thread
from typing import Annotated, Optional

from fastapi import APIRouter, HTTPException
@@ -122,8 +121,7 @@ def run() -> None:
task_id=task_id,
)

thread = Thread(target=run, daemon=True)
thread.start()
launch_task(task_id, run)
return ClusterResponse(task_id=task_id)


0 comments on commit 2996b82

Please sign in to comment.