From 3d04b1ede57a3116d14a128bbd6b4f19e66635d3 Mon Sep 17 00:00:00 2001 From: Egil Date: Thu, 10 Oct 2024 20:24:16 +0200 Subject: [PATCH 1/5] Outliers --- docetl/operations/outliers.py | 69 +++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 70 insertions(+) create mode 100644 docetl/operations/outliers.py diff --git a/docetl/operations/outliers.py b/docetl/operations/outliers.py new file mode 100644 index 00000000..d131ddbd --- /dev/null +++ b/docetl/operations/outliers.py @@ -0,0 +1,69 @@ +from jinja2 import Environment, Template +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict, List, Optional, Tuple +import numpy as np +from .base import BaseOperation +from .utils import RichLoopBar +from .clustering_utils import get_embeddings_for_clustering + +class OutliersOperation(BaseOperation): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.max_batch_size: int = self.config.get( + "max_batch_size", kwargs.get("max_batch_size", float("inf")) + ) + + def syntax_check(self) -> None: + """ + Checks the configuration of the OutlierOperation for required keys and valid structure. + + Raises: + ValueError: If required keys are missing + """ + + pass + + + def execute( + self, input_data: List[Dict], is_build: bool = False + ) -> Tuple[List[Dict], float]: + """ + Executes the cluster operation on the input data. Modifies the + input data and returns it in place. + + Args: + input_data (List[Dict]): A list of dictionaries to process. + is_build (bool): Whether the operation is being executed + in the build phase. Defaults to False. + + Returns: + Tuple[List[Dict], float]: A tuple containing the filtered + list of dictionaries and the total cost of the operation. 
+ """ + + embeddings, cost = get_embeddings_for_clustering( + input_data, self.config, self.runner.api + ) + + embeddings = np.array(embeddings) + center = embeddings.mean(axis=0) + + distances = np.sqrt(((embeddings - center)**2).sum(axis=1)) + + if "percentile" in self.config: + distance_distribution = np.sort(distances) + cutoff = distance_distribution[int(self.config["percentile"] / 100. * (len(distance_distribution)-1))] + elif "std" in self.config: + cutoff = np.sqrt((embeddings.std(axis=0)**2).sum()) * self.config["std"] + + include = distances <= cutoff + + return [ + item + for idx, item in enumerate(input_data) + if include[idx]], cost + diff --git a/pyproject.toml b/pyproject.toml index e0cf9bb2..f0d65afd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,6 +88,7 @@ reduce = "docetl.operations.reduce:ReduceOperation" resolve = "docetl.operations.resolve:ResolveOperation" gather = "docetl.operations.gather:GatherOperation" cluster = "docetl.operations.cluster:ClusterOperation" +outliers = "docetl.operations.outliers:OutliersOperation" [tool.poetry.plugins."docetl.parser"] llama_index_simple_directory_reader = "docetl.parsing_tools:llama_index_simple_directory_reader" From 2e4b8c37f6fcc253ea288491ca658a7a15be42f2 Mon Sep 17 00:00:00 2001 From: Egil Date: Fri, 11 Oct 2024 17:07:05 +0200 Subject: [PATCH 2/5] Changed api to look more like sample. Maybe these two should even be merged? --- docetl/operations/outliers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docetl/operations/outliers.py b/docetl/operations/outliers.py index d131ddbd..93002112 100644 --- a/docetl/operations/outliers.py +++ b/docetl/operations/outliers.py @@ -54,9 +54,12 @@ def execute( distances = np.sqrt(((embeddings - center)**2).sum(axis=1)) - if "percentile" in self.config: + if "samples" in self.config: distance_distribution = np.sort(distances) - cutoff = distance_distribution[int(self.config["percentile"] / 100. 
* (len(distance_distribution)-1))] + samples = self.config["samples"] + if isinstance(samples, float): + samples = int(samples * (len(distance_distribution)-1)) + cutoff = distance_distribution[samples] elif "std" in self.config: cutoff = np.sqrt((embeddings.std(axis=0)**2).sum()) * self.config["std"] From a3e53ab632473b7e18b15cd777c9b9c829d6f4ee Mon Sep 17 00:00:00 2001 From: Egil Date: Fri, 11 Oct 2024 17:35:54 +0200 Subject: [PATCH 3/5] More options for outliers --- docetl/operations/outliers.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docetl/operations/outliers.py b/docetl/operations/outliers.py index 93002112..391c1ab3 100644 --- a/docetl/operations/outliers.py +++ b/docetl/operations/outliers.py @@ -48,9 +48,16 @@ def execute( embeddings, cost = get_embeddings_for_clustering( input_data, self.config, self.runner.api ) - embeddings = np.array(embeddings) - center = embeddings.mean(axis=0) + + if self.config.get("center", None) is not None: + center_embeddings, cost2 = get_embeddings_for_clustering( + [self.config["center"]], self.config, self.runner.api + ) + cost += cost2 + center = np.array(center_embeddings[0]) + else: + center = embeddings.mean(axis=0) distances = np.sqrt(((embeddings - center)**2).sum(axis=1)) @@ -62,9 +69,12 @@ def execute( cutoff = distance_distribution[samples] elif "std" in self.config: cutoff = np.sqrt((embeddings.std(axis=0)**2).sum()) * self.config["std"] - - include = distances <= cutoff - + + if not self.config.get("keep", False): + include = distances <= cutoff + else: + include = distances > cutoff + return [ item for idx, item in enumerate(input_data) From 7a13061b110c39e1262020ef6fcbd1d0060b8c53 Mon Sep 17 00:00:00 2001 From: Egil Date: Fri, 11 Oct 2024 17:41:15 +0200 Subject: [PATCH 4/5] Added docs --- docs/operators/outliers.md | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 docs/operators/outliers.md diff --git 
a/docs/operators/outliers.md b/docs/operators/outliers.md new file mode 100644 index 00000000..47203149 --- /dev/null +++ b/docs/operators/outliers.md @@ -0,0 +1,41 @@ +# Outliers operation + +The Outliers operation in DocETL removes outliers from the input (or +keeps only outliers). + +## 🚀 Example: + +```yaml +- name: remove-worst-10 + type: outliers + samples: 0.9 + embedding_keys: + - concept + - description +``` + +This will keep the 90 percent closest to the center (average) +embedding of the keys provided. Alternatively, you could set samples +to an integer count of items to keep (or a negative number to throw +away). You can also assume a Gaussian distribution and set the key std +to a number of standard deviations out from the center, instead of +setting samples. + +Small note about embeddings: If you embed values that are too short, some +embedding models will yield a very "sparse" distribution, where the +absolute majority of points lie on the surface of a hypersphere, +meaning that this operation will not work very well! + +## Required Parameters + +- `name`: A unique name for the operation. +- `type`: Must be set to "outliers". +- `samples`: Either an integer count of samples, or a float fraction of samples. +- `embedding_keys`: A list of keys to use for the embedding distance calculation. 
+ ## Optional Parameters + +| Parameter | Description | Default | +| ------------------------- | -------------------------------------------------------------------------------- | ----------------------------- | +| `keep` | If set to true, return the outliers instead of the non-outliers | false | +| `center` | An explicit center object to be used to calculate the center embedding instead of using the average | The average embedding of all input data | From b2ee5a2a81417d8967b264d0e3efce18e78663d5 Mon Sep 17 00:00:00 2001 From: Egil Date: Fri, 11 Oct 2024 17:44:07 +0200 Subject: [PATCH 5/5] Added more docs --- docs/operators/outliers.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/operators/outliers.md b/docs/operators/outliers.md index 47203149..b137f578 100644 --- a/docs/operators/outliers.md +++ b/docs/operators/outliers.md @@ -26,6 +26,25 @@ embedding models will yield a very "sparse" distribution, where the absolute majority of points lie on the surface of a hypersphere, meaning that this operation will not work very well! +### Using it as a poor man's RAG +```yaml +- name: find-horses + type: outliers + samples: 0.01 + embedding_keys: + - concept + - description + center: + concept: Horse + description: A horse is a large steppe roaming and grazing animal. Humans have utilized horses for transport throughout historical times +``` + +If center is provided, it must have the same keys as those listed +under embedding_keys, and their values will be used to calculate the +"center" embedding, instead of using the average of all embeddings of +the input items. This will effectively turn this into a search +operation for items similar to the center provided. + ## Required Parameters - `name`: A unique name for the operation.