From 3d04b1ede57a3116d14a128bbd6b4f19e66635d3 Mon Sep 17 00:00:00 2001
From: Egil <egil.moller@freecode.no>
Date: Thu, 10 Oct 2024 20:24:16 +0200
Subject: [PATCH 1/5] Outliers

---
 docetl/operations/outliers.py | 69 +++++++++++++++++++++++++++++++++++
 pyproject.toml                |  1 +
 2 files changed, 70 insertions(+)
 create mode 100644 docetl/operations/outliers.py

diff --git a/docetl/operations/outliers.py b/docetl/operations/outliers.py
new file mode 100644
index 00000000..d131ddbd
--- /dev/null
+++ b/docetl/operations/outliers.py
@@ -0,0 +1,69 @@
+from jinja2 import Environment, Template
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+from .base import BaseOperation
+from .utils import RichLoopBar
+from .clustering_utils import get_embeddings_for_clustering
+
+class OutliersOperation(BaseOperation):
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.max_batch_size: int = self.config.get(
+            "max_batch_size", kwargs.get("max_batch_size", float("inf"))
+        )
+
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the OutliersOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing
+        """
+
+        pass
+
+    
+    def execute(
+        self, input_data: List[Dict], is_build: bool = False
+    ) -> Tuple[List[Dict], float]:
+        """
+        Executes the outliers operation on the input data and returns a
+        new list holding only the items that pass the distance cutoff.
+
+        Args:
+            input_data (List[Dict]): A list of dictionaries to process.
+            is_build (bool): Whether the operation is being executed
+              in the build phase. Defaults to False.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the filtered
+              list of dictionaries and the total cost of the operation.
+        """
+        
+        embeddings, cost = get_embeddings_for_clustering(
+            input_data, self.config, self.runner.api
+        )
+
+        embeddings = np.array(embeddings)
+        center = embeddings.mean(axis=0)
+        
+        distances = np.sqrt(((embeddings - center)**2).sum(axis=1))
+
+        if "percentile" in self.config:
+            distance_distribution = np.sort(distances)
+            cutoff = distance_distribution[int(self.config["percentile"] / 100. * (len(distance_distribution)-1))]
+        elif "std" in self.config:
+            cutoff = np.sqrt((embeddings.std(axis=0)**2).sum()) * self.config["std"]
+        
+        include = distances <= cutoff
+            
+        return [
+            item
+            for idx, item in enumerate(input_data)
+            if include[idx]], cost
+        
diff --git a/pyproject.toml b/pyproject.toml
index e0cf9bb2..f0d65afd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -88,6 +88,7 @@ reduce = "docetl.operations.reduce:ReduceOperation"
 resolve = "docetl.operations.resolve:ResolveOperation"
 gather = "docetl.operations.gather:GatherOperation"
 cluster = "docetl.operations.cluster:ClusterOperation"
+outliers = "docetl.operations.outliers:OutliersOperation"
 
 [tool.poetry.plugins."docetl.parser"]
 llama_index_simple_directory_reader = "docetl.parsing_tools:llama_index_simple_directory_reader"

From 2e4b8c37f6fcc253ea288491ca658a7a15be42f2 Mon Sep 17 00:00:00 2001
From: Egil <egil.moller@freecode.no>
Date: Fri, 11 Oct 2024 17:07:05 +0200
Subject: [PATCH 2/5] Changed api to look more like sample. Maybe these two
 should even be merged?

---
 docetl/operations/outliers.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docetl/operations/outliers.py b/docetl/operations/outliers.py
index d131ddbd..93002112 100644
--- a/docetl/operations/outliers.py
+++ b/docetl/operations/outliers.py
@@ -54,9 +54,12 @@ def execute(
         
         distances = np.sqrt(((embeddings - center)**2).sum(axis=1))
 
-        if "percentile" in self.config:
+        if "samples" in self.config:
             distance_distribution = np.sort(distances)
-            cutoff = distance_distribution[int(self.config["percentile"] / 100. * (len(distance_distribution)-1))]
+            samples = self.config["samples"]
+            if isinstance(samples, float):
+                samples = int(samples * (len(distance_distribution)-1))
+            cutoff = distance_distribution[samples]
         elif "std" in self.config:
             cutoff = np.sqrt((embeddings.std(axis=0)**2).sum()) * self.config["std"]
         

From a3e53ab632473b7e18b15cd777c9b9c829d6f4ee Mon Sep 17 00:00:00 2001
From: Egil <egil.moller@freecode.no>
Date: Fri, 11 Oct 2024 17:35:54 +0200
Subject: [PATCH 3/5] More options for outliers

---
 docetl/operations/outliers.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/docetl/operations/outliers.py b/docetl/operations/outliers.py
index 93002112..391c1ab3 100644
--- a/docetl/operations/outliers.py
+++ b/docetl/operations/outliers.py
@@ -48,9 +48,16 @@ def execute(
         embeddings, cost = get_embeddings_for_clustering(
             input_data, self.config, self.runner.api
         )
-
         embeddings = np.array(embeddings)
-        center = embeddings.mean(axis=0)
+
+        if self.config.get("center", None) is not None:
+            center_embeddings, cost2 = get_embeddings_for_clustering(
+                [self.config["center"]], self.config, self.runner.api
+            )
+            cost += cost2
+            center = np.array(center_embeddings[0])
+        else:
+            center = embeddings.mean(axis=0)
         
         distances = np.sqrt(((embeddings - center)**2).sum(axis=1))
 
@@ -62,9 +69,12 @@ def execute(
             cutoff = distance_distribution[samples]
         elif "std" in self.config:
             cutoff = np.sqrt((embeddings.std(axis=0)**2).sum()) * self.config["std"]
-        
-        include = distances <= cutoff
-            
+
+        if not self.config.get("keep", False):
+            include = distances <= cutoff
+        else:
+            include = distances > cutoff
+
         return [
             item
             for idx, item in enumerate(input_data)

From 7a13061b110c39e1262020ef6fcbd1d0060b8c53 Mon Sep 17 00:00:00 2001
From: Egil <egil.moller@freecode.no>
Date: Fri, 11 Oct 2024 17:41:15 +0200
Subject: [PATCH 4/5] Added docs

---
 docs/operators/outliers.md | 41 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 docs/operators/outliers.md

diff --git a/docs/operators/outliers.md b/docs/operators/outliers.md
new file mode 100644
index 00000000..47203149
--- /dev/null
+++ b/docs/operators/outliers.md
@@ -0,0 +1,41 @@
+# Outliers operation
+
+The Outliers operation in DocETL removes outliers from the input (or
+keeps only outliers).
+
+## 🚀 Example: 
+
+```yaml
+- name: remove-worst-10
+  type: outliers
+  samples: 0.9
+  embedding_keys:
+   - concept
+   - description
+```
+
+This will keep the 90 percent closest to the center (average)
+embedding of the keys provided. Alternatively, you could set `samples`
+to an integer count of items to keep (or a negative number to throw
+away). You can also assume a Gaussian distribution and set the key `std`
+to a number of standard deviations out from the center, instead of
+setting `samples`.
+
+Small note about embeddings: If you embed too short values, some
+embedding models will yield a very "sparse" distribution, where the
+absolute majority of points lie on the surface of a hyperssphere,
+meaning that this operation will not work very well!
+
+## Required Parameters
+
+- `name`: A unique name for the operation.
+- `type`: Must be set to "outliers".
+- `samples`: Either an integer count of samples or a float fraction of samples (alternatively, set `std` instead).
+- `embedding_keys`: A list of keys to use for the embedding distance calculation.
+
+## Optional Parameters
+
+| Parameter                 | Description                                                                      | Default                       |
+| ------------------------- | -------------------------------------------------------------------------------- | ----------------------------- |
+| `keep`                    | If set to true, return the outliers instead of the non-outliers | false |
+| `center`                  | An explicit center object to be used to calculate the center embedding instead of using the average | The average embedding of all input data |

From b2ee5a2a81417d8967b264d0e3efce18e78663d5 Mon Sep 17 00:00:00 2001
From: Egil <egil.moller@freecode.no>
Date: Fri, 11 Oct 2024 17:44:07 +0200
Subject: [PATCH 5/5] Added more docs

---
 docs/operators/outliers.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/docs/operators/outliers.md b/docs/operators/outliers.md
index 47203149..b137f578 100644
--- a/docs/operators/outliers.md
+++ b/docs/operators/outliers.md
@@ -26,6 +26,25 @@ embedding models will yield a very "sparse" distribution, where the
 absolute majority of points lie on the surface of a hyperssphere,
 meaning that this operation will not work very well!
 
+### Using it as a poor man's RAG
+```yaml
+- name: remove-worst-10
+  type: outliers
+  samples: 0.01
+  embedding_keys:
+   - concept
+   - description
+  center:
+    concept: Horse
+    description: A horse is a large steppe roaming and grazing animal. Humans have utilized horses for transport throughout historical times
+```
+
+If center is provided, it must have the same keys as those listed
+under embedding_keys, and their values will be used to calculate the
+"center" embedding, instead of using the average of all embeddings of
+the input items. This will effectively turn this into a search
+operation for items similar to the center provided.
+
 ## Required Parameters
 
 - `name`: A unique name for the operation.