From 4c4e3ebfc2a3a93498ea08f4704650a455f21b68 Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Thu, 29 Feb 2024 10:03:05 -0500
Subject: [PATCH] aligned umap has an align_id. embedding dimensions null checks

---
 latentscope/models/providers/openai.py       | 16 +++++++++++-----
 latentscope/models/providers/transformers.py |  2 +-
 latentscope/scripts/embed.py                 |  2 +-
 latentscope/scripts/umapper.py               | 16 ++++++++++------
 latentscope/server/jobs.py                   |  4 +++-
 5 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/latentscope/models/providers/openai.py b/latentscope/models/providers/openai.py
index f41235a..91521e5 100644
--- a/latentscope/models/providers/openai.py
+++ b/latentscope/models/providers/openai.py
@@ -25,11 +25,17 @@ def embed(self, inputs, dimensions=None):
         max_tokens = self.params["max_tokens"]
         inputs = [b.replace("\n", " ") for b in inputs]
         inputs = [enc.decode(enc.encode(b)[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]
-        response = self.client.embeddings.create(
-            input=inputs,
-            model=self.name,
-            dimensions=dimensions
-        )
+        if dimensions is not None and dimensions > 0:
+            response = self.client.embeddings.create(
+                input=inputs,
+                model=self.name,
+                dimensions=dimensions
+            )
+        else:
+            response = self.client.embeddings.create(
+                input=inputs,
+                model=self.name
+            )
         embeddings = [embedding.embedding for embedding in response.data]
         return embeddings
 
diff --git a/latentscope/models/providers/transformers.py b/latentscope/models/providers/transformers.py
index 8439900..1ca816b 100644
--- a/latentscope/models/providers/transformers.py
+++ b/latentscope/models/providers/transformers.py
@@ -39,7 +39,7 @@ def embed(self, inputs, dimensions=None):
         embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
 
         # Support Matroyshka embeddings
-        if dimensions is not None:
+        if dimensions is not None and dimensions > 0:
             embeddings = torch.nn.functional.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
             embeddings = embeddings[:, :dimensions]
 
diff --git a/latentscope/scripts/embed.py b/latentscope/scripts/embed.py
index ca5ebae..aedaaa8 100644
--- a/latentscope/scripts/embed.py
+++ b/latentscope/scripts/embed.py
@@ -53,7 +53,7 @@ def main():
     parser.add_argument('text_column', type=str, help='Output file', default='text')
     parser.add_argument('model_id', type=str, help='ID of embedding model to use', default="transformers-BAAI___bge-small-en-v1.5")
     parser.add_argument('--prefix', type=str, help='Prefix to prepend to text before embedding', default="")
-    parser.add_argument('--dimensions', type=int, help='Truncate embeddings to dimensions a la Matroyshka embeddings', default=None)
+    parser.add_argument('--dimensions', type=int, help='Truncate embeddings to dimensions a la Matroyshka embeddings')
    parser.add_argument('--rerun', type=str, help='Rerun the given embedding from last completed batch')
 
     # Parse arguments
diff --git a/latentscope/scripts/umapper.py b/latentscope/scripts/umapper.py
index 3a20441..ca20fce 100644
--- a/latentscope/scripts/umapper.py
+++ b/latentscope/scripts/umapper.py
@@ -68,7 +68,7 @@ def umapper(dataset_id, embedding_id, neighbors=25, min_dist=0.1, save=False, in
     umap_id = f"umap-{next_umap_number:03d}"
     print("RUNNING:", umap_id)
 
-    def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
+    def process_umap_embeddings(umap_id, umap_embeddings, emb_id, align_id):
         min_values = np.min(umap_embeddings, axis=0)
         max_values = np.max(umap_embeddings, axis=0)
 
@@ -96,14 +96,18 @@ def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
 
         # save a json file with the umap parameters
         with open(os.path.join(umap_dir, f'{umap_id}.json'), 'w') as f:
-            json.dump({
+            meta = {
                 "id": umap_id,
                 "embedding_id": emb_id,
                 "neighbors": neighbors,
                 "min_dist": min_dist,
-                "init": init,
-                "align": f"{embedding_id},{align}" if align is not None else None,
-            }, f, indent=2)
+            }
+            if init:
+                meta["init"] = init
+            if align:
+                meta["align"] = f"{embedding_id},{align}"
+                meta["align_id"] = align_id
+            json.dump(meta, f, indent=2)
         f.close()
 
 
@@ -139,7 +143,7 @@ def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
         print("ALIGNED", aligned)
         for i,emb in enumerate(a_embedding_ids):
             print("processing", emb, "umap", next_umap_number+i)
-            process_umap_embeddings(f"umap-{next_umap_number+i:03d}", aligned[i], emb)
+            process_umap_embeddings(f"umap-{next_umap_number+i:03d}", aligned[i], emb, umap_id)
         print("done with aligned umap")
         return
 
diff --git a/latentscope/server/jobs.py b/latentscope/server/jobs.py
index 2c2edc1..b4b6e81 100644
--- a/latentscope/server/jobs.py
+++ b/latentscope/server/jobs.py
@@ -138,7 +138,9 @@ def run_embed():
     dimensions = request.args.get('dimensions')
 
     job_id = str(uuid.uuid4())
-    command = f'ls-embed {dataset} {text_column} {model_id} --prefix="{prefix}" --dimensions={dimensions}'
+    command = f'ls-embed {dataset} {text_column} {model_id} --prefix="{prefix}"'
+    if dimensions is not None:
+        command += f" --dimensions={dimensions}"
     threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
     return jsonify({"job_id": job_id})
 
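
Note (not part of the patch, added for review context): the dimensions-related hunks all apply the same guard, only forwarding a truncation size when it is an actual positive integer and otherwise letting the model return its native embedding width. A minimal sketch of that pattern, assuming a provider object with the embed(inputs, dimensions=None) signature shown above; the helper name and sample values below are hypothetical:

    def embed_batch(provider, texts, dimensions=None):
        # Mirrors the null checks added in openai.py and transformers.py:
        # a missing or non-positive value means "do not truncate".
        if dimensions is not None and dimensions > 0:
            return provider.embed(texts, dimensions=dimensions)
        return provider.embed(texts)

    # embed_batch(provider, ["some text"]) keeps the model's native size;
    # embed_batch(provider, ["some text"], dimensions=256) truncates
    # (Matryoshka-style) to the first 256 dimensions.

In umapper.py, the align_id now written into each aligned umap's JSON is the umap_id computed before the loop, which is also the id given to the first projection in the aligned group (i == 0), so every member of an aligned set points back to the same parent run.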