From 4c4e3ebfc2a3a93498ea08f4704650a455f21b68 Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Thu, 29 Feb 2024 10:03:05 -0500
Subject: [PATCH] aligned umap has an align_id. embedding dimensions null checks

---
 latentscope/models/providers/openai.py       | 16 +++++++++++-----
 latentscope/models/providers/transformers.py |  2 +-
 latentscope/scripts/embed.py                 |  2 +-
 latentscope/scripts/umapper.py               | 16 ++++++++++------
 latentscope/server/jobs.py                   |  4 +++-
 5 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/latentscope/models/providers/openai.py b/latentscope/models/providers/openai.py
index f41235a..91521e5 100644
--- a/latentscope/models/providers/openai.py
+++ b/latentscope/models/providers/openai.py
@@ -25,11 +25,17 @@ def embed(self, inputs, dimensions=None):
         max_tokens = self.params["max_tokens"]
         inputs = [b.replace("\n", " ") for b in inputs]
         inputs = [enc.decode(enc.encode(b)[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]
-        response = self.client.embeddings.create(
-            input=inputs,
-            model=self.name,
-            dimensions=dimensions
-        )
+        if dimensions is not None and dimensions > 0:
+            response = self.client.embeddings.create(
+                input=inputs,
+                model=self.name,
+                dimensions=dimensions
+            )
+        else:
+            response = self.client.embeddings.create(
+                input=inputs,
+                model=self.name
+            )
         embeddings = [embedding.embedding for embedding in response.data]
         return embeddings
 
diff --git a/latentscope/models/providers/transformers.py b/latentscope/models/providers/transformers.py
index 8439900..1ca816b 100644
--- a/latentscope/models/providers/transformers.py
+++ b/latentscope/models/providers/transformers.py
@@ -39,7 +39,7 @@ def embed(self, inputs, dimensions=None):
         embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
 
         # Support Matroyshka embeddings
-        if dimensions is not None:
+        if dimensions is not None and dimensions > 0:
             embeddings = torch.nn.functional.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
             embeddings = embeddings[:, :dimensions]
 
diff --git a/latentscope/scripts/embed.py b/latentscope/scripts/embed.py
index ca5ebae..aedaaa8 100644
--- a/latentscope/scripts/embed.py
+++ b/latentscope/scripts/embed.py
@@ -53,7 +53,7 @@ def main():
     parser.add_argument('text_column', type=str, help='Output file', default='text')
     parser.add_argument('model_id', type=str, help='ID of embedding model to use', default="transformers-BAAI___bge-small-en-v1.5")
     parser.add_argument('--prefix', type=str, help='Prefix to prepend to text before embedding', default="")
-    parser.add_argument('--dimensions', type=int, help='Truncate embeddings to dimensions a la Matroyshka embeddings', default=None)
+    parser.add_argument('--dimensions', type=int, help='Truncate embeddings to dimensions a la Matroyshka embeddings')
    parser.add_argument('--rerun', type=str, help='Rerun the given embedding from last completed batch')
 
     # Parse arguments
diff --git a/latentscope/scripts/umapper.py b/latentscope/scripts/umapper.py
index 3a20441..ca20fce 100644
--- a/latentscope/scripts/umapper.py
+++ b/latentscope/scripts/umapper.py
@@ -68,7 +68,7 @@ def umapper(dataset_id, embedding_id, neighbors=25, min_dist=0.1, save=False, in
     umap_id = f"umap-{next_umap_number:03d}"
     print("RUNNING:", umap_id)
 
-    def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
+    def process_umap_embeddings(umap_id, umap_embeddings, emb_id, align_id):
         min_values = np.min(umap_embeddings, axis=0)
         max_values = np.max(umap_embeddings, axis=0)
 
@@ -96,14 +96,18 @@ def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
 
         # save a json file with the umap parameters
         with open(os.path.join(umap_dir, f'{umap_id}.json'), 'w') as f:
-            json.dump({
+            meta = {
                 "id": umap_id,
                 "embedding_id": emb_id,
                 "neighbors": neighbors,
                 "min_dist": min_dist,
-                "init": init,
-                "align": f"{embedding_id},{align}" if align is not None else None,
-            }, f, indent=2)
+            }
+            if init:
+                meta["init"] = init
+            if align:
+                meta["align"] = f"{embedding_id},{align}"
+                meta["align_id"] = align_id
+            json.dump(meta, f, indent=2)
         f.close()
 
 
@@ -139,7 +143,7 @@ def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
         print("ALIGNED", aligned)
         for i,emb in enumerate(a_embedding_ids):
             print("processing", emb, "umap", next_umap_number+i)
-            process_umap_embeddings(f"umap-{next_umap_number+i:03d}", aligned[i], emb)
+            process_umap_embeddings(f"umap-{next_umap_number+i:03d}", aligned[i], emb, umap_id)
         print("done with aligned umap")
         return
 
diff --git a/latentscope/server/jobs.py b/latentscope/server/jobs.py
index 2c2edc1..b4b6e81 100644
--- a/latentscope/server/jobs.py
+++ b/latentscope/server/jobs.py
@@ -138,7 +138,9 @@ def run_embed():
     dimensions = request.args.get('dimensions')
 
     job_id = str(uuid.uuid4())
-    command = f'ls-embed {dataset} {text_column} {model_id} --prefix="{prefix}" --dimensions={dimensions}'
+    command = f'ls-embed {dataset} {text_column} {model_id} --prefix="{prefix}"'
+    if dimensions is not None:
+        command += f" --dimensions={dimensions}"
     threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
     return jsonify({"job_id": job_id})
 
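
Note (not part of the patch, added for review context): the dimensions-related hunks all apply the same guard, only forwarding a truncation size when it is an actual positive integer and otherwise letting the model return its native embedding width. A minimal sketch of that pattern, assuming a provider object with the embed(inputs, dimensions=None) signature shown above; the helper name and sample values below are hypothetical:

    def embed_batch(provider, texts, dimensions=None):
        # Mirrors the null checks added in openai.py and transformers.py:
        # a missing or non-positive value means "do not truncate".
        if dimensions is not None and dimensions > 0:
            return provider.embed(texts, dimensions=dimensions)
        return provider.embed(texts)

    # embed_batch(provider, ["some text"]) keeps the model's native size;
    # embed_batch(provider, ["some text"], dimensions=256) truncates
    # (Matryoshka-style) to the first 256 dimensions.

In umapper.py, the align_id now written into each aligned umap's JSON is the umap_id computed before the loop, which is also the id given to the first projection in the aligned group (i == 0), so every member of an aligned set points back to the same parent run.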