Skip to content

Commit

Permalink
Aligned UMAP runs now record an align_id; added null checks for embedding dimensions
Browse files Browse the repository at this point in the history
  • Loading branch information
enjalot committed Feb 29, 2024
1 parent 2e611fa commit 4c4e3eb
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 14 deletions.
16 changes: 11 additions & 5 deletions latentscope/models/providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,17 @@ def embed(self, inputs, dimensions=None):
max_tokens = self.params["max_tokens"]
inputs = [b.replace("\n", " ") for b in inputs]
inputs = [enc.decode(enc.encode(b)[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]
response = self.client.embeddings.create(
input=inputs,
model=self.name,
dimensions=dimensions
)
if dimensions is not None and dimensions > 0:
response = self.client.embeddings.create(
input=inputs,
model=self.name,
dimensions=dimensions
)
else:
response = self.client.embeddings.create(
input=inputs,
model=self.name
)
embeddings = [embedding.embedding for embedding in response.data]
return embeddings

Expand Down
2 changes: 1 addition & 1 deletion latentscope/models/providers/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def embed(self, inputs, dimensions=None):
embeddings = mean_pooling(model_output, encoded_input["attention_mask"])

# Support Matryoshka embeddings
if dimensions is not None:
if dimensions is not None and dimensions > 0:
embeddings = torch.nn.functional.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
embeddings = embeddings[:, :dimensions]

Expand Down
2 changes: 1 addition & 1 deletion latentscope/scripts/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def main():
parser.add_argument('text_column', type=str, help='Output file', default='text')
parser.add_argument('model_id', type=str, help='ID of embedding model to use', default="transformers-BAAI___bge-small-en-v1.5")
parser.add_argument('--prefix', type=str, help='Prefix to prepend to text before embedding', default="")
parser.add_argument('--dimensions', type=int, help='Truncate embeddings to dimensions a la Matroyshka embeddings', default=None)
parser.add_argument('--dimensions', type=int, help='Truncate embeddings to dimensions a la Matroyshka embeddings')
parser.add_argument('--rerun', type=str, help='Rerun the given embedding from last completed batch')

# Parse arguments
Expand Down
16 changes: 10 additions & 6 deletions latentscope/scripts/umapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def umapper(dataset_id, embedding_id, neighbors=25, min_dist=0.1, save=False, in
umap_id = f"umap-{next_umap_number:03d}"
print("RUNNING:", umap_id)

def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
def process_umap_embeddings(umap_id, umap_embeddings, emb_id, align_id):
min_values = np.min(umap_embeddings, axis=0)
max_values = np.max(umap_embeddings, axis=0)

Expand Down Expand Up @@ -96,14 +96,18 @@ def process_umap_embeddings(umap_id, umap_embeddings, emb_id):

# save a json file with the umap parameters
with open(os.path.join(umap_dir, f'{umap_id}.json'), 'w') as f:
json.dump({
meta = {
"id": umap_id,
"embedding_id": emb_id,
"neighbors": neighbors,
"min_dist": min_dist,
"init": init,
"align": f"{embedding_id},{align}" if align is not None else None,
}, f, indent=2)
}
if init:
meta["init"] = init,
if align:
meta["align"] = f"{embedding_id},{align}" if align is not None else None,
meta["align_id"] = align_id
json.dump(meta, f, indent=2)
f.close()


Expand Down Expand Up @@ -139,7 +143,7 @@ def process_umap_embeddings(umap_id, umap_embeddings, emb_id):
print("ALIGNED", aligned)
for i,emb in enumerate(a_embedding_ids):
print("processing", emb, "umap", next_umap_number+i)
process_umap_embeddings(f"umap-{next_umap_number+i:03d}", aligned[i], emb)
process_umap_embeddings(f"umap-{next_umap_number+i:03d}", aligned[i], emb, umap_id)

print("done with aligned umap")
return
Expand Down
4 changes: 3 additions & 1 deletion latentscope/server/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,9 @@ def run_embed():
dimensions = request.args.get('dimensions')

job_id = str(uuid.uuid4())
command = f'ls-embed {dataset} {text_column} {model_id} --prefix="{prefix}" --dimensions={dimensions}'
command = f'ls-embed {dataset} {text_column} {model_id} --prefix="{prefix}"'
if dimensions is not None:
command += f" --dimensions={dimensions}"
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

Expand Down

0 comments on commit 4c4e3eb

Please sign in to comment.