Skip to content

Commit

Permalink
fix issue with --init umap. add ui for align umap and truncate embedd…
Browse files Browse the repository at this point in the history
…ings. start adding info tooltips.
  • Loading branch information
enjalot committed Mar 4, 2024
1 parent 039c3bf commit deb1b8f
Show file tree
Hide file tree
Showing 10 changed files with 210 additions and 18 deletions.
10 changes: 7 additions & 3 deletions latentscope/scripts/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,13 @@ def main():
parser.add_argument('umap_id', type=str, help='ID of the UMAP file')
parser.add_argument('samples', type=int, help='Minimum cluster size')
parser.add_argument('min_samples', type=int, help='Minimum samples for HDBSCAN')
parser.add_argument('cluster_selection_epsilon', type=float, help='Cluster selection Epsilon', default=0)

args = parser.parse_args()
clusterer(args.dataset_id, args.umap_id, args.samples, args.min_samples)
clusterer(args.dataset_id, args.umap_id, args.samples, args.min_samples, args.cluster_selection_epsilon)


def clusterer(dataset_id, umap_id, samples, min_samples):
def clusterer(dataset_id, umap_id, samples, min_samples, cluster_selection_epsilon):
DATA_DIR = get_data_dir()
cluster_dir = os.path.join(DATA_DIR, dataset_id, "clusters")
# Check if clusters directory exists, if not, create it
Expand All @@ -59,7 +60,7 @@ def clusterer(dataset_id, umap_id, samples, min_samples):
umap_embeddings_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "umaps", f"{umap_id}.parquet"))
umap_embeddings = umap_embeddings_df.to_numpy()

clusterer = hdbscan.HDBSCAN(min_cluster_size=samples, min_samples=min_samples, metric='euclidean')
clusterer = hdbscan.HDBSCAN(min_cluster_size=samples, min_samples=min_samples, metric='euclidean', cluster_selection_epsilon=cluster_selection_epsilon)
clusterer.fit(umap_embeddings)

# Get the cluster labels
Expand All @@ -72,6 +73,8 @@ def clusterer(dataset_id, umap_id, samples, min_samples):
non_noise_labels = unique_labels[unique_labels != -1]
centroids = [umap_embeddings[cluster_labels == label].mean(axis=0) for label in non_noise_labels]

# TODO: look into soft clustering
# https://hdbscan.readthedocs.io/en/latest/soft_clustering.html
# Assign noise points to the closest cluster centroid
noise_points = umap_embeddings[cluster_labels == -1]
if(non_noise_labels.shape[0] > 0):
Expand Down Expand Up @@ -120,6 +123,7 @@ def clusterer(dataset_id, umap_id, samples, min_samples):
"umap_id": umap_id,
"samples": samples,
"min_samples": min_samples,
"cluster_selection_epsilon": cluster_selection_epsilon,
"n_clusters": len(non_noise_labels),
"n_noise": len(noise_points)
}, f, indent=2)
Expand Down
26 changes: 22 additions & 4 deletions latentscope/server/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,17 @@ def run_embed():
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

@jobs_write_bp.route('/embed_truncate')
def run_embed_truncate():
    """Start a background `ls-embed-truncate` job and return its job id.

    Query parameters read from the request:
        dataset: id of the dataset the job runs against.
        embedding_id: id of the existing embedding to truncate.
        dimensions: target number of dimensions for the truncated copy.

    Returns:
        A JSON response of the form {"job_id": <uuid4 string>}. The command
        itself runs on a separate thread via `run_job`, so this handler does
        not wait for it to finish.
    """
    # Imported here so the block is self-contained regardless of the
    # module's import list.
    import shlex

    dataset = request.args.get('dataset')
    embedding_id = request.args.get('embedding_id')
    dimensions = request.args.get('dimensions')

    # These values come straight from the HTTP request and are spliced into a
    # command line, so quote every one of them. shlex.quote returns plain
    # tokens (letters, digits, "-", "_", ".", ...) unchanged, so ordinary ids
    # yield exactly the same command string as before. str() keeps the old
    # f-string behaviour for a missing parameter (rendered as "None").
    quoted_args = [
        shlex.quote(str(value))
        for value in (dataset, embedding_id, dimensions)
    ]

    job_id = str(uuid.uuid4())
    command = 'ls-embed-truncate ' + ' '.join(quoted_args)
    threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
    return jsonify({"job_id": job_id})

@jobs_write_bp.route('/rerun')
def rerun_job():
dataset = request.args.get('dataset')
Expand Down Expand Up @@ -210,10 +221,16 @@ def run_umap():
neighbors = request.args.get('neighbors')
min_dist = request.args.get('min_dist')
init = request.args.get('init')
print("run umap", dataset, embedding_id, neighbors, min_dist, init)
align = request.args.get('align')
print("run umap", dataset, embedding_id, neighbors, min_dist, init, align)

job_id = str(uuid.uuid4())
command = f'ls-umap {dataset} {embedding_id} {neighbors} {min_dist} --init={init}'
command = f'ls-umap {dataset} {embedding_id} {neighbors} {min_dist}'
if init:
command += f' --init={init}'
if align:
command += f' --align={align}'

threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

Expand Down Expand Up @@ -249,10 +266,11 @@ def run_cluster():
umap_id = request.args.get('umap_id')
samples = request.args.get('samples')
min_samples = request.args.get('min_samples')
print("run cluster", dataset, umap_id, samples, min_samples)
cluster_selection_epsilon = request.args.get('cluster_selection_epsilon')
print("run cluster", dataset, umap_id, samples, min_samples, cluster_selection_epsilon)

job_id = str(uuid.uuid4())
command = f'ls-cluster {dataset} {umap_id} {samples} {min_samples}'
command = f'ls-cluster {dataset} {umap_id} {samples} {min_samples} {cluster_selection_epsilon}'
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

Expand Down
41 changes: 41 additions & 0 deletions web/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions web/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-router-dom": "^6.20.1",
"react-tooltip": "^5.26.3",
"regl-scatterplot": "^1.8.5",
"request": "^2.88.2"
},
Expand Down
30 changes: 25 additions & 5 deletions web/src/components/Setup/Cluster.jsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// NewEmbedding.jsx
import { useState, useEffect, useCallback} from 'react';
import { Tooltip } from 'react-tooltip';
import JobProgress from '../Job/Progress';
import { useStartJobPolling } from '../Job/Run';
const apiUrl = import.meta.env.VITE_API_URL
Expand Down Expand Up @@ -38,7 +39,7 @@ function Cluster({ dataset, cluster, umap, onNew, onChange}) {
}
})
// console.log("clusters", clusters)
callback(array.reverse())
callback(array)
});
}

Expand Down Expand Up @@ -70,20 +71,38 @@ function Cluster({ dataset, cluster, umap, onNew, onChange}) {
const data = new FormData(form)
const samples = data.get('samples')
const min_samples = data.get('min_samples')
startClusterJob({umap_id: umap.id, samples, min_samples})
const cluster_selection_epsilon = data.get('cluster_selection_epsilon')
startClusterJob({umap_id: umap.id, samples, min_samples, cluster_selection_epsilon})
}, [startClusterJob, umap])


return (
<div className="dataset--clusters-new">
<div>Cluster using <a href="https://hdbscan.readthedocs.io/en/latest/api.html">HDBSCAN</a></div>
<form onSubmit={(e) => handleNewCluster(e, umap)}>
<label>
Samples:
<input type="number" name="samples" defaultValue={dataset.length < 1000 ? 5 : 25} disabled={!!clusterJob || !umap}/>
Min Cluster Size:
<input type="number" name="samples" defaultValue={dataset.length < 1000 ? 3 : 25} disabled={!!clusterJob || !umap}/>
<span className="tooltip" data-tooltip-id="samples">🤔</span>
<Tooltip id="samples" place="top" effect="solid">
This parameter determines the minimum number of data points you need to make a cluster. Lower values mean more clusters.
</Tooltip>
</label>
<label>
Min Samples:
<input type="number" name="min_samples" defaultValue="5" disabled={!!clusterJob || !umap} />
<input type="number" name="min_samples" defaultValue={dataset.length < 1000 ? 2 : 5} disabled={!!clusterJob || !umap} />
<span className="tooltip" data-tooltip-id="min_samples">🤔</span>
<Tooltip id="min_samples" place="top" effect="solid">
The number of samples in a neighbourhood for a point to be considered a core point. Lower values mean more clusters.
</Tooltip>
</label>
<label>
Cluster Selection Epsilon:
<input type="number" name="cluster_selection_epsilon" defaultValue={dataset.length < 1000 ? 0.05 : 0.005} step="0.0001" disabled={!!clusterJob || !umap} />
<span className="tooltip" data-tooltip-id="cluster_selection_epsilon">🤔</span>
<Tooltip id="cluster_selection_epsilon" place="top" effect="solid">
This parameter sets a distance threshold that allows you to balance the density of clusters. Set to 0 to use pure HDBSCAN.
</Tooltip>
</label>
<button type="submit" disabled={!!clusterJob || !umap}>New Clusters</button>
</form>
Expand All @@ -105,6 +124,7 @@ function Cluster({ dataset, cluster, umap, onNew, onChange}) {
Noise points: {cl.n_noise}<br/>
Samples: {cl.samples}<br/>
Min Samples: {cl.min_samples}<br/>
{ cl.cluster_selection_epsilon ? <>Cluster Selection Epsilon: {cl.cluster_selection_epsilon} <br/></>: null }
<img src={cl.url} alt={cl.id} /><br/>
<button onClick={() => deleteClusterJob({cluster_id: cl.id}) }>🗑️</button>
</label>
Expand Down
34 changes: 30 additions & 4 deletions web/src/components/Setup/Embedding.jsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// NewEmbedding.jsx
import { useState, useEffect, useCallback } from 'react';
import { Tooltip } from 'react-tooltip'
import JobProgress from '../Job/Progress';
import { useStartJobPolling } from '../Job/Run';
const apiUrl = import.meta.env.VITE_API_URL
Expand Down Expand Up @@ -27,6 +28,7 @@ function EmbeddingNew({ dataset, textColumn, embedding, umaps, clusters, onNew,
const { startJob: startEmbeddingsJob } = useStartJobPolling(dataset, setEmbeddingsJob, `${apiUrl}/jobs/embed`);
const { startJob: deleteEmbeddingsJob } = useStartJobPolling(dataset, setEmbeddingsJob, `${apiUrl}/jobs/delete/embedding`);
const { startJob: rerunEmbeddingsJob } = useStartJobPolling(dataset, setEmbeddingsJob, `${apiUrl}/jobs/rerun`);
const { startJob: startEmbeddingsTruncateJob } = useStartJobPolling(dataset, setEmbeddingsJob, `${apiUrl}/jobs/embed_truncate`);

const [models, setModels] = useState([]);
useEffect(() => {
Expand Down Expand Up @@ -100,6 +102,12 @@ function EmbeddingNew({ dataset, textColumn, embedding, umaps, clusters, onNew,
setDimensions(+e.target.value)
}

// Start a truncate job for one embedding. The target dimension is read from
// that embedding's <select>, whose id is "truncate-" followed by the embedding id.
const handleTruncate = useCallback((embeddingId) => {
  const dimensionSelect = document.getElementById(`truncate-${embeddingId}`);
  const dimensions = dimensionSelect.value;
  console.log("truncating", embeddingId, dimensions)
  startEmbeddingsTruncateJob({ embedding_id: embeddingId, dimensions })
}, [startEmbeddingsTruncateJob])

return (
<div>
<div className={styles["embeddings-form"]}>
Expand All @@ -124,7 +132,12 @@ function EmbeddingNew({ dataset, textColumn, embedding, umaps, clusters, onNew,
setEmbeddingsJob(null)
}} rerunJob={handleRerunEmbedding} />
<div className={styles["embeddings-list"]}>
{embeddings.map((emb, index) => (
{embeddings.map((emb, index) => {
let umps = umaps.filter(d => d.embedding_id == emb.id)
let cls = clusters.filter(d => umps.map(d => d.id).indexOf(d.umap_id) >= 0)
let m = models.find(d => d.id == emb.model_id)
let dims = m.params.dimensions ? m.params.dimensions.filter(d => +d < +emb.dimensions) : []
return (
<div className="item" key={index}>
<input type="radio" id={`embedding${index}`} name="embedding" value={emb.id} checked={emb.id === embedding?.id} onChange={() => onChange(emb)} />
<label htmlFor={`embedding${index}`}>
Expand All @@ -133,14 +146,27 @@ function EmbeddingNew({ dataset, textColumn, embedding, umaps, clusters, onNew,
<span>Dimensions: {emb.dimensions}</span><br/>
{ emb.prefix ? <span>Prefix: {emb.prefix}<br/></span> : null }
<span>[
{umaps.filter(d => d.embedding_id == emb.id).length} umaps,&nbsp;
{clusters.filter(d => umaps.filter(d => d.embedding_id == emb.id).map(d => d.id).indexOf(d.umap_id) >= 0).length} clusters
{umps.length} umaps,&nbsp;
{cls.length} clusters
]</span>
{dims.length && <div className={styles["truncate"]}>
<select id={"truncate-"+emb.id}>
{dims.map((d,i) => {
return (<option key={"dimension-"+i} value={d}>{d}</option>)
})}
</select>
<span className="button" onClick={() => handleTruncate(emb.id)}>Truncate copy</span>
<span className="tooltip" data-tooltip-id="truncate">🤔</span>
<Tooltip id="truncate" place="top" effect="solid">
This model supports Matryoshka embeddings. You can make a truncated copy of this embedding with fewer dimensions.
</Tooltip>
</div>}
<button onClick={() => deleteEmbeddingsJob({embedding_id: emb.id}) } disabled={embeddingsJob && embeddingsJob.status !== "completed"}>🗑️</button>
</span>
</label>
</div>
))}
)}
)}
</div>
</div>
);
Expand Down
7 changes: 7 additions & 0 deletions web/src/components/Setup/Embedding.module.css
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,11 @@

.embeddings-list input, .embeddings-list label {
cursor: pointer;
}

/* Row of truncate controls shown under an embedding: the dimension <select>,
   the "Truncate copy" button and the tooltip trigger, laid out side by side. */
.truncate {
display: flex;
flex-direction: row;
gap: 4px;
margin: 6px 0;
}
27 changes: 26 additions & 1 deletion web/src/components/Setup/Umap.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,19 @@ function Umap({ dataset, umap, embedding, embeddings, clusters, onNew, onChange}
const data = new FormData(form)
const neighbors = data.get('neighbors')
const min_dist = data.get('min_dist')
startUmapJob({embedding_id: embedding?.id, neighbors, min_dist, init})
const align = Array.from(document.querySelectorAll('input[name="umapAlign"]:checked'))
.map(input => input.value)
.sort((a,b) => a.localeCompare(b))
.join(",")
startUmapJob({embedding_id: embedding?.id, neighbors, min_dist, init, align})
}, [startUmapJob, embedding, init])

const [showAlign, setShowAlign] = useState(false);

// Show or hide the "Align UMAPs" panel. The updater form always receives the
// latest state, so consecutive toggles cannot act on a stale value, and the
// callback needs no dependencies and keeps a stable identity.
const toggleShowAlign = useCallback(() => {
  setShowAlign(isShown => !isShown);
}, []);

return (
<div className="dataset--umaps-new">
<form onSubmit={handleNewUmap}>
Expand All @@ -102,6 +112,21 @@ function Umap({ dataset, umap, embedding, embeddings, clusters, onNew, onChange}
)})}
</select>
</label>
<span className="button" onClick={toggleShowAlign}>{showAlign ? 'x Align UMAPs' : '+ Align UMAPs'}</span>
{showAlign && <div className={styles["umaps-align"]}>
<span className={styles["umaps-align-info"]}>
Choose 1 or more embeddings to align alongside {embedding?.id}.
An <a href="https://umap-learn.readthedocs.io/en/latest/aligned_umap_basic_usage.html">Aligned UMAP</a> will be generated for each embedding selected.
</span>
{embeddings.map((emb, index) => {
if(emb.id == embedding?.id) return null
return (<label key={index}>
<input type="checkbox" id={`umap-align-${emb.id}`} name="umapAlign" value={emb.id} />
{emb.id} - {emb.model_id} [{emb.dimensions}]
</label>
)}
)}
</div>}
<button type="submit" disabled={!!umapJob}>New UMAP</button>
</form>
<JobProgress job={umapJob} clearJob={()=> setUmapJob(null)}/>
Expand Down
20 changes: 19 additions & 1 deletion web/src/components/Setup/Umap.module.css
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,22 @@
width: 120px;
height: 120px;
user-select: none;
}
}

/* Container for the "Align UMAPs" panel: stacks the info text and the
   per-embedding checkbox labels vertically. */
.umaps-align {
display: flex;
flex-direction: column;
gap: 6px;
}
/* Explanatory text at the top of the align panel. */
.umaps-align-info {
font-style: italic;
margin: 6px;
}
/* One selectable embedding in the align panel, drawn as a rounded grey chip.
   The border uses the same colour as the background, so it is not visible
   as a separate outline. */
.umaps-align label {
cursor: pointer;
padding: 4px;
border: 1px solid #f1f1f1;
background-color: #f1f1f1;
border-radius: 5px;
}

Loading

0 comments on commit deb1b8f

Please sign in to comment.