Skip to content

Commit

Permalink
many styling fixes. cluster label improvements, job ux. update cluste…
Browse files Browse the repository at this point in the history
…r labels manually on explore. mobile warning page
  • Loading branch information
enjalot committed Feb 14, 2024
1 parent a4a8839 commit 1c927fd
Show file tree
Hide file tree
Showing 20 changed files with 351 additions and 137 deletions.
46 changes: 13 additions & 33 deletions latentscope/models/chat_models.json
Original file line number Diff line number Diff line change
@@ -1,25 +1,7 @@
[
{
"provider": "transformers",
"name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"sanitized_name": "TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"id": "transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"params": {
"max_tokens": 2048
}
},
{
"provider": "transformers",
"name": "HuggingFaceH4/zephyr-7b-beta",
"sanitized_name": "HuggingFaceH4___zephyr-7b-beta",
"id": "transformers-HuggingFaceH4___zephyr-7b-beta",
"params": {
"max_tokens": 4096
}
},
{
"provider": "openai",
"name": "gpt-3.5-turbo",
"name": "gpt-3.5-turbo-0125",
"id": "openai-gpt-3.5-turbo",
"params": {
"max_tokens": 4096
Expand Down Expand Up @@ -58,24 +40,22 @@
}
},
{
"provider": "cohereai",
"name": "embed-english-v3.0",
"id": "cohereai-embed-english-v3.0",
"provider": "transformers",
"name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"sanitized_name": "TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"id": "transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"params": {
"input_type": "clustering"
"max_tokens": 2048
}
},
{
"provider": "togetherai",
"name": "togethercomputer/m2-bert-80M-2k-retrieval",
"sanitized_name": "togethercomputer___m2-bert-80M-2k-retrieval",
"id": "togetherai-togethercomputer___m2-bert-80M-2k-retrieval",
"modality": "text",
"params": {
"truncation": true,
"padding": true,
"max_tokens": 2048
}
"provider": "transformers",
"name": "HuggingFaceH4/zephyr-7b-beta",
"sanitized_name": "HuggingFaceH4___zephyr-7b-beta",
"id": "transformers-HuggingFaceH4___zephyr-7b-beta",
"params": {
"max_tokens": 4096
}
}

]
2 changes: 1 addition & 1 deletion latentscope/scripts/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def embed(dataset_id, text_column, model_id, prefix, rerun):

for i, batch in enumerate(tqdm(chunked_iterable(sentences, batch_size), total=total_batches)):
if i < starting_batch:
print(f"skipping batch {i}/{total_batches}")
print(f"skipping batch {i}/{total_batches}", flush=True)
continue
try:
embeddings = np.array(model.embed(batch))
Expand Down
8 changes: 4 additions & 4 deletions latentscope/scripts/label_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def labeler(dataset_id, text_column="text", cluster_id="cluster-001", model_id="
if rerun is not None:
label_id = rerun
clusters = pd.read_parquet(os.path.join(cluster_dir, f"{label_id}.parquet"))
print(clusters.columns)
# print(clusters.columns)
# find the first row where labeled isnt True
unlabeled_row = clusters[~clusters['labeled']].first_valid_index()
print(f"First unlabeled row: {unlabeled_row}")
Expand Down Expand Up @@ -124,7 +124,7 @@ def labeler(dataset_id, text_column="text", cluster_id="cluster-001", model_id="
# print(batch[0])
if(unlabeled_row > 0):
if clusters.loc[i, 'labeled']:
print("skipping", i, "already labeled", clusters.loc[i, 'label'])
print("skipping", i, "already labeled", clusters.loc[i, 'label'], flush=True)
time.sleep(0.01)
continue

Expand All @@ -135,7 +135,7 @@ def labeler(dataset_id, text_column="text", cluster_id="cluster-001", model_id="
]
label = model.chat(messages)
labels.append(label)
print("label:\n", label)
# print("label:\n", label)
# do some cleanup of the labels when the model doesn't follow instructions
clean_label = label.replace("\n", " ")
clean_label = clean_label.replace('"', '')
Expand All @@ -145,7 +145,7 @@ def labeler(dataset_id, text_column="text", cluster_id="cluster-001", model_id="
clean_label = " ".join(clean_label.split(" ")[0:5])
clean_labels.append(clean_label)

print("clean_label:\n", clean_label)
# print("clean_label:\n", clean_label)
clusters.loc[i, 'label'] = clean_label
clusters.loc[i, 'label_raw'] = label
clusters.loc[i, 'labeled'] = True
Expand Down
2 changes: 1 addition & 1 deletion latentscope/server/app.py
Original file line number Diff line number Diff line change
def check_read_only(s):
    """Interpret an environment-variable string as a boolean read-only flag.

    Accepts common truthy spellings, case-insensitively. Treats None — what
    os.getenv returns when LATENT_SCOPE_READ_ONLY is unset — as False,
    instead of crashing on s.lower() with an AttributeError.
    """
    if s is None:
        return False
    return s.lower() in ['true', '1', 't', 'y', 'yes']
# export LATENT_SCOPE_READ_ONLY=1
READ_ONLY = check_read_only(os.getenv("LATENT_SCOPE_READ_ONLY"))
print("READ ONLY?", READ_ONLY, not READ_ONLY)
print("READ ONLY?", READ_ONLY)

# in memory cache of dataframes loaded for each dataset
# used in returning rows for a given index (indexed, get_tags)
Expand Down
26 changes: 26 additions & 0 deletions latentscope/server/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def get_datasets():
jsonData['id'] = dir
datasets.append(jsonData)

datasets.sort(key=lambda x: x.get('length'))
return jsonify(datasets)

"""
Expand Down Expand Up @@ -142,8 +143,33 @@ def get_dataset_cluster_labels(dataset, cluster, model):
file_name = cluster + "-labels-" + model + ".parquet"
file_path = os.path.join(DATA_DIR, dataset, "clusters", file_name)
df = pd.read_parquet(file_path)
df.reset_index(inplace=True)
return df.to_json(orient="records")

@datasets_write_bp.route('/<dataset>/clusters/<cluster>/labels/<model>/label/<index>', methods=['GET'])
def overwrite_dataset_cluster_label(dataset, cluster, model, index):
    """Overwrite the label of one cluster row in the labels parquet file.

    Path params identify the dataset, cluster run, labeling model, and the
    integer row index to update; the new label text comes from the 'label'
    query argument. Responses: 400 when the label is missing or the index is
    out of range, 404 when the labels file does not exist, otherwise a JSON
    success payload.

    NOTE(review): this mutates data on a GET request — POST/PUT would be more
    appropriate; kept as GET to preserve the existing client contract.
    """
    index = int(index)
    new_label = request.args.get('label')
    print("write label", index, new_label)
    if new_label is None:
        return jsonify({"error": "Missing 'label' in request data"}), 400

    file_name = cluster + "-labels-" + model + ".parquet"
    file_path = os.path.join(DATA_DIR, dataset, "clusters", file_name)
    try:
        df = pd.read_parquet(file_path)
    except FileNotFoundError:
        return jsonify({"error": "File not found"}), 404

    # Also reject negative indices: df.at[-1, 'label'] would not index from
    # the end — it would silently create a brand-new row keyed -1.
    if index < 0 or index >= len(df):
        return jsonify({"error": "Index out of range"}), 400

    df.at[index, 'label'] = new_label
    df.to_parquet(file_path)

    return jsonify({"success": True, "message": "Label updated successfully"})


@datasets_bp.route('/<dataset>/clusters/<cluster>/labels_available', methods=['GET'])
def get_dataset_cluster_labels_available(dataset, cluster):
directory_path = os.path.join(DATA_DIR, dataset, "clusters")
Expand Down
16 changes: 16 additions & 0 deletions latentscope/server/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
jobs_write_bp = Blueprint('jobs_write_bp', __name__)
DATA_DIR = os.getenv('LATENT_SCOPE_DATA')

TIMEOUT = 60 * 5 # 5 minute timeout TODO: make this a config option

PROCESSES = {}

def run_job(dataset, job_id, command):
Expand Down Expand Up @@ -42,8 +44,14 @@ def run_job(dataset, job_id, command):
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, shell=True)
PROCESSES[job_id] = process

last_output_time = time.time() # Initialize with the current time

while True:
output = process.stdout.readline()
current_time = time.time() # Update current time on each iteration
print(current_time, current_time - last_output_time, TIMEOUT)
print("output", output)

if output == '' and process.poll() is not None:
break
if output:
Expand All @@ -57,6 +65,14 @@ def run_job(dataset, job_id, command):
job["last_update"] = str(datetime.now())
with open(progress_file, 'w') as f:
json.dump(job, f)
last_output_time = current_time
elif current_time - last_output_time > TIMEOUT:
print(f"Timeout: No output for more than {TIMEOUT} seconds.")
print("OUTPUT", output)
job["progress"].append(output.strip())
job["progress"].append(f"Timeout: No output for more than {TIMEOUT} seconds.")
job["status"] = "error"
break # Break the loop

if process.returncode != 0:
job["status"] = "error"
Expand Down
9 changes: 7 additions & 2 deletions web/src/App.jsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { HashRouter as Router, Routes, Route } from 'react-router-dom';
import Mobile from './pages/Mobile';
import Home from './components/Home';
import Explore from './pages/Explore';
import Setup from './pages/Setup';
Expand All @@ -11,18 +12,22 @@ const env = import.meta.env;
console.log("ENV", env)
const readonly = import.meta.env.MODE == "read_only"

// Coarse user-agent sniff: true when the browser identifies itself as a
// phone or tablet, used to route such visitors to the mobile warning page.
const isMobileDevice = () => {
  const mobilePattern = /Android|webOS|iPhone|iPad|iPod|BlackBerry|IEMobile|Opera Mini/i;
  return mobilePattern.test(navigator.userAgent);
};

function App() {
return (
<Router basename={env.BASE_NAME}>
<Nav />
<div className="page">
<Routes>
<Route path="/" element={<Home />} />
<Route path="/" element={isMobileDevice() ? <Mobile/> : <Home />} />
<Route path="/datasets/:dataset/explore/:scope" element={isMobileDevice() ? <Mobile/> : <Explore />} />
{readonly ? null : <Route path="/datasets/:dataset/setup" element={<Setup/>} />}
{readonly ? null : <Route path="/datasets/:dataset/setup/:scope" element={<Setup/>} />}
{readonly ? null : <Route path="/datasets/:dataset/jobs" element={<Jobs />} />}
{readonly ? null : <Route path="/datasets/:dataset/jobs/:job" element={<Job />} />}
<Route path="/datasets/:dataset/explore/:scope" element={<Explore />} />
</Routes>
</div>
</Router>
Expand Down
1 change: 1 addition & 0 deletions web/src/components/AnnotationPlot.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const AnnotationPlot = ({
let rw = zScale(size)
if(!points.length) return
points.map(point => {
if(!point) return;
if(fill)
ctx.fillRect(xScale(point[0]) - rw/2, yScale(point[1]) - rw/2, rw, rw)
if(stroke)
Expand Down
18 changes: 13 additions & 5 deletions web/src/components/Job/Progress.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ import './Progress.css';
JobProgress.propTypes = {
job: PropTypes.object,
onlyLast: PropTypes.bool,
clearJob: PropTypes.func.isRequired,
clearJob: PropTypes.func,
rerunJob: PropTypes.func,
killJob: PropTypes.func,
};

function JobProgress({job, onlyLast, clearJob, rerunJob}) {
function JobProgress({job, onlyLast, clearJob, rerunJob, killJob}) {
const preRef = useRef(null);

useEffect(() => {
Expand All @@ -19,6 +20,8 @@ function JobProgress({job, onlyLast, clearJob, rerunJob}) {
}
}, [job]);

const secondsSinceLastUpdate = Math.round((+new Date() - +new Date(job?.last_update)) / 1000)

return (
<>
{ job ? <div className='job-progress'>
Expand All @@ -30,13 +33,18 @@ function JobProgress({job, onlyLast, clearJob, rerunJob}) {
job.progress.join("\n")
}
</pre>
{job.status == "completed" ? <button onClick={clearJob}>👍</button> : null }
{clearJob && job.status == "completed" ? <button onClick={clearJob}>👍 Dismiss</button> : null }
{killJob && job.status == "running" ? <button onClick={() => {killJob(job)}}>💀 Kill</button> : null}
{job.status == "error" ?
<div className="error-choices">
<button onClick={clearJob}>🤬</button>
{rerunJob ? <button onClick={() => rerunJob(job)}>Rerun</button> : null }
{clearJob ? <button onClick={clearJob}>🤬 Dismiss</button> : null }
{rerunJob ? <button onClick={() => rerunJob(job)}>🔁 Rerun</button> : null }
</div>
: null }
<span className="timer">
{secondsSinceLastUpdate} seconds since last update

</span>
</div>
: <></> }
</>
Expand Down
6 changes: 3 additions & 3 deletions web/src/components/Job/Run.jsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { useState, useEffect, useCallback } from 'react';
const apiUrl = import.meta.env.VITE_API_URL

function jobPolling(dataset, setJob, jobId, intervalms = 200) {
function jobPolling(dataset, setJob, jobId, intervalms = 500) {
let intervalId = null
console.log("start polling", jobId);
intervalId = setInterval(() => {
Expand Down Expand Up @@ -31,7 +31,7 @@ function jobPolling(dataset, setJob, jobId, intervalms = 200) {
setJob(null)
// TODO: have some kind of error state persist
});
}, 200);
}, intervalms);
console.log("returning jobPolling cleanup")
return () => {
console.log("inside cleanup", intervalId)
Expand All @@ -41,7 +41,7 @@ function jobPolling(dataset, setJob, jobId, intervalms = 200) {
}
}

function useStartJobPolling(dataset, setJob, url, intervalms = 200) {
function useStartJobPolling(dataset, setJob, url, intervalms = 500) {
// const [cleanup, setCleanup] = useState(() => {})
const startJob = useCallback((params) => {
fetch(`${url}?dataset=${dataset.id}&${new URLSearchParams(params)}`)
Expand Down
28 changes: 18 additions & 10 deletions web/src/components/Setup/ClusterLabels.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@ ClusterLabels.propTypes = {
onLabels: PropTypes.func,
onLabelIds: PropTypes.func,
onHoverLabel: PropTypes.func,
onClickLabel: PropTypes.func,
};

// This component is responsible for the embeddings state
// New embeddings update the list
function ClusterLabels({ dataset, cluster, selectedLabelId, onChange, onLabels, onLabelIds, onHoverLabel}) {
function ClusterLabels({ dataset, cluster, selectedLabelId, onChange, onLabels, onLabelIds, onHoverLabel, onClickLabel}) {
const [clusterLabelsJob, setClusterLabelsJob] = useState(null);
const { startJob: startClusterLabelsJob } = useStartJobPolling(dataset, setClusterLabelsJob, `${apiUrl}/jobs/cluster_label`);

Expand All @@ -43,6 +44,7 @@ function ClusterLabels({ dataset, cluster, selectedLabelId, onChange, onLabels,
// the actual labels for the given cluster
const [clusterLabels, setClusterLabels] = useState([]);
useEffect(() => {
console.log("in cluster labels", dataset, cluster, selectedLabelId)
if(dataset && cluster && selectedLabelId) {
fetch(`${apiUrl}/datasets/${dataset.id}/clusters/${cluster.id}/labels/${selectedLabelId}`)
.then(response => response.json())
Expand All @@ -62,11 +64,13 @@ function ClusterLabels({ dataset, cluster, selectedLabelId, onChange, onLabels,
fetch(`${apiUrl}/datasets/${dataset.id}/clusters/${cluster.id}/labels_available`)
.then(response => response.json())
.then(data => {
console.log("cluster changed, set label models fetched", cluster.id, data)
if(clusterLabelsJob && clusterLabelsJob.status == "completed") {
console.log("cluster changed, set label models fetched", cluster.id, data, clusterLabelsJob)
if(clusterLabelsJob) {
let lbl;
if(clusterLabelsJob?.job_name == "cluster_label"){
lbl = data.find(d => d.id == clusterLabelsJob.run_id)
if(clusterLabelsJob?.job_name == "label"){
let label_id = clusterLabelsJob.run_id.split("-")[3]
lbl = data.find(d => d == label_id)
console.log("label_id", label_id, lbl)
} else if(clusterLabelsJob.job_name == "rm") {
lbl = data[0]
}
Expand Down Expand Up @@ -123,24 +127,28 @@ function ClusterLabels({ dataset, cluster, selectedLabelId, onChange, onLabels,
<button type="submit" disabled={!!clusterLabelsJob || !cluster}>Auto Label</button>
</form>

<JobProgress job={clusterLabelsJob} clearJob={()=>setClusterLabelsJob(null)} />
<JobProgress job={clusterLabelsJob} clearJob={()=>setClusterLabelsJob(null)} killJob={setClusterLabelsJob} />

</div>
{cluster ? <div className="dataset--setup-cluster-labels-list">
<label>
View Labels:
<select
Use Labels: &nbsp;
{clusterLabelModels.length > 1 ? <select
name="model"
value={selectedLabelId}
onChange={(e) => onChange(e.target.value)}
>
{clusterLabelModels.map((model, index) => (
<option key={index} value={model}>{model}</option>
))}
</select>
</select> : <span>{clusterLabelModels[0]}</span> }
</label>
<div className="dataset--setup-labels-list">
<DataTable data={clusterLabels.map((d,i) => ({index: i, label: d.label, items: d.indices.length}))} onHover={(index) => onHoverLabel(clusterLabels[index])}/>
<DataTable
data={clusterLabels.map((d,i) => ({index: i, label: d.label, items: d.indices.length}))}
onHover={(index) => onHoverLabel(clusterLabels[index])}
onClick={(index) => onClickLabel(clusterLabels[index])}
/>
</div>
</div> : null}
</div>
Expand Down
Loading

0 comments on commit 1c927fd

Please sign in to comment.