From 89057ba722570163416ef833724de557ae6b9203 Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Wed, 13 Mar 2024 13:56:56 -0400
Subject: [PATCH] create export page for saving data. enhance scope data structure for export. address #32

---
 latentscope/scripts/scope.py       |  52 +++++++++++++-
 latentscope/server/datasets.py     |  50 ++++++-------
 latentscope/server/jobs.py         |  20 ++++++
 web/src/App.jsx                    |   3 +
 web/src/components/Home.jsx        |   1 +
 web/src/components/Setup/Scope.jsx |  86 +++++++++++++++--------
 web/src/pages/Export.jsx           | 108 +++++++++++++++++++++++++++++
 web/src/pages/Export.module.css    |  57 +++++++++++++++
 web/src/pages/Setup.jsx            |   2 +
 9 files changed, 319 insertions(+), 60 deletions(-)
 create mode 100644 web/src/pages/Export.jsx
 create mode 100644 web/src/pages/Export.module.css

diff --git a/latentscope/scripts/scope.py b/latentscope/scripts/scope.py
index 9105989..2b55aa3 100644
--- a/latentscope/scripts/scope.py
+++ b/latentscope/scripts/scope.py
@@ -2,6 +2,7 @@
 import re
 import json
 import argparse
+import pandas as pd
 
 from latentscope.util import get_data_dir
 
@@ -14,8 +15,12 @@ def main():
     parser.add_argument('cluster_labels_id', type=str, help='Cluster labels id')
     parser.add_argument('label', type=str, help='Label for the scope')
     parser.add_argument('description', type=str, help='Description of the scope')
+    parser.add_argument('--scope_id', type=str, help='Scope id to overwrite existing scope', default=None)
 
-def scope(dataset_id, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description):
+    args = parser.parse_args()
+    scope(**vars(args))
+
+def scope(dataset_id, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description, scope_id=None):
     DATA_DIR = get_data_dir()
     print("DATA DIR", DATA_DIR)
     directory = os.path.join(DATA_DIR, dataset_id, "scopes")
@@ -33,7 +38,11 @@ def get_next_scopes_number(dataset):
     next_scopes_number = get_next_scopes_number(dataset_id)
     # make the umap name from the number, zero padded to 3 digits
-    id = f"scopes-{next_scopes_number:03d}"
+    if not scope_id:
+        id = f"scopes-{next_scopes_number:03d}"
+    else:
+        id = scope_id
+
     print("RUNNING:", id)
 
     scope = {
@@ -45,6 +54,45 @@ def get_next_scopes_number(dataset):
         "label": label,
         "description": description
     }
+
+    # read each json file and add its contents to the scope file
+    embedding_file = os.path.join(DATA_DIR, dataset_id, "embeddings", embedding_id + ".json")
+    with open(embedding_file) as f:
+        embedding = json.load(f)
+    scope["embedding"] = embedding
+
+    umap_file = os.path.join(DATA_DIR, dataset_id, "umaps", umap_id + ".json")
+    with open(umap_file) as f:
+        umap = json.load(f)
+    scope["umap"] = umap
+
+    cluster_file = os.path.join(DATA_DIR, dataset_id, "clusters", cluster_id + ".json")
+    with open(cluster_file) as f:
+        cluster = json.load(f)
+    scope["cluster"] = cluster
+
+    if cluster_labels_id == "default":
+        cluster_labels_id = cluster_id + "-labels-default"
+        scope["cluster_labels"] = {"id": cluster_labels_id, "cluster_id": cluster_id}
+    else:
+        cluster_labels_file = os.path.join(DATA_DIR, dataset_id, "clusters", cluster_labels_id + ".json")
+        with open(cluster_labels_file) as f:
+            cluster_labels = json.load(f)
+        scope["cluster_labels"] = cluster_labels
+
+    # create a scope parquet by combining the parquets from umap and cluster, as well as getting the labels from cluster_labels
+    # then write the parquet to the scopes directory
+    umap_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "umaps", umap_id + ".parquet"))
+    cluster_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "clusters", cluster_id + ".parquet"))
+    cluster_labels_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "clusters", cluster_labels_id + ".parquet"))
+    # create a column where we lookup the label from cluster_labels_df for the index found in the cluster_df
+    cluster_df["label"] = cluster_df["cluster"].apply(lambda x: cluster_labels_df.loc[x]["label"])
+    scope_parquet = pd.concat([umap_df, cluster_df], axis=1)
+    scope_parquet.to_parquet(os.path.join(directory, id + ".parquet"))
+
+    scope["rows"] = len(scope_parquet)
+    scope["columns"] = scope_parquet.columns.tolist()
+    scope["size"] = os.path.getsize(os.path.join(directory, id + ".parquet"))
 
     file_path = os.path.join(directory, id + ".json")
     with open(file_path, 'w') as f:
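Note: the parquet assembly above is the heart of this change: UMAP coordinates, cluster assignments, and human-readable labels end up side by side in one file. A minimal sketch of that join in isolation, with toy dataframes standing in for the three parquets (column names taken from the diff, data invented):

    import pandas as pd

    # Stand-ins for umaps/*.parquet, clusters/*.parquet, and the cluster labels parquet.
    umap_df = pd.DataFrame({"x": [0.1, 0.9], "y": [0.2, 0.8]})
    cluster_df = pd.DataFrame({"cluster": [0, 1]})
    cluster_labels_df = pd.DataFrame({"label": ["cats", "dogs"]})  # row index == cluster id

    # Same lookup as scope.py: map each row's cluster index to its label...
    cluster_df["label"] = cluster_df["cluster"].apply(lambda c: cluster_labels_df.loc[c]["label"])
    # ...then concatenate column-wise into the single scope parquet.
    scope_parquet = pd.concat([umap_df, cluster_df], axis=1)
    print(scope_parquet.columns.tolist())  # ['x', 'y', 'cluster', 'label']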
dataset_id, "clusters", cluster_id + ".parquet")) + cluster_labels_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "clusters", cluster_labels_id + ".parquet")) + # create a column where we lookup the label from cluster_labels_df for the index found in the cluster_df + cluster_df["label"] = cluster_df["cluster"].apply(lambda x: cluster_labels_df.loc[x]["label"]) + scope_parquet = pd.concat([umap_df, cluster_df], axis=1) + scope_parquet.to_parquet(os.path.join(directory, id + ".parquet")) + + scope["rows"] = len(scope_parquet) + scope["columns"] = scope_parquet.columns.tolist() + scope["size"] = os.path.getsize(os.path.join(directory, id + ".parquet")) file_path = os.path.join(directory, id + ".json") with open(file_path, 'w') as f: diff --git a/latentscope/server/datasets.py b/latentscope/server/datasets.py index c9bc980..fa1d1e2 100644 --- a/latentscope/server/datasets.py +++ b/latentscope/server/datasets.py @@ -197,31 +197,25 @@ def get_dataset_scope(dataset, scope): json_contents = json.load(json_file) return jsonify(json_contents) -@datasets_write_bp.route('//scopes/save', methods=['POST']) -def save_dataset_scope(dataset): - if not request.json: - return jsonify({"error": "Invalid data format, JSON expected"}), 400 - id = request.json.get('id') - embedding_id = request.json.get('embedding_id') - umap_id = request.json.get('umap_id') - cluster_id = request.json.get('cluster_id') - cluster_labels_id = request.json.get('cluster_labels_id') - label = request.json.get('label') - description = request.json.get('description') - scope = { - "embedding_id": embedding_id, - "umap_id": umap_id, - "cluster_id": cluster_id, - "cluster_labels_id": cluster_labels_id, - "label": label, - "description": description - } - if not id: - next_scopes_number = get_next_scopes_number(dataset) - # make the umap name from the number, zero padded to 3 digits - id = f"scopes-{next_scopes_number:03d}" - scope["id"] = id - file_path = os.path.join(DATA_DIR, dataset, "scopes", id + ".json") - with open(file_path, 'w') as f: - json.dump(scope, f, indent=2) - return jsonify(scope) +@datasets_bp.route('//export/list', methods=['GET']) +def get_dataset_export_list(dataset): + directory_path = os.path.join(DATA_DIR, dataset) + print("dataset", dataset, directory_path) + # scan the directory for files and directories + # then walk the directories to find all the files + # then return the list of files + file_list = [] + for root, dirs, files in os.walk(directory_path): + if "jobs" in root: + continue + for file in files: + if file == ".DS_Store": + continue + full_path = os.path.join(root, file) + file_name = os.path.basename(full_path) + relative_path = os.path.relpath(full_path, directory_path) + directory = os.path.relpath(root, directory_path) + size = os.path.getsize(full_path) + file_list.append((file_name, directory, relative_path, full_path, size)) + + return jsonify(file_list) \ No newline at end of file diff --git a/latentscope/server/jobs.py b/latentscope/server/jobs.py index e84123b..7fb5af1 100644 --- a/latentscope/server/jobs.py +++ b/latentscope/server/jobs.py @@ -298,3 +298,23 @@ def run_cluster_label(): command = f'ls-label {dataset} "{text_column}" {cluster_id} {chat_id} "{context}"' threading.Thread(target=run_job, args=(dataset, job_id, command)).start() return jsonify({"job_id": job_id}) + +@jobs_write_bp.route('/scope') +def run_scope(): + dataset = request.args.get('dataset') + embedding_id = request.args.get('embedding_id') + umap_id = request.args.get('umap_id') + cluster_id = 
diff --git a/latentscope/server/jobs.py b/latentscope/server/jobs.py
index e84123b..7fb5af1 100644
--- a/latentscope/server/jobs.py
+++ b/latentscope/server/jobs.py
@@ -298,3 +298,23 @@ def run_cluster_label():
     command = f'ls-label {dataset} "{text_column}" {cluster_id} {chat_id} "{context}"'
     threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
     return jsonify({"job_id": job_id})
+
+@jobs_write_bp.route('/scope')
+def run_scope():
+    dataset = request.args.get('dataset')
+    embedding_id = request.args.get('embedding_id')
+    umap_id = request.args.get('umap_id')
+    cluster_id = request.args.get('cluster_id')
+    cluster_labels_id = request.args.get('cluster_labels_id')
+    label = request.args.get('label')
+    description = request.args.get('description')
+    scope_id = request.args.get('scope_id')
+    print("run scope", dataset, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description, scope_id)
+
+    job_id = str(uuid.uuid4())
+    command = f'ls-scope {dataset} {embedding_id} {umap_id} {cluster_id} {cluster_labels_id} "{label}" "{description}"'
+    if scope_id:
+        command += f' --scope_id={scope_id}'
+    threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
+    return jsonify({"job_id": job_id})
+
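Note: the new /jobs/scope endpoint takes its arguments as query parameters and shells out to ls-scope; the client (Scope.jsx below) starts it with useStartJobPolling against ${apiUrl}/jobs/scope. An equivalent sketch in Python, again assuming the hypothetical http://localhost:5001/api base URL and example ids:

    import requests

    params = {
        "dataset": "my-dataset",          # example ids, not real ones
        "embedding_id": "embedding-001",
        "umap_id": "umap-001",
        "cluster_id": "cluster-001",
        "cluster_labels_id": "default",
        "label": "My scope",
        "description": "First full scope",
        # omit scope_id to create a new scope; pass it to overwrite an existing one
    }
    job = requests.get("http://localhost:5001/api/jobs/scope", params=params).json()
    print(job["job_id"])  # the UI polls this job until status == "completed"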
diff --git a/web/src/App.jsx b/web/src/App.jsx
index 0536472..3e1f11f 100644
--- a/web/src/App.jsx
+++ b/web/src/App.jsx
@@ -7,6 +7,7 @@ import Compare from './pages/Compare';
 import Setup from './pages/Setup';
 import Jobs from './pages/Jobs';
 import Job from './pages/Job';
+import Export from './pages/Export';
 import Nav from './components/Nav';
 import './App.css';
 
@@ -28,6 +29,8 @@ function App() {
         <Route path="/datasets/:dataset/compare" element={<Compare />} />
         <Route path="/datasets/:dataset/jobs" element={<Jobs />} />
         <Route path="/datasets/:dataset/jobs/:job" element={<Job />} />
+        <Route path="/datasets/:dataset/export" element={<Export />} />
+        <Route path="/datasets/:dataset/export/:scope" element={<Export />} />
         {readonly ? null : <Route path="/datasets/:dataset/setup" element={<Setup />} />}
         {readonly ? null : <Route path="/datasets/:dataset/setup/:scope" element={<Setup />} />}
diff --git a/web/src/components/Home.jsx b/web/src/components/Home.jsx
index b17aeb8..6aa495d 100644
--- a/web/src/components/Home.jsx
+++ b/web/src/components/Home.jsx
@@ -134,6 +134,7 @@ function Home() {
               {scope.description}
               {readonly ? null : <Link to={`/datasets/${dataset.id}/setup/${scope.id}`}>Configure</Link> }
+              {readonly ? null : <> | <Link to={`/datasets/${dataset.id}/export/${scope.id}`}>Export</Link></> }
             ))}
diff --git a/web/src/components/Setup/Scope.jsx b/web/src/components/Setup/Scope.jsx
index 9f1b471..a6c0eab 100644
--- a/web/src/components/Setup/Scope.jsx
+++ b/web/src/components/Setup/Scope.jsx
@@ -1,6 +1,9 @@
 // NewEmbedding.jsx
 import { useState, useEffect, useCallback} from 'react';
 import { Link, useNavigate } from 'react-router-dom';
+import { useStartJobPolling } from '../Job/Run';
+import JobProgress from '../Job/Progress';
+
 
 const apiUrl = import.meta.env.VITE_API_URL
 
@@ -19,21 +22,37 @@ Scope.propTypes = {
 };
 
 function Scope({ dataset, scope, umap, embedding, cluster, clusterLabelId, onNew, onChange}) {
-  // const[scopes, setScopes] = useState([]);
   const navigate = useNavigate();
 
+  const [scopeJob, setScopeJob] = useState(null);
+  const { startJob: startScopeJob} = useStartJobPolling(dataset, setScopeJob, `${apiUrl}/jobs/scope`);
+
   useEffect(() => {
     if(dataset) {
       console.log("fetching scopes")
-      fetch(`${apiUrl}/datasets/${dataset.id}/scopes`)
+      fetchScopes(dataset.id, onNew)
+    }
+  }, [dataset]);
+
+  function fetchScopes(datasetId, onNew) {
+    fetch(`${apiUrl}/datasets/${datasetId}/scopes`)
       .then(response => response.json())
       .then(data => {
         const sorted = data.sort((a,b) => a.id.localeCompare(b.id))
-        // setScopes(sorted)
         onNew(sorted)
       });
+  }
+
+  useEffect(() => {
+    if(scopeJob?.status == "completed") {
+      fetchScopes(dataset.id, (scopes) => {
+        setScopeJob(null)
+        onNew(scopes)
+        // onChange(scopes.find(d => d.id == scopeJob.run_id))
+        navigate(`/datasets/${dataset.id}/setup/${scopeJob.run_id}`);
+      })
     }
-  }, [dataset]);
+  }, [scopeJob, dataset, navigate, onNew, onChange]);
 
 
   const handleSaveScope = useCallback((event) => {
@@ -53,32 +72,33 @@ function Scope({ dataset, scope, umap, embedding, cluster, clusterLabelId, onNew
     const action = data.get('action')
     console.log("action", action)
     if(action == "save") {
-      payload.id = scope.id
+      payload.scope_id = scope.id
     }
+    startScopeJob(payload)
 
-    fetch(`${apiUrl}/datasets/${dataset.id}/scopes/save`, {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json'
-      },
-      body: JSON.stringify(payload)
-    })
-    .then(response => response.json())
-    .then(data => {
-      const tscope = data
-      fetch(`${apiUrl}/datasets/${dataset.id}/scopes`)
-        .then(response => response.json())
-        .then(data => {
-          // setScopes(data)
-          onNew(data)
-          onChange(data.find(s => s.id == tscope.id))
-        });
-      navigate(`/datasets/${dataset.id}/setup/${data.id}`);
-    })
-    .catch(error => {
-      console.error('Error saving scope:', error);
-    });
-  }, [dataset, scope, cluster, clusterLabelId, umap, embedding , navigate, onChange, onNew]);
+    // fetch(`${apiUrl}/datasets/${dataset.id}/scopes/save`, {
+    //   method: 'POST',
+    //   headers: {
+    //     'Content-Type': 'application/json'
+    //   },
+    //   body: JSON.stringify(payload)
+    // })
+    // .then(response => response.json())
+    // .then(data => {
+    //   const tscope = data
+    //   fetch(`${apiUrl}/datasets/${dataset.id}/scopes`)
+    //     .then(response => response.json())
+    //     .then(data => {
+    //       // setScopes(data)
+    //       onNew(data)
+    //       onChange(data.find(s => s.id == tscope.id))
+    //     });
+    //   navigate(`/datasets/${dataset.id}/setup/${data.id}`);
+    // })
+    // .catch(error => {
+    //   console.error('Error saving scope:', error);
+    // });
+  }, [dataset, scope, cluster, clusterLabelId, umap, embedding]);
 
   const [isDifferent, setIsDifferent] = useState(false);
   useEffect(() => {
@@ -134,15 +154,21 @@ function Scope({ dataset, scope, umap, embedding, cluster, clusterLabelId, onNew
           Labels: { scope.cluster_labels_id }
         : null }
-      {scope && isDifferent ? <button type="submit" name="action" value="save">Save scope</button>
+
+      <JobProgress job={scopeJob} clearJob={() => setScopeJob(null)} />
+
+      {scope && !scopeJob ? <button type="submit" name="action" value="save">Save scope</button>
       : null }
-      { isDifferent ? <button type="submit" name="action" value="new">New scope</button> : null }
+
+      { scope ? <Link to={`/datasets/${dataset?.id}/explore/${scope?.id}`}>Explore {scope.label} ({scope.id})</Link>
       : null }
+      { scope ? <Link to={`/datasets/${dataset?.id}/export/${scope?.id}`}>Export data ({scope.id})</Link>
+      : null }
diff --git a/web/src/pages/Export.jsx b/web/src/pages/Export.jsx
new file mode 100644
index 0000000..fa9002d
--- /dev/null
+++ b/web/src/pages/Export.jsx
@@ -0,0 +1,108 @@
+import { useEffect, useState, useMemo, useCallback } from 'react';
+import { useParams, Link } from 'react-router-dom';
+
+const apiUrl = import.meta.env.VITE_API_URL
+const readonly = import.meta.env.MODE == "read_only"
+
+import styles from './Export.module.css';
+
+function niceBytes(bytes) {
+  const units = ["B", "KB", "MB", "GB"]
+  let i = 0;
+  while (bytes > 1024) {
+    bytes = bytes / 1024;
+    i++;
+  }
+  return `${bytes.toFixed(0)}${units[i]}`;
+}
+
+function Export() {
+  const [dataset, setDataset] = useState(null);
+  const { dataset: datasetId, scope: scopeId } = useParams();
+
+  useEffect(() => {
+    fetch(`${apiUrl}/datasets/${datasetId}/meta`)
+      .then(response => response.json())
+      .then(setDataset)
+      .catch(console.error);
+  }, [datasetId, setDataset]);
+
+  const [scope, setScope] = useState(null);
+  useEffect(() => {
+    fetch(`${apiUrl}/datasets/${datasetId}/scopes/${scopeId}`)
+      .then(response => response.json())
+      .then(data => {
+        console.log("scope", data)
+        setScope(data)
+      })
+      .catch(console.error);
+  }, [datasetId, scopeId, setScope]);
+
+
+  const [datasetFiles, setDatasetFiles] = useState([]);
+  useEffect(() => {
+    fetch(`${apiUrl}/datasets/${datasetId}/export/list`)
+      .then(response => response.json())
+      .then(data => {
+        console.log("export list", data)
+        setDatasetFiles(data)
+      })
+      .catch(console.error);
+  }, [datasetId]);
+
+
+  const fileLink = useCallback((d,i) => {
+    return <li key={i}>
+      {d[0]}
+      <span className={styles["size"]}>{niceBytes(d[4])}</span>
+      <span className={styles["path"]}>{d[3]}</span>
+    </li>
+  }, [datasetId])
+
+
+  return (
+    <div className={styles["page"]}>
+      <div className={styles["header"]}>
+        <h2>Export Data for {dataset?.id} {scopeId}</h2>
+        <Link to={`/datasets/${datasetId}/setup/${scopeId}`}>Setup {dataset?.id} {scopeId}</Link>
+        {scopeId ? <Link to={`/datasets/${datasetId}/explore/${scopeId}`}>Explore {dataset?.id} {scopeId}</Link> : null }
+      </div>
+
+      <div className={styles["scope-files"]}>
+        <h3>Scope {scopeId}</h3>
+        <p className={styles["description"]}>These two files combine the data from each step into a single parquet (x,y from UMAP, cluster and label from clustering and labeling).</p>
+        <ul>
+          {datasetFiles.filter(d => d[0].indexOf(scopeId) == 0).map(fileLink)}
+        </ul>
+      </div>
+
+      <div className={styles["dataset-files"]}>
+        <h3>Dataset</h3>
+        <ul>
+          {datasetFiles.filter(d => d[1] == ".").map(fileLink)}
+        </ul>
+        <h3>Embeddings</h3>
+        <ul>
+          {datasetFiles.filter(d => d[1] == "embeddings").map(fileLink)}
+        </ul>
+        <h3>Umaps</h3>
+        <ul>
+          {datasetFiles.filter(d => d[1] == "umaps").map(fileLink)}
+        </ul>
+        <h3>Clusters</h3>
+        <ul>
+          {datasetFiles.filter(d => d[1] == "clusters").map(fileLink)}
+        </ul>
+        <h3>Scopes</h3>
+        <ul>
+          {datasetFiles.filter(d => d[1] == "scopes").map(fileLink)}
+        </ul>
+        <h3>tags</h3>
+        <ul>
+          {datasetFiles.filter(d => d[1] == "tags").map(fileLink)}
+        </ul>
+      </div>
+
+    </div>
+  );
+}
+
+export default Export;
diff --git a/web/src/pages/Export.module.css b/web/src/pages/Export.module.css
new file mode 100644
index 0000000..00da261
--- /dev/null
+++ b/web/src/pages/Export.module.css
@@ -0,0 +1,57 @@
+.page {
+
+}
+.header {
+  display: flex;
+  flex-direction: column;
+  box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.15);
+  margin: 12px;
+  padding: 6px;
+}
+.header a {
+  padding: 6px;
+  display: inline-block;
+  width: fit-content
+
+}
+
+.scope-files {
+  box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.15);
+  margin: 12px;
+  padding: 6px;
+}
+
+.dataset-files {
+  box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.15);
+  margin: 12px;
+  padding: 6px;
+
+}
+
+.page h3 {
+  font-size: 1.2em;
+  font-weight: bold;
+  margin: 0;
+  padding: 6px 12px;
+}
+.page ul {
+  list-style: none;
+  margin: 0;
+  padding: 0px 24px;
+  padding-bottom: 12px;
+}
+
+.description {
+  padding: 0 12px;
+}
+
+.path {
+  font-family: 'Courier New', monospace;
+  margin-left: 6px;
+  font-size: 0.7em;
+}
+.size {
+  font-family: 'Courier New', monospace;
+  margin-left: 6px;
+  font-size: 0.7em;
+}
\ No newline at end of file
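Note: the niceBytes helper in Export.jsx above divides by 1024 until the value drops below one unit. The same logic in Python for reference, with an extra bounds guard the JS version omits:

    def nice_bytes(n: float) -> str:
        # Mirrors niceBytes in Export.jsx; the i < len(units) - 1 guard (absent in
        # the JS) keeps very large values from running off the end of the list.
        units = ["B", "KB", "MB", "GB"]
        i = 0
        while n > 1024 and i < len(units) - 1:
            n /= 1024
            i += 1
        return f"{n:.0f}{units[i]}"

    assert nice_bytes(512) == "512B"
    assert nice_bytes(2048) == "2KB"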
diff --git a/web/src/pages/Setup.jsx b/web/src/pages/Setup.jsx
index db4e254..b4e902a 100644
--- a/web/src/pages/Setup.jsx
+++ b/web/src/pages/Setup.jsx
@@ -471,6 +471,8 @@ function Setup() {
         { scope ? <Link to={`/datasets/${dataset?.id}/explore/${scope?.id}`}>Explore {scope.label} ({scope.id})</Link>
         : null }
+        { scope ? <Link to={`/datasets/${dataset?.id}/export/${scope?.id}`}>Export data ({scope.id})</Link>
+        : <Link to={`/datasets/${dataset?.id}/export`}>Export data</Link> }
         { umaps && umaps.length > 1 ? <Link to={`/datasets/${dataset?.id}/compare`}>Compare UMAPs</Link>
         : null }
         {dataset.length} rows
         Columns: {dataset.columns.join(", ")}
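Note: taken together, a scope JSON written by the enhanced scope.py now embeds the configuration of every step it was built from. An illustrative shape (keys from the diff above, values invented):

    # Sketch of scopes/scopes-001.json after this patch; values are examples only.
    scope = {
        # (the head of the dict, with the scope id and input ids, is elided in the hunk above)
        "label": "My scope",
        "description": "First full scope",
        "embedding": {...},       # contents of embeddings/<embedding_id>.json
        "umap": {...},            # contents of umaps/<umap_id>.json
        "cluster": {...},         # contents of clusters/<cluster_id>.json
        "cluster_labels": {...},  # labels metadata, or {"id": ..., "cluster_id": ...} for "default"
        "rows": 10000,            # len(scope_parquet)
        "columns": ["x", "y", "cluster", "label"],
        "size": 123456,           # bytes of scopes/scopes-001.parquet
    }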