-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #36 from MikeLippincott/rerun_cytotable_merge
Rerun cytotable merge
- Loading branch information
Showing
30 changed files
with
2,688 additions
and
2,544 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
254 changes: 254 additions & 0 deletions
254
Plate_2_data/4.processing_features/0.merge_sc_plate2_PBMC.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,254 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Merge single cells from CellProfiler outputs using CytoTable" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import sys\n", | ||
"import pathlib\n", | ||
"\n", | ||
"from cytotable import convert, presets\n", | ||
"\n", | ||
"sys.path.append(\"../../utils\")\n", | ||
"import sc_extraction_utils as sc_utils\n", | ||
"from parsl.config import Config\n", | ||
"from parsl.executors import HighThroughputExecutor" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Set paths and variables\n", | ||
"\n", | ||
"All paths must be string but we use pathlib to show which variables are paths" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# type of file output from CytoTable (currently only parquet)\n", | ||
"dest_datatype = \"parquet\"\n", | ||
"\n", | ||
"# directory where parquet files are saved to\n", | ||
"output_dir = pathlib.Path(\"./data/converted_data\")\n", | ||
"output_dir.mkdir(exist_ok=True, parents=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# preset configurations based on typical CellProfiler outputs\n", | ||
"preset = \"cellprofiler_sqlite_pycytominer\"\n", | ||
"# remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names\n", | ||
"# add Image_Metadata_Site as this is an important metadata when finding where single cells are located\n", | ||
"presets.config[\"cellprofiler_sqlite_pycytominer\"][\n", | ||
" \"CONFIG_JOINS\"\n", | ||
" # create filtered list of image features to be extracted and used for merging tables\n", | ||
" # with the list of image features, this will merge the objects together using the image number,\n", | ||
" # and parent objects to create all single cells (all objects associated to one cell)\n", | ||
"] = \"\"\"WITH Per_Image_Filtered AS (\n", | ||
" SELECT\n", | ||
" Metadata_ImageNumber,\n", | ||
" Image_Metadata_Plate,\n", | ||
" Image_Metadata_Well,\n", | ||
" Image_Metadata_Site\n", | ||
" FROM\n", | ||
" read_parquet('per_image.parquet')\n", | ||
" )\n", | ||
" SELECT\n", | ||
" *\n", | ||
" FROM\n", | ||
" Per_Image_Filtered AS per_image\n", | ||
" LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON\n", | ||
" per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber\n", | ||
" LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON\n", | ||
" per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber\n", | ||
" AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells\n", | ||
" LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON\n", | ||
" per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber\n", | ||
" AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei\n", | ||
" \"\"\"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "FileNotFoundError", | ||
"evalue": "[Errno 2] No such file or directory: '/projects'", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", | ||
"Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# set directory for sqlite files\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m sqlite_dir \u001b[38;5;241m=\u001b[39m \u001b[43mpathlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/projects/[email protected]/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 4\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# dictionary with info for the sqlite file from each run\u001b[39;00m\n\u001b[1;32m 7\u001b[0m sqlite_info_dictionary \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbatch_1\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# path to outputted SQLite file\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 70\u001b[0m } \n\u001b[1;32m 71\u001b[0m }\n", | ||
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:1181\u001b[0m, in \u001b[0;36mPath.resolve\u001b[0;34m(self, strict)\u001b[0m\n\u001b[1;32m 1179\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_closed:\n\u001b[1;32m 1180\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_closed()\n\u001b[0;32m-> 1181\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flavour\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstrict\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m s \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1183\u001b[0m \u001b[38;5;66;03m# No symlink resolution => for consistency, raise an error if\u001b[39;00m\n\u001b[1;32m 1184\u001b[0m \u001b[38;5;66;03m# the path doesn't exist or is forbidden\u001b[39;00m\n\u001b[1;32m 1185\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstat()\n", | ||
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:363\u001b[0m, in \u001b[0;36m_PosixFlavour.resolve\u001b[0;34m(self, path, strict)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[38;5;66;03m# NOTE: according to POSIX, getcwd() cannot contain path components\u001b[39;00m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;66;03m# which are symlinks.\u001b[39;00m\n\u001b[1;32m 362\u001b[0m base \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m path\u001b[38;5;241m.\u001b[39mis_absolute() \u001b[38;5;28;01melse\u001b[39;00m os\u001b[38;5;241m.\u001b[39mgetcwd()\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_resolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;129;01mor\u001b[39;00m sep\n", | ||
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:347\u001b[0m, in \u001b[0;36m_PosixFlavour.resolve.<locals>._resolve\u001b[0;34m(path, rest)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;66;03m# Resolve the symbolic link\u001b[39;00m\n\u001b[1;32m 346\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 347\u001b[0m target \u001b[38;5;241m=\u001b[39m \u001b[43maccessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnewpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m e\u001b[38;5;241m.\u001b[39merrno \u001b[38;5;241m!=\u001b[39m EINVAL \u001b[38;5;129;01mand\u001b[39;00m strict:\n", | ||
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:452\u001b[0m, in \u001b[0;36m_NormalAccessor.readlink\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mreadlink\u001b[39m(\u001b[38;5;28mself\u001b[39m, path):\n\u001b[0;32m--> 452\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n", | ||
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/projects'" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# set directory for sqlite files\n", | ||
"# the path below is hardcoded to a local directory\n", | ||
"# the data are large and stored on a local secondary drive\n", | ||
"# TODO: Change this path for your local setup\n", | ||
"sqlite_dir = pathlib.Path(\n", | ||
" \"/home/lippincm/Desktop/18T/interstellar_data/PBMC_SQLite_Outputs/\"\n", | ||
").resolve(strict=True)\n", | ||
"\n", | ||
"# dictionary with info for the sqlite file from each run\n", | ||
"sqlite_info_dictionary = {\n", | ||
" \"batch_1\": {\n", | ||
" # path to outputted SQLite file\n", | ||
" \"source_path\": str(\n", | ||
" pathlib.Path(\n", | ||
" f\"{sqlite_dir}/PBMC_batch_1.sqlite\"\n", | ||
" )\n", | ||
" ),\n", | ||
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_1.parquet\")),\n", | ||
" },\n", | ||
" \"batch_2\": {\n", | ||
" # path to outputted SQLite file\n", | ||
" \"source_path\": str(\n", | ||
" pathlib.Path(\n", | ||
" f\"{sqlite_dir}/PBMC_batch_2.sqlite\"\n", | ||
" )\n", | ||
" ),\n", | ||
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_2.parquet\")),\n", | ||
" },\n", | ||
" \"batch_3\": {\n", | ||
" # path to outputted SQLite file\n", | ||
" \"source_path\": str(\n", | ||
" pathlib.Path(\n", | ||
" f\"{sqlite_dir}/PBMC_batch_3.sqlite\"\n", | ||
" )\n", | ||
" ),\n", | ||
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_3.parquet\")),\n", | ||
" },\n", | ||
" \"batch_4\": {\n", | ||
" # path to outputted SQLite file\n", | ||
" \"source_path\": str(\n", | ||
" pathlib.Path(\n", | ||
" f\"{sqlite_dir}/PBMC_batch_4.sqlite\"\n", | ||
" )\n", | ||
" ),\n", | ||
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_4.parquet\")),\n", | ||
" },\n", | ||
" \"batch_5\": {\n", | ||
" # path to outputted SQLite file\n", | ||
" \"source_path\": str(\n", | ||
" pathlib.Path(\n", | ||
" f\"{sqlite_dir}/PBMC_batch_5.sqlite\"\n", | ||
" )\n", | ||
" ),\n", | ||
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_5.parquet\")),\n", | ||
" },\n", | ||
" \"batch_6\": {\n", | ||
" # path to outputted SQLite file\n", | ||
" \"source_path\": str(\n", | ||
" pathlib.Path(\n", | ||
" f\"{sqlite_dir}/PBMC_batch_6.sqlite\"\n", | ||
" )\n", | ||
" ),\n", | ||
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_6.parquet\")),\n", | ||
" },\n", | ||
" \"batch_7\": {\n", | ||
" # path to outputted SQLite file\n", | ||
" \"source_path\": str(\n", | ||
" pathlib.Path(\n", | ||
" f\"{sqlite_dir}/PBMC_batch_7.sqlite\"\n", | ||
" )\n", | ||
" ),\n", | ||
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_7.parquet\")),\n", | ||
" } \n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Convert SQLite file and merge single cells into parquet file\n", | ||
"\n", | ||
"This was not run to completion as we use the nbconverted python file for full run." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# run through each run with each set of paths based on dictionary\n", | ||
"for sqlite_file, info in sqlite_info_dictionary.items():\n", | ||
" source_path = info[\"source_path\"]\n", | ||
" dest_path = info[\"dest_path\"]\n", | ||
" print(f\"Performing merge single cells and conversion on {sqlite_file}!\")\n", | ||
"\n", | ||
" # merge single cells and output as parquet file\n", | ||
" convert(\n", | ||
" source_path=source_path,\n", | ||
" dest_path=dest_path,\n", | ||
" dest_datatype=dest_datatype,\n", | ||
" preset=preset,\n", | ||
" parsl_config=Config(\n", | ||
" executors=[HighThroughputExecutor()],\n", | ||
" ),\n", | ||
" chunk_size=10000,\n", | ||
" )\n", | ||
" print(f\"Merged and converted {pathlib.Path(dest_path).name}!\")\n", | ||
"\n", | ||
" # add single cell count per well as metadata column to parquet file and save back to same path\n", | ||
" sc_utils.add_sc_count_metadata_file(\n", | ||
" data_path=dest_path, well_column_name=\"Image_Metadata_Well\", file_type=\"parquet\"\n", | ||
" )\n", | ||
" print(f\"Added single cell count as metadata to {pathlib.Path(dest_path).name}!\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "interstellar_data", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.15" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.