Merge pull request #36 from MikeLippincott/rerun_cytotable_merge
Rerun cytotable merge
MikeLippincott authored Apr 23, 2024
2 parents a394acc + b2faf59 commit 11ad9b5
Showing 30 changed files with 2,688 additions and 2,544 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -160,3 +160,6 @@ Plate_2_data/3.cellprofiler_analysis/loaddata_by_celltype/SHSY5Y_cells_need_to_b

# ignore folder with parquet files as they are too large and will be generated from run
Plate_2_data/4.processing_features/data

# parsl logs
Plate_2_data/4.processing_features/runinfo/*
254 changes: 254 additions & 0 deletions Plate_2_data/4.processing_features/0.merge_sc_plate2_PBMC.ipynb
@@ -0,0 +1,254 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge single cells from CellProfiler outputs using CytoTable"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pathlib\n",
"\n",
"from cytotable import convert, presets\n",
"\n",
"sys.path.append(\"../../utils\")\n",
"import sc_extraction_utils as sc_utils\n",
"from parsl.config import Config\n",
"from parsl.executors import HighThroughputExecutor"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set paths and variables\n",
"\n",
"All paths must be strings, but we use pathlib to show which variables are paths"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# type of file output from CytoTable (currently only parquet)\n",
"dest_datatype = \"parquet\"\n",
"\n",
"# directory where parquet files are saved to\n",
"output_dir = pathlib.Path(\"./data/converted_data\")\n",
"output_dir.mkdir(exist_ok=True, parents=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# preset configurations based on typical CellProfiler outputs\n",
"preset = \"cellprofiler_sqlite_pycytominer\"\n",
"# filter the per-image SELECT to the ImageNumber, Plate, Well, and Site metadata columns\n",
"# Image_Metadata_Site is added as it is important metadata for finding where single cells are located\n",
"presets.config[\"cellprofiler_sqlite_pycytominer\"][\n",
" \"CONFIG_JOINS\"\n",
" # create a filtered list of image metadata columns to extract and use when merging tables\n",
" # with these columns, the joins below merge the object tables using the image number\n",
" # and parent objects to create all single cells (all objects associated with one cell)\n",
"] = \"\"\"WITH Per_Image_Filtered AS (\n",
" SELECT\n",
" Metadata_ImageNumber,\n",
" Image_Metadata_Plate,\n",
" Image_Metadata_Well,\n",
" Image_Metadata_Site\n",
" FROM\n",
" read_parquet('per_image.parquet')\n",
" )\n",
" SELECT\n",
" *\n",
" FROM\n",
" Per_Image_Filtered AS per_image\n",
" LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON\n",
" per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber\n",
" LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON\n",
" per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber\n",
" AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells\n",
" LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON\n",
" per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber\n",
" AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei\n",
" \"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/projects'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# set directory for sqlite files\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m sqlite_dir \u001b[38;5;241m=\u001b[39m \u001b[43mpathlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/projects/[email protected]/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 4\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# dictionary with info for the sqlite file from each run\u001b[39;00m\n\u001b[1;32m 7\u001b[0m sqlite_info_dictionary \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbatch_1\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# path to outputted SQLite file\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 70\u001b[0m } \n\u001b[1;32m 71\u001b[0m }\n",
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:1181\u001b[0m, in \u001b[0;36mPath.resolve\u001b[0;34m(self, strict)\u001b[0m\n\u001b[1;32m 1179\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_closed:\n\u001b[1;32m 1180\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_closed()\n\u001b[0;32m-> 1181\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flavour\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstrict\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m s \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1183\u001b[0m \u001b[38;5;66;03m# No symlink resolution => for consistency, raise an error if\u001b[39;00m\n\u001b[1;32m 1184\u001b[0m \u001b[38;5;66;03m# the path doesn't exist or is forbidden\u001b[39;00m\n\u001b[1;32m 1185\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstat()\n",
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:363\u001b[0m, in \u001b[0;36m_PosixFlavour.resolve\u001b[0;34m(self, path, strict)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[38;5;66;03m# NOTE: according to POSIX, getcwd() cannot contain path components\u001b[39;00m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;66;03m# which are symlinks.\u001b[39;00m\n\u001b[1;32m 362\u001b[0m base \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m path\u001b[38;5;241m.\u001b[39mis_absolute() \u001b[38;5;28;01melse\u001b[39;00m os\u001b[38;5;241m.\u001b[39mgetcwd()\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_resolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;129;01mor\u001b[39;00m sep\n",
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:347\u001b[0m, in \u001b[0;36m_PosixFlavour.resolve.<locals>._resolve\u001b[0;34m(path, rest)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;66;03m# Resolve the symbolic link\u001b[39;00m\n\u001b[1;32m 346\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 347\u001b[0m target \u001b[38;5;241m=\u001b[39m \u001b[43maccessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnewpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m e\u001b[38;5;241m.\u001b[39merrno \u001b[38;5;241m!=\u001b[39m EINVAL \u001b[38;5;129;01mand\u001b[39;00m strict:\n",
"File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:452\u001b[0m, in \u001b[0;36m_NormalAccessor.readlink\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mreadlink\u001b[39m(\u001b[38;5;28mself\u001b[39m, path):\n\u001b[0;32m--> 452\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/projects'"
]
}
],
"source": [
"# set directory for sqlite files\n",
"# the path below is hardcoded to a local directory\n",
"# the data are large and stored on a local secondary drive\n",
"# TODO: Change this path for your local setup\n",
"sqlite_dir = pathlib.Path(\n",
" \"/home/lippincm/Desktop/18T/interstellar_data/PBMC_SQLite_Outputs/\"\n",
").resolve(strict=True)\n",
"\n",
"# dictionary with info for the sqlite file from each run\n",
"sqlite_info_dictionary = {\n",
" \"batch_1\": {\n",
" # path to outputted SQLite file\n",
" \"source_path\": str(\n",
" pathlib.Path(\n",
" f\"{sqlite_dir}/PBMC_batch_1.sqlite\"\n",
" )\n",
" ),\n",
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_1.parquet\")),\n",
" },\n",
" \"batch_2\": {\n",
" # path to outputted SQLite file\n",
" \"source_path\": str(\n",
" pathlib.Path(\n",
" f\"{sqlite_dir}/PBMC_batch_2.sqlite\"\n",
" )\n",
" ),\n",
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_2.parquet\")),\n",
" },\n",
" \"batch_3\": {\n",
" # path to outputted SQLite file\n",
" \"source_path\": str(\n",
" pathlib.Path(\n",
" f\"{sqlite_dir}/PBMC_batch_3.sqlite\"\n",
" )\n",
" ),\n",
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_3.parquet\")),\n",
" },\n",
" \"batch_4\": {\n",
" # path to outputted SQLite file\n",
" \"source_path\": str(\n",
" pathlib.Path(\n",
" f\"{sqlite_dir}/PBMC_batch_4.sqlite\"\n",
" )\n",
" ),\n",
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_4.parquet\")),\n",
" },\n",
" \"batch_5\": {\n",
" # path to outputted SQLite file\n",
" \"source_path\": str(\n",
" pathlib.Path(\n",
" f\"{sqlite_dir}/PBMC_batch_5.sqlite\"\n",
" )\n",
" ),\n",
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_5.parquet\")),\n",
" },\n",
" \"batch_6\": {\n",
" # path to outputted SQLite file\n",
" \"source_path\": str(\n",
" pathlib.Path(\n",
" f\"{sqlite_dir}/PBMC_batch_6.sqlite\"\n",
" )\n",
" ),\n",
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_6.parquet\")),\n",
" },\n",
" \"batch_7\": {\n",
" # path to outputted SQLite file\n",
" \"source_path\": str(\n",
" pathlib.Path(\n",
" f\"{sqlite_dir}/PBMC_batch_7.sqlite\"\n",
" )\n",
" ),\n",
" \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_7.parquet\")),\n",
" } \n",
"}"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert SQLite file and merge single cells into parquet file\n",
"\n",
"This notebook was not run to completion; we use the nbconverted Python file for the full run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# loop through each batch with its set of paths from the dictionary\n",
"for sqlite_file, info in sqlite_info_dictionary.items():\n",
" source_path = info[\"source_path\"]\n",
" dest_path = info[\"dest_path\"]\n",
" print(f\"Performing single-cell merge and conversion on {sqlite_file}!\")\n",
"\n",
" # merge single cells and output as parquet file\n",
" convert(\n",
" source_path=source_path,\n",
" dest_path=dest_path,\n",
" dest_datatype=dest_datatype,\n",
" preset=preset,\n",
" parsl_config=Config(\n",
" executors=[HighThroughputExecutor()],\n",
" ),\n",
" chunk_size=10000,\n",
" )\n",
" print(f\"Merged and converted {pathlib.Path(dest_path).name}!\")\n",
"\n",
" # add single cell count per well as metadata column to parquet file and save back to same path\n",
" sc_utils.add_sc_count_metadata_file(\n",
" data_path=dest_path, well_column_name=\"Image_Metadata_Well\", file_type=\"parquet\"\n",
" )\n",
" print(f\"Added single cell count as metadata to {pathlib.Path(dest_path).name}!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "interstellar_data",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}