Merge pull request #36 from MikeLippincott/rerun_cytotable_merge

Rerun cytotable merge
WayScience · Apr 23, 2024 · 11ad9b5 · 11ad9b5
2 parents a394acc + b2faf59
commit 11ad9b5
Show file tree

Hide file tree

Showing 30 changed files with 2,688 additions and 2,544 deletions.
diff --git a/.gitignore b/.gitignore
@@ -160,3 +160,6 @@ Plate_2_data/3.cellprofiler_analysis/loaddata_by_celltype/SHSY5Y_cells_need_to_b
 
 # ignore folder with parquet files as they are too large and will be generated from run
 Plate_2_data/4.processing_features/data
+
+# parsl logs
+Plate_2_data/4.processing_features/runinfo/*
diff --git a/Plate_2_data/4.processing_features/0.merge_sc_plate2_PBMC.ipynb b/Plate_2_data/4.processing_features/0.merge_sc_plate2_PBMC.ipynb
@@ -0,0 +1,254 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Merge single cells from CellProfiler outputs using CytoTable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import pathlib\n",
+    "\n",
+    "from cytotable import convert, presets\n",
+    "\n",
+    "sys.path.append(\"../../utils\")\n",
+    "import sc_extraction_utils as sc_utils\n",
+    "from parsl.config import Config\n",
+    "from parsl.executors import HighThroughputExecutor"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set paths and variables\n",
+    "\n",
+    "All paths must be string but we use pathlib to show which variables are paths"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# type of file output from CytoTable (currently only parquet)\n",
+    "dest_datatype = \"parquet\"\n",
+    "\n",
+    "# directory where parquet files are saved to\n",
+    "output_dir = pathlib.Path(\"./data/converted_data\")\n",
+    "output_dir.mkdir(exist_ok=True, parents=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# preset configurations based on typical CellProfiler outputs\n",
+    "preset = \"cellprofiler_sqlite_pycytominer\"\n",
+    "# remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names\n",
+    "# add Image_Metadata_Site as this is an important metadata when finding where single cells are located\n",
+    "presets.config[\"cellprofiler_sqlite_pycytominer\"][\n",
+    "    \"CONFIG_JOINS\"\n",
+    "    # create filtered list of image features to be extracted and used for merging tables\n",
+    "    # with the list of image features, this will merge the objects together using the image number,\n",
+    "    # and parent objects to create all single cells (all objects associated to one cell)\n",
+    "] = \"\"\"WITH Per_Image_Filtered AS (\n",
+    "                SELECT\n",
+    "                    Metadata_ImageNumber,\n",
+    "                    Image_Metadata_Plate,\n",
+    "                    Image_Metadata_Well,\n",
+    "                    Image_Metadata_Site\n",
+    "                FROM\n",
+    "                    read_parquet('per_image.parquet')\n",
+    "                )\n",
+    "            SELECT\n",
+    "                *\n",
+    "            FROM\n",
+    "                Per_Image_Filtered AS per_image\n",
+    "            LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON\n",
+    "                per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber\n",
+    "            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON\n",
+    "                per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber\n",
+    "                AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells\n",
+    "            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON\n",
+    "                per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber\n",
+    "                AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei\n",
+    "                \"\"\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: '/projects'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# set directory for sqlite files\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m sqlite_dir \u001b[38;5;241m=\u001b[39m \u001b[43mpathlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/projects/[email protected]/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m      4\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;66;03m# dictionary with info for the sqlite file from each run\u001b[39;00m\n\u001b[1;32m      7\u001b[0m sqlite_info_dictionary \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m      8\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbatch_1\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\n\u001b[1;32m      9\u001b[0m         \u001b[38;5;66;03m# path to outputted SQLite file\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     70\u001b[0m     }   \n\u001b[1;32m     71\u001b[0m }\n",
+      "File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:1181\u001b[0m, in \u001b[0;36mPath.resolve\u001b[0;34m(self, strict)\u001b[0m\n\u001b[1;32m   1179\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_closed:\n\u001b[1;32m   1180\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_closed()\n\u001b[0;32m-> 1181\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flavour\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresolve\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstrict\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m s \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1183\u001b[0m     \u001b[38;5;66;03m# No symlink resolution => for consistency, raise an error if\u001b[39;00m\n\u001b[1;32m   1184\u001b[0m     \u001b[38;5;66;03m# the path doesn't exist or is forbidden\u001b[39;00m\n\u001b[1;32m   1185\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstat()\n",
+      "File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:363\u001b[0m, in \u001b[0;36m_PosixFlavour.resolve\u001b[0;34m(self, path, strict)\u001b[0m\n\u001b[1;32m    360\u001b[0m \u001b[38;5;66;03m# NOTE: according to POSIX, getcwd() cannot contain path components\u001b[39;00m\n\u001b[1;32m    361\u001b[0m \u001b[38;5;66;03m# which are symlinks.\u001b[39;00m\n\u001b[1;32m    362\u001b[0m base \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m path\u001b[38;5;241m.\u001b[39mis_absolute() \u001b[38;5;28;01melse\u001b[39;00m os\u001b[38;5;241m.\u001b[39mgetcwd()\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_resolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;129;01mor\u001b[39;00m sep\n",
+      "File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:347\u001b[0m, in \u001b[0;36m_PosixFlavour.resolve.<locals>._resolve\u001b[0;34m(path, rest)\u001b[0m\n\u001b[1;32m    345\u001b[0m \u001b[38;5;66;03m# Resolve the symbolic link\u001b[39;00m\n\u001b[1;32m    346\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 347\u001b[0m     target \u001b[38;5;241m=\u001b[39m \u001b[43maccessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnewpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    348\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    349\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m e\u001b[38;5;241m.\u001b[39merrno \u001b[38;5;241m!=\u001b[39m EINVAL \u001b[38;5;129;01mand\u001b[39;00m strict:\n",
+      "File \u001b[0;32m~/miniconda3/envs/interstellar_data/lib/python3.8/pathlib.py:452\u001b[0m, in \u001b[0;36m_NormalAccessor.readlink\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m    451\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mreadlink\u001b[39m(\u001b[38;5;28mself\u001b[39m, path):\n\u001b[0;32m--> 452\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/projects'"
+     ]
+    }
+   ],
+   "source": [
+    "# set directory for sqlite files\n",
+    "# the path below is hardcoded to a local directory\n",
+    "# the data are large and stored on a local secondary drive\n",
+    "# TODO: Change this path for your local setup\n",
+    "sqlite_dir = pathlib.Path(\n",
+    "    \"/home/lippincm/Desktop/18T/interstellar_data/PBMC_SQLite_Outputs/\"\n",
+    ").resolve(strict=True)\n",
+    "\n",
+    "# dictionary with info for the sqlite file from each run\n",
+    "sqlite_info_dictionary = {\n",
+    "    \"batch_1\": {\n",
+    "        # path to outputted SQLite file\n",
+    "        \"source_path\": str(\n",
+    "            pathlib.Path(\n",
+    "                f\"{sqlite_dir}/PBMC_batch_1.sqlite\"\n",
+    "            )\n",
+    "        ),\n",
+    "        \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_1.parquet\")),\n",
+    "    },\n",
+    "    \"batch_2\": {\n",
+    "        # path to outputted SQLite file\n",
+    "        \"source_path\": str(\n",
+    "            pathlib.Path(\n",
+    "                f\"{sqlite_dir}/PBMC_batch_2.sqlite\"\n",
+    "            )\n",
+    "        ),\n",
+    "        \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_2.parquet\")),\n",
+    "    },\n",
+    "    \"batch_3\": {\n",
+    "        # path to outputted SQLite file\n",
+    "        \"source_path\": str(\n",
+    "            pathlib.Path(\n",
+    "                f\"{sqlite_dir}/PBMC_batch_3.sqlite\"\n",
+    "            )\n",
+    "        ),\n",
+    "        \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_3.parquet\")),\n",
+    "    },\n",
+    "    \"batch_4\": {\n",
+    "        # path to outputted SQLite file\n",
+    "        \"source_path\": str(\n",
+    "            pathlib.Path(\n",
+    "                f\"{sqlite_dir}/PBMC_batch_4.sqlite\"\n",
+    "            )\n",
+    "        ),\n",
+    "        \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_4.parquet\")),\n",
+    "    },\n",
+    "    \"batch_5\": {\n",
+    "        # path to outputted SQLite file\n",
+    "        \"source_path\": str(\n",
+    "            pathlib.Path(\n",
+    "                f\"{sqlite_dir}/PBMC_batch_5.sqlite\"\n",
+    "            )\n",
+    "        ),\n",
+    "        \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_5.parquet\")),\n",
+    "    },\n",
+    "    \"batch_6\": {\n",
+    "        # path to outputted SQLite file\n",
+    "        \"source_path\": str(\n",
+    "            pathlib.Path(\n",
+    "                f\"{sqlite_dir}/PBMC_batch_6.sqlite\"\n",
+    "            )\n",
+    "        ),\n",
+    "        \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_6.parquet\")),\n",
+    "    },\n",
+    "    \"batch_7\": {\n",
+    "        # path to outputted SQLite file\n",
+    "        \"source_path\": str(\n",
+    "            pathlib.Path(\n",
+    "                f\"{sqlite_dir}/PBMC_batch_7.sqlite\"\n",
+    "            )\n",
+    "        ),\n",
+    "        \"dest_path\": str(pathlib.Path(f\"{output_dir}/PBMC_batch_7.parquet\")),\n",
+    "    }   \n",
+    "}"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Convert SQLite file and merge single cells into parquet file\n",
+    "\n",
+    "This was not run to completion as we use the nbconverted python file for full run."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run through each run with each set of paths based on dictionary\n",
+    "for sqlite_file, info in sqlite_info_dictionary.items():\n",
+    "    source_path = info[\"source_path\"]\n",
+    "    dest_path = info[\"dest_path\"]\n",
+    "    print(f\"Performing merge single cells and conversion on {sqlite_file}!\")\n",
+    "\n",
+    "    # merge single cells and output as parquet file\n",
+    "    convert(\n",
+    "        source_path=source_path,\n",
+    "        dest_path=dest_path,\n",
+    "        dest_datatype=dest_datatype,\n",
+    "        preset=preset,\n",
+    "        parsl_config=Config(\n",
+    "            executors=[HighThroughputExecutor()],\n",
+    "        ),\n",
+    "        chunk_size=10000,\n",
+    "    )\n",
+    "    print(f\"Merged and converted {pathlib.Path(dest_path).name}!\")\n",
+    "\n",
+    "    # add single cell count per well as metadata column to parquet file and save back to same path\n",
+    "    sc_utils.add_sc_count_metadata_file(\n",
+    "        data_path=dest_path, well_column_name=\"Image_Metadata_Well\", file_type=\"parquet\"\n",
+    "    )\n",
+    "    print(f\"Added single cell count as metadata to {pathlib.Path(dest_path).name}!\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "interstellar_data",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.15"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}