diff --git a/Documentation/detailed_overview.md b/Documentation/detailed_overview.md index 00ad80d..5aa3f2f 100644 --- a/Documentation/detailed_overview.md +++ b/Documentation/detailed_overview.md @@ -6,7 +6,7 @@ ## Library Structure -![Layout of snm3C library sequences.](../Documentation/library_structure.png) +![Layout of snmCT library sequences.](../Documentation/library_structure.png) Each read contains either bisulfite-converted DNA sequence *or* fully methylated transcriptomic cDNA (these are separated _in silico_). For expected nucleotide sequences, please see our library's [seqspec](https://igvf.github.io/seqspec/specs/snmCTseq/spec.html). diff --git a/Documentation/revision_history.md b/Documentation/revision_history.md index 2f7cbda..127412c 100644 --- a/Documentation/revision_history.md +++ b/Documentation/revision_history.md @@ -1,5 +1,13 @@ # Public Release, v1 + +# version 1.2 + +## v1.2.1 + +Minor fix: commenting out parts of A00d, A01b, A01c that should only be run if executing interactively (code chunk starting with "if running interactively [...] os.environ [...]") + + ## v1.2.0 **Major changes:** @@ -28,6 +36,7 @@ - metadata aggregation functions were not all robust to missing inputs. now checks if the file exists and skips files as needed. - summarizes numbers of missing values and missing filepaths to help investigate whether particular batches were incomplete. +# version 1.1 ## v1.1.3 @@ -65,6 +74,8 @@ Minor fixes: - minor formatting updates, added more comments in Notebooks & D - unfortunately, notebook metadata cleaning not working (e.g., cellid, ipykernel names updated -- ignore these in the commit) +# version 1.0 + ## v1.1.0 - Added this documentation, public Github in prep for production-scale consortium work. - Parameters now kept in one environmental variable file (`snmCT_parameters.env`) in lieu of specifying parameters like `$projdir` that were repeated at the start of almost every script. This requires an `xargs --> export` command & `import os` to read environmental variables for certain python scripts. @@ -77,6 +88,7 @@ Minor fixes: - Some code now clunkier (reliance on `${basharrays[@]}`), but helps with failed job checking and decreases the degree of user input required at each stage. # Prototypes, v0 + * **v0.5:** Changes to .bam inputs for STAR mCH/CH (although not the underlying filtering script). - Some issues with folks using different versions of `subread`/`featureCounts` (tested on v2.0.1). Later versions more stringent, e.g., throw errors instead of warnings if a mixture of paired-end and single-end reads are input for quantification. - Relevant because "mapping singletons" (e.g., R2 aligns but R1 does not) were previously included in the PE-alignments and fed into featureCounts along with proper pairs; unclear based on developer comments if these were correctly quantified. To address this, we explicitly now subset to only proper pairs before filtering and `featureCounts`. diff --git a/Notebooks/A00_environment_and_genome_setup.ipynb b/Notebooks/A00_environment_and_genome_setup.ipynb index e56e6f2..ff80bcd 100755 --- a/Notebooks/A00_environment_and_genome_setup.ipynb +++ b/Notebooks/A00_environment_and_genome_setup.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "a2c0bb48", "metadata": {}, "outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c092f422", "metadata": {}, "outputs": [], @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "23fe91be", "metadata": {}, "outputs": [], @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "0778a67b", "metadata": {}, "outputs": [], @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "d57f4a64", "metadata": {}, "outputs": [], @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "4b0ee576", "metadata": {}, "outputs": [], @@ -307,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "a082de88", "metadata": {}, "outputs": [], @@ -376,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "e6b9de87", "metadata": {}, "outputs": [], @@ -460,7 +460,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "699c6112", "metadata": {}, "outputs": [], @@ -536,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "e65bab1d", "metadata": {}, "outputs": [], @@ -563,19 +563,19 @@ "\n", "\n", "\n", - "# if running interactively, need to load some lines from snmCT_parameters.env\n", - "# os.environ['ref_dir'] = \"/u/project/cluo/chliu/Genomes/human_gencode_v40\" or the below loop\n", - "# (use absolute versus relative path of parameters.env file if below not working!)\n", - "envvar_needed = ['dir_proj', 'ref_dir', 'ref_gtf', 'ref_chromsizes']\n", - "try:\n", - " os.environ['ref_dir']\n", - "except KeyError:\n", - " envspec = pd.read_csv(\"../snmCT_parameters.env\", sep = \"=\", comment=\"#\", header = None\n", - " ).set_axis(['varname', 'varpath'], axis = 1\n", - " ).query('varname in @envvar_needed')\n", - " for index, row in envspec.iterrows():\n", - " os.environ[row[\"varname\"]] = row[\"varpath\"]\n", - "os.chdir(os.environ['dir_proj'])\n", + "# # if running interactively, need to load some lines from snmCT_parameters.env\n", + "# # os.environ['ref_dir'] = \"/u/project/cluo/chliu/Genomes/human_gencode_v40\" or the below loop\n", + "# # (use absolute versus relative path of parameters.env file if below not working!)\n", + "# envvar_needed = ['dir_proj', 'ref_dir', 'ref_gtf', 'ref_chromsizes']\n", + "# try:\n", + "# os.environ['ref_dir']\n", + "# except KeyError:\n", + "# envspec = pd.read_csv(\"../snmCT_parameters.env\", sep = \"=\", comment=\"#\", header = None\n", + "# ).set_axis(['varname', 'varpath'], axis = 1\n", + "# ).query('varname in @envvar_needed')\n", + "# for index, row in envspec.iterrows():\n", + "# os.environ[row[\"varname\"]] = row[\"varpath\"]\n", + "# os.chdir(os.environ['dir_proj'])\n", "\n", "\n", "\n", @@ -679,8 +679,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/Notebooks/A01_mergefastq_preptargets.ipynb b/Notebooks/A01_mergefastq_preptargets.ipynb index 9d86bdd..3f46132 100755 --- a/Notebooks/A01_mergefastq_preptargets.ipynb +++ b/Notebooks/A01_mergefastq_preptargets.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -28,71 +28,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "number of .fastq.gz files\n", - "256\n", - "\n", - "\n", - "\n", - "example .fastq.gz names\n", - "20231204-C29D14-Pos1-B01_S13_L001_R1_001.fastq.gz\n", - "20231204-C29D14-Pos1-B01_S13_L001_R2_001.fastq.gz\n", - "20231204-C29D14-Pos1-B01_S13_L002_R1_001.fastq.gz\n", - "20231204-C29D14-Pos1-B01_S13_L002_R2_001.fastq.gz\n", - "20231204-C29D14-Pos1-B01_S13_L003_R1_001.fastq.gz\n", - "20231204-C29D14-Pos1-B01_S13_L003_R2_001.fastq.gz\n", - "20231204-C29D14-Pos1-B01_S13_L004_R1_001.fastq.gz\n", - "20231204-C29D14-Pos1-B01_S13_L004_R2_001.fastq.gz\n", - "20231204-C29D16-Pos1-C03_S27_L001_R1_001.fastq.gz\n", - "20231204-C29D16-Pos1-C03_S27_L001_R2_001.fastq.gz\n", - "\n", - "\n", - "\n", - "Nlanes\tplatename\n", - " 4 20231204-C29D14-Pos1-B01_S13\n", - " 4 20231204-C29D16-Pos1-C03_S27\n", - " 4 20231204-C29D18-Pos1-B11_S23\n", - " 4 20231204-C29D1-Pos1-A07_S7\n", - " 4 20231204-C29D3-Pos1-B09_S21\n", - " 4 20231204-C29D7-Pos1-C05_S29\n", - " 4 20231204-C29D9-Pos1-A09_S9\n", - " 4 20231204-C37D0-Pos1-C01_S25\n", - " 4 20231204-C37D1-Pos1-B10_S22\n", - " 4 20231204-C37D3-Pos1-C07_S31\n", - " 4 20231204-C37D5-Pos1-A08_S8\n", - " 4 20231204-C37D9-Pos1-B06_S18\n", - " 4 20231204-C38D0-Pos1-C06_S30\n", - " 4 20231204-C38D12-Pos1-B04_S16\n", - " 4 20231204-C38D1-Pos1-B02_S14\n", - " 4 20231204-C38D3-Pos1-B07_S19\n", - " 4 20231204-C38D5-Pos1-A11_S11\n", - " 4 20231204-C38D7-Pos1-C02_S26\n", - " 4 20231204-C38D9-Pos1-A02_S2\n", - " 4 20231204-C38D9-Pos2-A05_S5\n", - " 4 20231204-C39D0-Pos1-A10_S10\n", - " 4 20231204-C39D12-Pos1-C08_S32\n", - " 4 20231204-C39D14-Pos1-B05_S17\n", - " 4 20231204-C39D16-Pos1-A01_S1\n", - " 4 20231204-C39D16-Pos2-A04_S4\n", - " 4 20231204-C39D18-Pos1-B03_S15\n", - " 4 20231204-C39D1-Pos1-C04_S28\n", - " 4 20231204-C39D3-Pos1-B08_S20\n", - " 4 20231204-C39D5-Pos1-A03_S3\n", - " 4 20231204-C39D5-Pos2-A06_S6\n", - " 4 20231204-C39D7-Pos1-B12_S24\n", - " 4 20231204-C39D9-Pos1-A12_S12\n", - "\n", - "Nplates:\n", - "32\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "# before proceeding, also check naming convention of\n", @@ -137,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -239,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -297,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -324,19 +262,19 @@ "import pandas as pd\n", "import os\n", "\n", - "# if running interactively, need to load some lines from snmCT_parameters.env\n", - "# or manually spec os.environ -- e.g., via os.environ['dir_proj'] = \"mydirectory\" or this below loop\n", - "# (check relative path of parameters.env file or change to absolute if below not working!)\n", - "envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate']\n", - "try:\n", - " os.environ['dir_proj']\n", - "except KeyError:\n", - " envspec = pd.read_csv(\"../snmCT_parameters.env\", sep = \"=\", comment=\"#\", header = None\n", - " ).set_axis(['varname', 'varpath'], axis = 1\n", - " ).query('varname in @envvar_needed')\n", - " for index, row in envspec.iterrows():\n", - " os.environ[row[\"varname\"]] = row[\"varpath\"]\n", - "os.chdir(os.environ['dir_proj'])\n", + "# # if running interactively, need to load some lines from snmCT_parameters.env\n", + "# # or manually spec os.environ -- e.g., via os.environ['dir_proj'] = \"mydirectory\" or this below loop\n", + "# # (check relative path of parameters.env file or change to absolute if below not working!)\n", + "# envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate']\n", + "# try:\n", + "# os.environ['dir_proj']\n", + "# except KeyError:\n", + "# envspec = pd.read_csv(\"../snmCT_parameters.env\", sep = \"=\", comment=\"#\", header = None\n", + "# ).set_axis(['varname', 'varpath'], axis = 1\n", + "# ).query('varname in @envvar_needed')\n", + "# for index, row in envspec.iterrows():\n", + "# os.environ[row[\"varname\"]] = row[\"varpath\"]\n", + "# os.chdir(os.environ['dir_proj'])\n", "\n", "\n", "\n", @@ -389,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -411,19 +349,19 @@ "import numpy as np\n", "import os\n", "\n", - "# if running interactively, need to load some lines from snmCT_parameters.env\n", - "# or manually spec os.environ -- e.g., via the below loop\n", - "# (use absolute versus relative path of parameters.env file if below not working!)\n", - "envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate', 'metadat_well']\n", - "try:\n", - " os.environ['metadat_well']\n", - "except KeyError:\n", - " envspec = pd.read_csv(\"snmCT_parameters.env\", sep = \"=\", comment=\"#\", header = None\n", - " ).set_axis(['varname', 'varpath'], axis = 1\n", - " ).query('varname in @envvar_needed')\n", - " for index, row in envspec.iterrows():\n", - " os.environ[row[\"varname\"]] = row[\"varpath\"]\n", - "os.chdir(os.environ['dir_proj'])\n", + "# # if running interactively, need to load some lines from snmCT_parameters.env\n", + "# # or manually spec os.environ -- e.g., via the below loop\n", + "# # (use absolute versus relative path of parameters.env file if below not working!)\n", + "# envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate', 'metadat_well']\n", + "# try:\n", + "# os.environ['metadat_well']\n", + "# except KeyError:\n", + "# envspec = pd.read_csv(\"snmCT_parameters.env\", sep = \"=\", comment=\"#\", header = None\n", + "# ).set_axis(['varname', 'varpath'], axis = 1\n", + "# ).query('varname in @envvar_needed')\n", + "# for index, row in envspec.iterrows():\n", + "# os.environ[row[\"varname\"]] = row[\"varpath\"]\n", + "# os.chdir(os.environ['dir_proj'])\n", "\n", "\n", "\n", @@ -626,8 +564,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/Scripts/A00d_gtf_annotations_bed.py b/Scripts/A00d_gtf_annotations_bed.py index 9933315..e50ed1e 100644 --- a/Scripts/A00d_gtf_annotations_bed.py +++ b/Scripts/A00d_gtf_annotations_bed.py @@ -18,19 +18,19 @@ -# if running interactively, need to load some lines from snmCT_parameters.env -# os.environ['ref_dir'] = "/u/project/cluo/chliu/Genomes/human_gencode_v40" or the below loop -# (use absolute versus relative path of parameters.env file if below not working!) -envvar_needed = ['dir_proj', 'ref_dir', 'ref_gtf', 'ref_chromsizes'] -try: - os.environ['ref_dir'] -except KeyError: - envspec = pd.read_csv("../snmCT_parameters.env", sep = "=", comment="#", header = None - ).set_axis(['varname', 'varpath'], axis = 1 - ).query('varname in @envvar_needed') - for index, row in envspec.iterrows(): - os.environ[row["varname"]] = row["varpath"] -os.chdir(os.environ['dir_proj']) +# # if running interactively, need to load some lines from snmCT_parameters.env +# # os.environ['ref_dir'] = "/u/project/cluo/chliu/Genomes/human_gencode_v40" or the below loop +# # (use absolute versus relative path of parameters.env file if below not working!) +# envvar_needed = ['dir_proj', 'ref_dir', 'ref_gtf', 'ref_chromsizes'] +# try: +# os.environ['ref_dir'] +# except KeyError: +# envspec = pd.read_csv("../snmCT_parameters.env", sep = "=", comment="#", header = None +# ).set_axis(['varname', 'varpath'], axis = 1 +# ).query('varname in @envvar_needed') +# for index, row in envspec.iterrows(): +# os.environ[row["varname"]] = row["varpath"] +# os.chdir(os.environ['dir_proj']) diff --git a/Scripts/A01b_plate_metadata.py b/Scripts/A01b_plate_metadata.py index eee6d09..dde25f5 100644 --- a/Scripts/A01b_plate_metadata.py +++ b/Scripts/A01b_plate_metadata.py @@ -19,19 +19,19 @@ import pandas as pd import os -# if running interactively, need to load some lines from snmCT_parameters.env -# or manually spec os.environ -- e.g., via os.environ['dir_proj'] = "mydirectory" or this below loop -# (check relative path of parameters.env file or change to absolute if below not working!) -envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate'] -try: - os.environ['dir_proj'] -except KeyError: - envspec = pd.read_csv("../snmCT_parameters.env", sep = "=", comment="#", header = None - ).set_axis(['varname', 'varpath'], axis = 1 - ).query('varname in @envvar_needed') - for index, row in envspec.iterrows(): - os.environ[row["varname"]] = row["varpath"] -os.chdir(os.environ['dir_proj']) +# # if running interactively, need to load some lines from snmCT_parameters.env +# # or manually spec os.environ -- e.g., via os.environ['dir_proj'] = "mydirectory" or this below loop +# # (check relative path of parameters.env file or change to absolute if below not working!) +# envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate'] +# try: +# os.environ['dir_proj'] +# except KeyError: +# envspec = pd.read_csv("../snmCT_parameters.env", sep = "=", comment="#", header = None +# ).set_axis(['varname', 'varpath'], axis = 1 +# ).query('varname in @envvar_needed') +# for index, row in envspec.iterrows(): +# os.environ[row["varname"]] = row["varpath"] +# os.chdir(os.environ['dir_proj']) diff --git a/Scripts/A01c_well_filepaths.py b/Scripts/A01c_well_filepaths.py index d1bbf06..5ab4f4d 100644 --- a/Scripts/A01c_well_filepaths.py +++ b/Scripts/A01c_well_filepaths.py @@ -14,19 +14,19 @@ import numpy as np import os -# if running interactively, need to load some lines from snmCT_parameters.env -# or manually spec os.environ -- e.g., via the below loop -# (use absolute versus relative path of parameters.env file if below not working!) -envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate', 'metadat_well'] -try: - os.environ['metadat_well'] -except KeyError: - envspec = pd.read_csv("snmCT_parameters.env", sep = "=", comment="#", header = None - ).set_axis(['varname', 'varpath'], axis = 1 - ).query('varname in @envvar_needed') - for index, row in envspec.iterrows(): - os.environ[row["varname"]] = row["varpath"] -os.chdir(os.environ['dir_proj']) +# # if running interactively, need to load some lines from snmCT_parameters.env +# # or manually spec os.environ -- e.g., via the below loop +# # (use absolute versus relative path of parameters.env file if below not working!) +# envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate', 'metadat_well'] +# try: +# os.environ['metadat_well'] +# except KeyError: +# envspec = pd.read_csv("snmCT_parameters.env", sep = "=", comment="#", header = None +# ).set_axis(['varname', 'varpath'], axis = 1 +# ).query('varname in @envvar_needed') +# for index, row in envspec.iterrows(): +# os.environ[row["varname"]] = row["varpath"] +# os.chdir(os.environ['dir_proj'])