Merge branch 'sample_id_check' into 'dev'

Sample id check See merge request epi2melabs/workflows/wf-single-cell!10
epi2me-labs · Sep 7, 2022 · dea1979 · dea1979
2 parents 27c5b20 + 6b694b6
commit dea1979
Show file tree

Hide file tree

Showing 12 changed files with 238 additions and 216 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.1.2]
+### Added
+- Check that the sample_ids in the single cell sample sheet are identical to those of the fastq data.
+
 ## [v0.1.1]
 ### Changed
 - Combined gathering and splitting of fastqs into a single process.

diff --git a/README.md b/README.md
@@ -84,7 +84,7 @@ The main options are:
 The single_cell_sample_sheet contains details about the input sample_ids, the 10X kits used (e.g. `3prime` or `5prime`), the kit versions used (`v2` or `v3` for the 3' kit, `v1` for the 5' kit), a rough estimate of the number of cells in the library. The cell count estimate specified with `exp_cells` and can be a very rough estimate (500 is a robust default value if the number is not known).
 
 
-The sample_id field should correspond to sample_id which is defined either in the `sample_sheet`,  given by the `sample` parameter (for single sample runs) or, if no `sample_sheet` or `sample` is given, is derived from each folder containing the fastq files.
+The sample_id field should correspond to the sample_id which is defined either in the `sample_sheet` or by the `sample` parameter (for single sample runs). If no `sample_sheet` or `sample` is given, the sample_id is derived from each folder containing the fastq files or, if a single file is given, from the basename of that file (data.fastq.gz -> data).
 
 An example sheet with one sample is:
 ```

diff --git a/bin/ping.py b/bin/ping.py
diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -34,7 +34,7 @@ The main options are:
 The single_cell_sample_sheet contains details about the input sample_ids, the 10X kits used (e.g. `3prime` or `5prime`), the kit versions used (`v2` or `v3` for the 3' kit, `v1` for the 5' kit), a rough estimate of the number of cells in the library. The cell count estimate specified with `exp_cells` and can be a very rough estimate (500 is a robust default value if the number is not known).
 
 
-The sample_id field should correspond to sample_id which is defined either in the `sample_sheet`,  given by the `sample` parameter (for single sample runs) or, if no `sample_sheet` or `sample` is given, is derived from each folder containing the fastq files.
+The sample_id field should correspond to the sample_id which is defined either in the `sample_sheet` or by the `sample` parameter (for single sample runs). If no `sample_sheet` or `sample` is given, the sample_id is derived from each folder containing the fastq files or, if a single file is given, from the basename of that file (data.fastq.gz -> data).
 
 An example sheet with one sample is:
 ```

diff --git a/environment.yaml b/environment.yaml
@@ -23,4 +23,6 @@ dependencies:
   - pysam>=0.16.0
   - vsearch==2.15.1
   - umap-learn==0.5.2
-  - fastcat
+  - fastcat
+  - stringtie==2.2.1
+  - gffread
diff --git a/lib/Pinguscript.groovy b/lib/Pinguscript.groovy
@@ -0,0 +1,57 @@
+import static groovy.json.JsonOutput.toJson
+import groovy.json.JsonBuilder
+import groovy.json.JsonSlurper
+
+
+/**
+ * Posts workflow telemetry ("ping") messages as JSON to
+ * https://ping.oxfordnanoportal.com/epilaby.
+ */
+class Pinguscript {
+    /**
+     * Build a ping payload and POST it to the ping endpoint.
+     *
+     * @param workflow       object providing manifest.name, sessionId and profile
+     * @param message        event label placed in the payload; for any value other
+     *                       than "start" the contents of params.json are merged
+     *                       into the payload's meta section
+     * @param error_message  error text placed in the payload's meta section
+     * @param out_dir        directory in which params.json is looked up
+     * @param params         object providing wf.agent
+     * @return the response body returned by the server
+     * @throws UncheckedIOException if the HTTP request fails or times out
+     */
+    public static String ping_post(workflow, message, error_message, out_dir, params) {
+        def msgId = UUID.randomUUID().toString()
+        def hosthash
+        try {
+            // Only an MD5 digest of the host name is sent, never the name itself.
+            hosthash = InetAddress.getLocalHost().getHostName().md5()
+        } catch(Exception e) {
+            hosthash = "Unavailable"
+        }
+        def opsys = System.properties['os.name'].toLowerCase()
+        if (System.properties['os.version'].toLowerCase().contains("wsl")) {
+            opsys = "WSL"
+        }
+
+        // params.json (if present) is merged into the metadata for every
+        // message except "start".
+        def any_other_data = [:]
+        File params_file = new File("$out_dir/params.json")
+        if (params_file.exists() && "$message" != "start") {
+            any_other_data = new JsonSlurper().parse(params_file)
+        }
+
+        def meta = [
+            "error": "$error_message".toString(),
+            "profile": "$workflow.profile".toString(),
+            "agent": "$params.wf.agent".toString()] + any_other_data
+        def body = [
+            "tracking_id": ["msg_id": msgId, "version": '2.0.1'],
+            "hostname": hosthash.toString(),
+            "os": opsys.toString(),
+            "session": "$workflow.sessionId".toString(),
+            "data": [
+                "workflow": "$workflow.manifest.name".toString(),
+                "message": message,
+                "meta": meta],
+            "source": "workflow"]
+        String payload = new JsonBuilder(body).toString()
+
+        String postResult
+        try {
+            ((HttpURLConnection)new URL('https://ping.oxfordnanoportal.com/epilaby').openConnection()).with({
+                requestMethod = 'POST'
+                doOutput = true
+                // The default timeout is infinite; bound both phases so an
+                // unresponsive server cannot stall the caller.
+                setConnectTimeout(5000)
+                setReadTimeout(5000)
+                setRequestProperty('Content-Type', 'application/json')
+                setRequestProperty('accept', 'application/json')
+                outputStream.withPrintWriter({ printWriter ->
+                    printWriter.write(payload)
+                })
+                postResult = inputStream.text
+            })
+        } catch(IOException e) {
+            // Callers guard this method with catch(RuntimeException); a checked
+            // IOException would slip past that guard, so rethrow it unchecked
+            // with the original cause preserved.
+            throw new UncheckedIOException(e)
+        }
+        return postResult
+    }
+}
diff --git a/lib/ping.nf b/lib/ping.nf
diff --git a/main.nf b/main.nf
@@ -27,6 +27,7 @@ process summariseCatChunkReads {
     cpus 1
     input:
         tuple path(directory), val(meta)
+        val check  // Only emitted when the sample_id check passes; if the check fails the pipeline halts.
     output:
         tuple val("${meta.sample_id}"), path("${meta.sample_id}.stats"), emit: stats
         tuple val("${meta.sample_id}"), path("chunks/*"), emit: fastq_chunks
@@ -102,6 +103,32 @@ process pack_images {
     """
 }
 
+process check_sampleids{
+    // Check that the sample_ids given in the single_cell_sample_sheet are identical
+    // to those of the fastq inputs. Both inputs are headerless, one id per line.
+    label "singlecell"
+    input:
+        path fastqingress_ids
+        path sc_sample_sheet_ids
+    output:
+        // 'diff' is written only when the id sets match; its absence signals failure.
+        path 'diff', optional: true, emit: diff
+    """
+    #!/usr/bin/env python
+    import pandas as pd
+    # header=None: without it the first sample_id is consumed as a column name.
+    kw = dict(header=None, index_col=None, dtype=str, keep_default_na=False)
+    df_f = pd.read_csv("$fastqingress_ids", **kw)
+    df_s = pd.read_csv("$sc_sample_sheet_ids", **kw)
+    ids_f, ids_s = set(df_f.iloc[:, 0]), set(df_s.iloc[:, 0])
+    if ids_f == ids_s:
+        print('Success. The sample_ids are the same')
+        open('diff', 'w').close()
+    else:
+        print("The sample_ids are different:", sorted(ids_f ^ ids_s))
+    """
+}
+
 
 // workflow module
 workflow pipeline {
@@ -110,6 +137,8 @@ workflow pipeline {
         sc_sample_sheet
         ref_genome_dir
         umap_genes
+        sample_kits
+        sample_ids_check
 
     main:
         ref_genome_fasta = file("${ref_genome_dir}/fasta/genome.fa", checkIfExists: true)
@@ -123,16 +152,8 @@ workflow pipeline {
         }
 
         bc_longlist_dir = file("${projectDir}/data", checkIfExists: true)
-
-        sample_kits = Channel.fromPath(sc_sample_sheet)
-                    .splitCsv(header:true)
-                    .map { row -> tuple(
-                              row.sample_id, 
-                              row.kit_name, 
-                              row.kit_version,
-                              row.exp_cells)}
 
-        summariseCatChunkReads(reads)
+        summariseCatChunkReads(reads, sample_ids_check)
 
         stranding(
             summariseCatChunkReads.out.fastq_chunks,
@@ -153,18 +174,28 @@ workflow pipeline {
             ref_genes_gtf,
             umap_genes,
             bc_longlist_dir,
-            sample_kits
+            sample_kits,
+            ref_genome_fasta
         )
     emit:
         results = process_bams.out.results
         umap_plots = process_bams.out.umap_plots
         config_stats = stranding.out.config_stats
+
 }
 
 
 // entrypoint workflow
 WorkflowMain.initialise(workflow, params, log)
 workflow {
+
+    if (params.disable_ping == false) {
+        try { 
+            Pinguscript.ping_post(workflow, "start", "none", params.out_dir, params)
+        } catch(RuntimeException e1) {
+        }
+    }
+
     sc_sample_sheet = file(params.single_cell_sample_sheet, checkIfExists: true)
     ref_genome_dir = file(params.ref_genome_dir, checkIfExists: true)
     umap_genes = file(params.umap_plot_genes, checkIfExists: true)
@@ -177,8 +208,32 @@ workflow {
             "sample_sheet":params.sample_sheet,
             "sanitize": params.sanitize_fastq,
             "output":params.out_dir])
+
+    sample_kits = Channel.fromPath(sc_sample_sheet)
+                    .splitCsv(header:true)
+                    .map { row -> tuple(
+                              row.sample_id, 
+                              row.kit_name, 
+                              row.kit_version,
+                              row.exp_cells)}
+
+    fastqingress_ids = reads.map{it -> it[1]['sample_id']}
+    .collectFile(name: 'fastingress_read_ids.csv', newLine: true)
+
+    sample_kit_ids = sample_kits.map{it -> it[0]}
+        .collectFile(name: 'sc_sample_sheet_ids.csv', newLine: true)    
+
+    check_sampleids(fastqingress_ids, sample_kit_ids)
+
+    check_sampleids.out.ifEmpty{
+        exit 1,
+        """
+        The sample_ids in the single_cell_sample_sheet do not match those
+        of the fastq inputs. Please see the README for instructions.""".stripIndent()}
 
-    pipeline(reads, sc_sample_sheet, ref_genome_dir, umap_genes)
+    // first() converts the queue channel to a value channel.
+    pipeline(reads, sc_sample_sheet, ref_genome_dir, umap_genes, sample_kits,
+        check_sampleids.out.first())
 
     pack_images(pipeline.out.umap_plots)
 
@@ -197,3 +252,20 @@ workflow {
     makeReport()
     output_report(makeReport.out)
 }
+
+if (params.disable_ping == false) {
+    // Pings are best-effort. The HTTP call inside ping_post can raise an
+    // IOException, which is not a RuntimeException, so catch Exception here.
+    workflow.onComplete {
+        try {
+            Pinguscript.ping_post(workflow, "end", "none", params.out_dir, params)
+        } catch(Exception ignored) {
+        }
+    }
+    workflow.onError {
+        try {
+            Pinguscript.ping_post(workflow, "error", "$workflow.errorMessage", params.out_dir, params)
+        } catch(Exception ignored) {
+        }
+    }
+}
diff --git a/nextflow.config b/nextflow.config
@@ -9,7 +9,6 @@
 //
 // for further help editing this file.
 
-
 params {
     help = false
     fastq = null
@@ -18,13 +17,14 @@ params {
     sample = null
     single_cell_sample_sheet = null
     sanitize_fastq = false
-    wfversion = "v0.1.1"
+    wfversion = "v0.1.2"
     aws_image_prefix = null
     aws_queue = null
     report_name = "report"
     disable_ping = false
     kit_config = null
     max_threads = 4
+
 
     ref_genome_dir = null
     read_structure_batch_size = 40000
@@ -160,4 +160,4 @@ report {
 trace {
   enabled = true
   file = "${params.out_dir}/execution/trace.txt"
-}
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -222,7 +222,7 @@
         },
         "wfversion": {
             "type": "string",
-            "default": "v0.1.1",
+            "default": "v0.1.2",
             "hidden": true
         },
         "monochrome_logs": {