Skip to content
This repository has been archived by the owner on Aug 20, 2024. It is now read-only.

Duplicate checking #72

Merged
merged 2 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions docs/samplesheets/fromSamplesheet.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ This parameter be described in the Nextflow parameter schema using as a file, wi

The given sample sheet schema specified in the `schema` key is then loaded and used for validation and sample sheet generation.

An additional function optional argument can be used:
Some additional optional function arguments can be used:

- File name for the pipeline parameters schema. (Default: `nextflow_schema.json`)
- `schema_filename`: File name for the pipeline parameters schema. (Default: `nextflow_schema.json`)
- `skip_duplicate_check`: Skip the check for duplicate entries. The check can also be skipped with the `--validationSkipDuplicateCheck` parameter. (Default: `false`)

```groovy
Channel.fromSamplesheet('input')
Expand All @@ -35,6 +36,7 @@ Channel.fromSamplesheet('input')
Channel.fromSamplesheet(
'input',
schema_filename: 'custom_nextflow_schema.json',
skip_duplicate_check: false
)
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class SamplesheetConverter {
private static List<String> schemaErrors = []
private static List<String> warnings = []

private static List<Map> rows = []

static boolean hasErrors() { errors.size()>0 }
static Set<String> getErrors() { errors.sort().collect { "\t${it}".toString() } as Set }

Expand All @@ -54,7 +56,8 @@ class SamplesheetConverter {

static List convertToList(
Path samplesheetFile,
Path schemaFile
Path schemaFile,
Boolean skipDuplicateCheck
) {

def Map schemaMap = (Map) new JsonSlurper().parseText(schemaFile.text)
Expand All @@ -78,14 +81,15 @@ class SamplesheetConverter {
def Map<String,List<String>> booleanUniques = [:]
def Map<String,List<Map<String,String>>> listUniques = [:]
def Boolean headerCheck = true
this.rows = []
resetCount()

def List outputs = samplesheetList.collect { Map<String,String> fullRow ->
increaseCount()

Map<String,String> row = fullRow.findAll { it.value != "" }
def Set rowKeys = row.keySet()
def String yamlInfo = fileType == "yaml" ? " for sample ${this.getCount()}." : ""
def String yamlInfo = fileType == "yaml" ? " for entry ${this.getCount()}." : ""

// Check the header (CSV/TSV) or present fields (YAML)
if(headerCheck) {
Expand All @@ -99,6 +103,13 @@ class SamplesheetConverter {
}
}

// Check for row uniqueness
if(!skipDuplicateCheck && this.rows.contains(row)) {
def Integer firstDuplicate = this.rows.findIndexOf { it == row }
this.errors << "The samplesheet contains duplicate rows for entry ${firstDuplicate + 1} and entry ${getCount()} (${row})".toString()
}
this.rows.add(row)

def Map meta = [:]
def ArrayList output = []

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ class SchemaValidator extends PluginExtensionPoint {

// Set defaults for optional inputs
def String schemaFilename = options?.containsKey('schema_filename') ? options.schema_filename as String : 'nextflow_schema.json'
def Boolean skipDuplicateCheck = options?.containsKey('skip_duplicate_check') ? options.skip_duplicate_check as Boolean : params.validationSkipDuplicateCheck ? params.validationSkipDuplicateCheck as Boolean : false

def slurper = new JsonSlurper()
def Map parsed = (Map) slurper.parse( Path.of(getSchemaPath(baseDir, schemaFilename)) )
Expand Down Expand Up @@ -194,7 +195,7 @@ class SchemaValidator extends PluginExtensionPoint {

// Convert to channel
final channel = CH.create()
List arrayChannel = SamplesheetConverter.convertToList(samplesheetFile, schemaFile)
List arrayChannel = SamplesheetConverter.convertToList(samplesheetFile, schemaFile, skipDuplicateCheck)
session.addIgniter {
arrayChannel.each {
channel.bind(it)
Expand Down Expand Up @@ -227,6 +228,9 @@ class SchemaValidator extends PluginExtensionPoint {
if( !params.containsKey("validationSchemaIgnoreParams") ) {
params.validationSchemaIgnoreParams = false
}
if( !params.containsKey("validationSkipDuplicateCheck") ) {
params.validationSkipDuplicateCheck = false
}

return params
}
Expand All @@ -236,7 +240,15 @@ class SchemaValidator extends PluginExtensionPoint {
// Add expected params
//
List addExpectedParams() {
def List expectedParams = ["validationFailUnrecognisedParams", "validationLenientMode", "monochrome_logs", "help", "validationShowHiddenParams", "validationSchemaIgnoreParams"]
def List expectedParams = [
"validationFailUnrecognisedParams",
"validationLenientMode",
"monochrome_logs",
"help",
"validationShowHiddenParams",
"validationSchemaIgnoreParams",
"validationSkipDuplicateCheck"
]

return expectedParams
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ class SamplesheetConverterTest extends Dsl2Spec{
!stdout
}

def 'errors before channel conversion' () {
def 'errors before channel conversion' () {
given:
def SCRIPT_TEXT = '''
include { fromSamplesheet } from 'plugin/nf-validation'
Expand Down Expand Up @@ -250,4 +250,31 @@ class SamplesheetConverterTest extends Dsl2Spec{
errorMessages[12] == "* -- Entry 3 - field_2: expected type: Integer, found: String (false)"
!stdout
}

// Checks that loading a samplesheet with identical rows fails with a
// SchemaValidationException whose message reports the duplicated entries.
def 'duplicates' () {
given:
// Pipeline script under test: reads duplicate.csv through fromSamplesheet.
// No skip_duplicate_check option is passed in this call.
def SCRIPT_TEXT = '''
include { fromSamplesheet } from 'plugin/nf-validation'

params.input = 'src/testResources/duplicate.csv'

workflow {
Channel.fromSamplesheet("input", schema_filename:"src/testResources/nextflow_schema_with_samplesheet_converter.json").view()
}
'''

when:
dsl_eval(SCRIPT_TEXT)
// Keep only captured output lines that start with '[[' (presumably the
// channel items printed by .view() -- confirm against the other tests).
def stdout = capture
.toString()
.readLines()
.findResults {it.startsWith('[[') ? it : null }

then:
// The script evaluation must fail with a validation exception.
def error = thrown(SchemaValidationException)
def errorMessages = error.message.readLines()
// The first line of the message is the samplesheet error header.
errorMessages[0] == "Samplesheet errors:"
// The fifth line reports entries 2 and 3 (the two identical data rows in
// duplicate.csv) together with the non-empty fields of the duplicated row.
errorMessages[4] == "\tThe samplesheet contains duplicate rows for entry 2 and entry 3 ([field_4:string1, field_5:25, field_6:false])"
// No '[['-prefixed line may have been captured.
!stdout
}
}
4 changes: 4 additions & 0 deletions plugins/nf-validation/src/testResources/duplicate.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10,field_11
fullField,10,true,string1,25,false,src/testResources/test.txt,src/testResources/testDir,src/testResources/test.txt,unique1,1
,,,string1,25,false,,,,,,
,,,string1,25,false,,,,,,