Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new module : jvarkit/vcffilterjdk #6621

Merged
merged 24 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c088a57
vcffilterdjdk
lindenb Sep 10, 2024
979ab53
update params
lindenb Sep 10, 2024
6ed3f43
update params
lindenb Sep 10, 2024
a6af30a
Merge branch 'master' into pl_jvarkit_vcffilterjdk
lindenb Sep 10, 2024
202e632
oppsss tag and TODO
lindenb Sep 10, 2024
e7e6aca
Merge branch 'pl_jvarkit_vcffilterjdk' of https://github.com/lindenb/…
lindenb Sep 10, 2024
ca16e60
target/region
lindenb Sep 12, 2024
19dbc29
Merge branch 'master' into pl_jvarkit_vcffilterjdk
lindenb Sep 12, 2024
139f36a
Merge branch 'master' into pl_jvarkit_vcffilterjdk
lindenb Sep 13, 2024
9650802
answers to review
lindenb Sep 13, 2024
5fd36e8
f...g space
lindenb Sep 13, 2024
320a657
Merge branch 'pl_jvarkit_vcffilterjdk' of https://github.com/lindenb/…
lindenb Sep 13, 2024
4d4745b
fix conda problem https://nfcore.slack.com/archives/CJRH30T6V/p172623…
lindenb Sep 13, 2024
2fe0b8f
Merge branch 'master' into pl_jvarkit_vcffilterjdk
jfy133 Sep 16, 2024
7aa618a
Merge branch 'master' into pl_jvarkit_vcffilterjdk
lindenb Sep 17, 2024
6a81fc6
add test+bed
lindenb Sep 17, 2024
e128738
reset polyx
lindenb Sep 17, 2024
02431f5
prevent test exception md5sum for empty file
lindenb Sep 17, 2024
2450755
Merge branch 'master' into pl_jvarkit_vcffilterjdk
lindenb Sep 17, 2024
62ecb77
update main.nf.test
lindenb Sep 19, 2024
2878799
Merge branch 'master' into pl_jvarkit_vcffilterjdk
lindenb Sep 20, 2024
ba8aea1
update meta.yml
lindenb Sep 20, 2024
a1ef479
remove suggestion
lindenb Sep 20, 2024
c3176e8
Merge branch 'pl_jvarkit_vcffilterjdk' of https://github.com/lindenb/…
lindenb Sep 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions modules/nf-core/jvarkit/vcffilterjdk/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the jvarkit/vcffilterjdk module.
# Packages are pinned with the `channel::package=version` form (note the double colon).
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::jvarkit=2024.08.25"
- "bioconda::bcftools=1.20"
88 changes: 88 additions & 0 deletions modules/nf-core/jvarkit/vcffilterjdk/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Filters a VCF/BCF with jvarkit vcffilterjdk, wrapped between two `bcftools view` calls:
//   bcftools view (args1, optional regions/targets) | jvarkit vcffilterjdk (args2, script, pedigree) | bcftools view (args3)
// The output file extension is derived from the output type found in `task.ext.args3`.
process JVARKIT_VCFFILTERJDK {
tag "$meta.id"
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/jvarkit:2024.08.25--hdfd78af_1':
'biocontainers/jvarkit:2024.08.25--hdfd78af_1' }"

input:
// vcf: input variants; tbi: optional index; regions_file: optional restriction file (all three may be empty except vcf).
// fasta, fai and dict are staged but are not referenced by the command below.
tuple val(meta), path(vcf), path(tbi), path(regions_file)
tuple val(meta2), path(fasta)
tuple val(meta3), path(fai)
tuple val(meta4), path(dict)
tuple val(meta5), path(code)
jfy133 marked this conversation as resolved.
Show resolved Hide resolved
tuple val(meta6), path(pedigree)

output:
jfy133 marked this conversation as resolved.
Show resolved Hide resolved
tuple val(meta), path("*.${extension}"), emit: vcf
jfy133 marked this conversation as resolved.
Show resolved Hide resolved
tuple val(meta), path("*.tbi") , emit: tbi, optional: true
tuple val(meta), path("*.csi") , emit: csi, optional: true
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
// args1 -> first `bcftools view`, args2 -> `jvarkit vcffilterjdk`, args3 -> final `bcftools view`
def args1 = task.ext.args1 ?: ''
def args2 = task.ext.args2 ?: ''
def args3 = task.ext.args3 ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
// Optional inputs only contribute an option when a file was actually provided.
def script_file = code ? "--script \"${code}\"" : ""
def pedigree_file = pedigree ? " --pedigree \"${pedigree}\" " : ""
// With an index the regions file is passed as --regions-file, otherwise as --targets-file.
def regions_cmd = regions_file ? (tbi ? " --regions-file" : " --targets-file") + " \"${regions_file}\" " : ""
// Heap limit in MB: `task.memory.giga` truncates allocations below 1 GB to `-Xmx0g`,
// and dereferencing `task.memory` fails when no memory directive is set, so omit the flag in that case.
def jvm_heap = task.memory ? "-Xmx${task.memory.mega}m" : ''

// Assigned without `def`; presumably so the `output:` glob `*.${extension}` can see it - TODO confirm.
extension = getVcfExtension(args3); /* custom function, see below */

if ("$vcf" == "${prefix}.${extension}") error "Input and output names are the same, set prefix in module configuration to disambiguate!"
"""
mkdir -p TMP

bcftools view \\
-O v \\
${regions_cmd} \\
${args1} \\
"${vcf}" |\\
jvarkit ${jvm_heap} -XX:-UsePerfData -Djava.io.tmpdir=TMP vcffilterjdk \\
${pedigree_file} \\
${script_file} \\
${args2} |\\
bcftools view \\
--output "${prefix}.${extension}" \\
${args3}

rm -rf TMP

cat <<-END_VERSIONS > versions.yml
"${task.process}":
bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
jvarkit: \$(jvarkit -v)
END_VERSIONS
"""

stub:
// Creates an empty placeholder named like the real output, plus versions.yml.
def args3 = task.ext.args3 ?: ''
extension = getVcfExtension(args3); /* custom function, see below */
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch "${prefix}.${extension}"

cat <<-END_VERSIONS > versions.yml
"${task.process}":
bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
jvarkit: \$(jvarkit -v)
END_VERSIONS
"""
}



// Custom function mapping a bcftools output-type option to the matching file extension.
//
// @param args  extra arguments given to the final `bcftools view` (task.ext.args3); null is treated as empty
// @return      "bcf.gz" (type b), "bcf" (type u), "vcf.gz" (type z), or "vcf" (type v or no output type given)
//
// Recognises the short and long option with or without a separator:
// `-Oz`, `-O z`, `--output-type z` and `--output-type=z`.
String getVcfExtension(String args) {
    def opts = args ?: ''
    // True when `opts` requests output type `t`. This matches everything the previous
    // `contains("--output-type t") || contains("-Ot")` test matched, plus the separated spellings.
    def wants = { String t -> (opts =~ ('(?:-O\\s*|--output-type(?:\\s+|=))' + t)).find() }
    // Lookup order keeps the original precedence: b, then u, then z; anything else is plain VCF.
    return wants('b') ? "bcf.gz" :
           wants('u') ? "bcf" :
           wants('z') ? "vcf.gz" :
           "vcf"
}
118 changes: 118 additions & 0 deletions modules/nf-core/jvarkit/vcffilterjdk/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "jvarkit_vcffilterjdk"
description: Filtering VCF with dynamically-compiled java expressions
keywords:
- vcf
- bcf
- filter
jfy133 marked this conversation as resolved.
Show resolved Hide resolved
- variant
- java
- script
tools:
- "jvarkit":
description: "Java utilities for Bioinformatics."
homepage: "https://github.com/lindenb/jvarkit"
documentation: "https://jvarkit.readthedocs.io/"
tool_dev_url: "https://github.com/lindenb/jvarkit"
doi: "10.1093/bioinformatics/btx734"
licence: ["MIT License"]
jfy133 marked this conversation as resolved.
Show resolved Hide resolved
args_id: "$args2"

- "bcftools":
description: |
View, subset and filter VCF or BCF files by position and filtering expression. Convert between VCF and BCF
homepage: "http://samtools.github.io/bcftools/bcftools.html"
documentation: "http://www.htslib.org/doc/bcftools.html"
doi: "10.1093/bioinformatics/btp352"
licence: ["MIT"]
args_id: ["$args1", "$args3"]
input:
- meta:
type: map
description: |
Groovy Map containing VCF information
jfy133 marked this conversation as resolved.
Show resolved Hide resolved
e.g. [ id:'test_reference' ]
- vcf:
type: file
description: Input VCF/BCF file
pattern: "*.{vcf,bcf,vcf.gz,bcf.gz}"
- tbi:
type: file
description: Optional VCF/BCF index file
pattern: "*.{tbi,csi}"
- regions_file:
type: file
description: Optional. Restrict to regions listed in a file
pattern: "*.{bed,bed.gz,txt,tsv}"
- meta2:
type: map
description: |
Groovy Map containing fasta information
e.g. [ id:'test_reference' ]
- fasta:
type: file
description: File containing fasta reference genome information
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have not changed it since I first commented on this - does the tool expect multiple FASTA files?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understand the problem raised in your comment here. No, there is only one fasta file. Where is the confusion?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'Groovy Map containing' is what is confusing me; it's just the file, no?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
description: Groovy Map containing fasta reference genome information
description: File containing fasta reference genome information

pattern: "*.fasta"
- meta3:
type: map
description: |
Groovy Map containing fasta.fai information
e.g. [ id:'test_reference' ]
- fai:
type: file
description: File containing fasta index information
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above... should this be a map?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... hum.. no, there is only one fai file (?..)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
description: Groovy Map containing fasta index information
description: File containing fasta index information

pattern: "*.fasta.fai"
- meta4:
type: map
description: |
Groovy Map containing fasta.dict information
e.g. [ id:'test_reference' ]
- dict:
type: file
description: File containing reference genome information for GATK sequence dictionary
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
description: Groovy Map containing reference genome information for GATK sequence dictionary
description: File containing reference genome information for GATK sequence dictionary

pattern: "*.dict"
- meta5:
type: map
description: |
Groovy Map containing code information
e.g. [ id:'test_reference' ]
- code:
type: file
description: File containing custom user code. May be empty if the script is provided via `task.ext.args2`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
description: custom user code . May be empty if script if provided via `task.ext.args2`.
description: File containing custom user code. May be empty if script if provided via `task.ext.args2`.

pattern: "*.{code,script,txt,tsv,java,js}"
- meta6:
type: map
description: |
Groovy Map containing pedigree information
e.g. [ id:'test_reference' ]
- pedigree:
type: file
description: Optional jvarkit pedigree.
pattern: "*.{tsv,ped,pedigree}"
output:
- meta:
type: map
description: |
Groovy Map containing VCF information
e.g. [ id:'test', single_end:false ]
- vcf:
type: file
description: VCF filtered output file
pattern: "*.{vcf,bcf,vcf.gz,bcf.gz}"
- csi:
type: file
description: Default VCF file index
pattern: "*.csi"
- tbi:
type: file
description: Alternative VCF file index
pattern: "*.tbi"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@lindenb"
maintainers:
- "@lindenb"
119 changes: 119 additions & 0 deletions modules/nf-core/jvarkit/vcffilterjdk/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// nf-core modules test jvarkit/vcffilterjdk
// nf-test suite for JVARKIT_VCFFILTERJDK: a plain VCF run, a VCF run restricted by a BED file, and a stub run.
// All test inputs are resolved from `params.modules_testdata_base_path` (the `params.test_data` map is no longer used).
nextflow_process {

name "Test Process JVARKIT_VCFFILTERJDK"
script "../main.nf"
process "JVARKIT_VCFFILTERJDK"
config "./nextflow.config"


tag "modules"
tag "modules_nfcore"
tag "jvarkit"
tag "jvarkit/vcffilterjdk"

// Plain VCF, no index, no regions file, no script file, no pedigree: snapshot the variants MD5 and versions.
test("sarscov2 - vcf") {

when {
process {
"""
input[0] =[
[id:"vcf_test"],
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true),
[],
[]
]
input[1] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
input[2] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) ]
input[3] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.dict', checkIfExists: true) ]
input[4] = [ [] , []]
input[5] = [ [] , []]
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(
path(process.out.vcf[0][1]).vcf.variantsMD5,
process.out.versions
).match()
}
)
}

}



// Same VCF plus a BED file; only the existence of the output and the versions file are checked.
test("sarscov2 - vcf+bed") {

when {
process {
"""
input[0] =[
[id:"vcf_test"],
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true),
[],
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true)
]
input[1] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
input[2] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) ]
input[3] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.dict', checkIfExists: true) ]
input[4] = [ [] , []]
input[5] = [ [] , []]
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert file(process.out.vcf[0][1]).exists() },
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does .variantsMD5 not work here, like you did above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The BED provided in the test dataset does not overlap any variant. The output vcf is empty, When I test the variants md5, I then get an error md5sum for empty file

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK fair enough

{ assert snapshot(process.out.versions).match()
}
)
}
}




// Stub run: snapshot the placeholder output file and the versions file.
test("sarscov2 - vcf - stub") {

options "-stub"

when {
process {
"""
input[0] =[
[id:"vcf_test"],
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true),
[],
[]
]
input[1] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
input[2] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) ]
input[3] = [ [:] , file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.dict', checkIfExists: true) ]
input[4] = [ [] , []]
input[5] = [ [] , []]
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(
path(process.out.vcf[0][1]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For stubs I believe you can just use

Suggested change
path(process.out.vcf[0][1]),
process.out,

within the snapshot

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jfy133 that didn't work, there is an error (snap) (I keep path(process.out.vcf[0][1]),)

process.out.versions
).match()
}
)
}

}


}
45 changes: 45 additions & 0 deletions modules/nf-core/jvarkit/vcffilterjdk/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"sarscov2 - vcf": {
"content": [
"335cdc0f8c403378e1e9d75c41c3736f",
jfy133 marked this conversation as resolved.
Show resolved Hide resolved
[
"versions.yml:md5,3601751995727e2ee7102d8ef18e5304"
]
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.04.4"
},
"timestamp": "2024-09-03T14:00:13.118369362"
},


"sarscov2 - vcf+bed": {
"content": [
[
"versions.yml:md5,3601751995727e2ee7102d8ef18e5304"
]
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.04.4"
},
"timestamp": "2024-09-03T14:00:13.118369362"
},


"sarscov2 - vcf - stub": {
"content": [
"vcf_test.vcf:md5,d41d8cd98f00b204e9800998ecf8427e",
[
"versions.yml:md5,3601751995727e2ee7102d8ef18e5304"
]
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.04.4"
},
"timestamp": "2024-09-03T14:00:13.118369362"
}

}
5 changes: 5 additions & 0 deletions modules/nf-core/jvarkit/vcffilterjdk/tests/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Test configuration: passes a filter expression to the jvarkit step via `ext.args2`.
// The expression evaluates to true for variants whose start position is odd.
process {
    withName: 'JVARKIT_VCFFILTERJDK' {
        ext.args2 = " --expression 'return variant.getStart()%2==1;' "
    }
}
2 changes: 2 additions & 0 deletions modules/nf-core/jvarkit/vcffilterjdk/tests/tags.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
jvarkit/vcffilterjdk:
- "modules/nf-core/jvarkit/vcffilterjdk/**"
Loading