Skip to content

Commit

Permalink
EVA-3632 - Fix validation output file renaming in SV and naming convention (#215)
Browse files Browse the repository at this point in the history

* Fix validation output file renaming
* Report the naming convention map when none of the chromosomes are resolved
  • Loading branch information
tcezard authored Aug 6, 2024
1 parent fafb09e commit 5e669ac
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 39 deletions.
36 changes: 19 additions & 17 deletions eva_submission/eload_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def mark_valid_files_and_metadata(self, merge_per_analysis):
def _get_vcf_files(self):
    """Return every VCF file registered under the submission's analyses.

    Walks the analysis aliases found under ``submission -> analyses`` in the
    ELOAD config and concatenates the ``vcf_files`` list of each analysis.
    Analyses whose ``vcf_files`` entry is missing or empty contribute nothing.

    :return: flat list of VCF file entries, in config iteration order
    """
    vcf_files = []
    for analysis_alias in self.eload_cfg.query('submission', 'analyses'):
        # The alias is already the key used in the config, so it is looked up
        # verbatim (a single query per analysis).
        files = self.eload_cfg.query('submission', 'analyses', analysis_alias, 'vcf_files')
        if files:
            vcf_files.extend(files)
    return vcf_files

Expand Down Expand Up @@ -224,14 +224,17 @@ def parse_vcf_check_report(self, vcf_check_report):

def parse_sv_check_log(self, sv_check_log):
    """Return the structural-variant count recorded in an SV check log.

    The count is taken from the first whitespace-separated token of the
    log's first line. A log whose first line is empty or blank (including a
    completely empty file) is reported as 0 instead of failing.

    :param sv_check_log: path to the SV check log file
    :return: number of structural variant lines as an int
    :raises ValueError: if the first token of the first line is not an integer
    """
    with open(sv_check_log) as open_file:
        first_line_tokens = open_file.readline().split()
    # str.split() with no separator returns [] for empty/blank input, so an
    # empty log is detected here rather than by an IndexError.
    if not first_line_tokens:
        return 0
    return int(first_line_tokens[0])

def _generate_csv_mappings(self):
vcf_files_mapping_csv = os.path.join(self.eload_dir, 'validation_vcf_files_mapping.csv')
with open(vcf_files_mapping_csv, 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(['vcf', 'fasta', 'report', 'assembly'])
writer.writerow(['vcf', 'fasta', 'report', 'assembly_accession'])
analyses = self.eload_cfg.query('submission', 'analyses')
for analysis_alias, analysis_data in analyses.items():
fasta = analysis_data['assembly_fasta']
Expand Down Expand Up @@ -294,7 +297,6 @@ def _collect_validation_workflow_results(self, output_dir, validation_tasks):
if 'naming_convention_check' in validation_tasks:
self._collect_naming_convention_check_results(vcf_files, output_dir)


def _collect_vcf_check_results(self, vcf_files, output_dir):
total_error = 0
# detect output files for vcf check
Expand Down Expand Up @@ -388,23 +390,22 @@ def _collect_assembly_check_results(self, vcf_files, output_dir):
def _collect_structural_variant_check_results(self, vcf_files, output_dir):
# detect output files for structural variant check
for vcf_file in vcf_files:
vcf_name = os.path.basename(vcf_file)
vcf_name, ext = os.path.splitext(os.path.basename(vcf_file))

tmp_sv_check_log = resolve_single_file_path(
os.path.join(output_dir, 'sv_check', vcf_name + '_sv_check.log')
)
tmp_sv_check_sv_vcf = resolve_single_file_path(
os.path.join(output_dir, 'sv_check', vcf_name + '_sv_list.vcf')
os.path.join(output_dir, 'sv_check', vcf_name + '_sv_list.vcf.gz')
)

# move the output files
sv_check_log = self._move_file(
tmp_sv_check_log,
os.path.join(self._get_dir('sv_check'), vcf_name + '_sv_check.log')
)
sv_check_sv_vcf = self._move_file(
tmp_sv_check_sv_vcf,
os.path.join(self._get_dir('sv_check'), vcf_name + '_sv_list.vcf')
os.path.join(self._get_dir('sv_check'), vcf_name + '_sv_list.vcf.gz')
)

if sv_check_log and sv_check_sv_vcf:
Expand All @@ -416,8 +417,7 @@ def _collect_structural_variant_check_results(self, vcf_files, output_dir):
def _collect_naming_convention_check_results(self, vcf_files, output_dir):
naming_conventions = set()
for vcf_file in vcf_files:
vcf_name = os.path.basename(vcf_file)

vcf_name, ext = os.path.splitext(os.path.basename(vcf_file))
tmp_nc_check_yml = resolve_single_file_path(
os.path.join(output_dir, 'naming_convention_check', vcf_name + '_naming_convention.yml')
)
Expand All @@ -430,8 +430,8 @@ def _collect_naming_convention_check_results(self, vcf_files, output_dir):
with open(nc_check_yml) as open_yaml:
data = yaml.safe_load(open_yaml)
self.eload_cfg.set('validation', 'naming_convention_check', 'files', os.path.basename(vcf_file),
value=data)
naming_conventions.add(data['naming_convention'])
value=data[0])
naming_conventions.add(data[0]['naming_convention'])
if len(naming_conventions) == 1:
self.eload_cfg.set('validation', 'naming_convention_check', 'naming_convention',
value=naming_conventions.pop())
Expand Down Expand Up @@ -551,15 +551,17 @@ def _structural_variant_check_report(self):
return '\n'.join(reports)

def _naming_convention_check_report(self):
    """Build the text report for the naming convention check.

    Reads the per-file results stored under
    ``validation -> naming_convention_check -> files`` (a mapping of VCF file
    name to its result dict). When results exist, the report starts with the
    overall naming convention, followed by one line per VCF file. For a file
    whose ``naming_convention`` is empty, an extra line shows its
    ``naming_convention_map`` so the unresolved contigs can be inspected.

    :return: report lines joined with newlines; an empty string when there
             are no per-file results
    """
    vcf_files_2_naming_conv = self.eload_cfg.query('validation', 'naming_convention_check', 'files')
    reports = []
    if vcf_files_2_naming_conv:
        overall = self.eload_cfg.query('validation', 'naming_convention_check', 'naming_convention')
        reports.append(f" * Naming convention: {overall}")
        for vcf_file, nc_result in vcf_files_2_naming_conv.items():
            reports.append(f" * {vcf_file}: {nc_result['naming_convention']}")
            if not nc_result['naming_convention']:
                # No single convention was resolved: expose the detailed map.
                reports.append(f" * {vcf_file}: {nc_result['naming_convention_map']}")
    return '\n'.join(reports)

def report(self):
Expand Down
15 changes: 7 additions & 8 deletions eva_submission/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -135,15 +135,14 @@ process detect_sv {
export PYTHONPATH="$params.executable.python.script_path"
$params.executable.python.interpreter -m eva_submission.steps.structural_variant_detection \
--vcf_file $vcf_file --output_vcf_file_with_sv sv_check/${vcf_file.getSimpleName()}_sv_list.vcf \
> sv_check/${vcf_file.getSimpleName()}_sv_check.log 2>&1
$params.executable.bgzip -c sv_check/${vcf_file.getSimpleName()}_sv_list.vcf > sv_check/${vcf_file.getSimpleName()}_sv_list.vcf.gz
rm sv_check/${vcf_file.getSimpleName()}_sv_list.vcf
--vcf_file $vcf_file --output_vcf_file_with_sv sv_check/${vcf_file.getBaseName()}_sv_list.vcf \
> sv_check/${vcf_file.getBaseName()}_sv_check.log 2>&1
$params.executable.bgzip -c sv_check/${vcf_file.getBaseName()}_sv_list.vcf > sv_check/${vcf_file.getBaseName()}_sv_list.vcf.gz
rm sv_check/${vcf_file.getBaseName()}_sv_list.vcf
"""
}



/*
* Detect the naming convention in VCF
*/
Expand All @@ -155,7 +154,7 @@ process detect_naming_convention {
mode: "copy"

input:
tuple path(vcf_file), accession
tuple path(vcf_file), val(accession)

output:
path "naming_convention_check/*_naming_convention.yml", emit: nc_check_yml
Expand All @@ -165,7 +164,7 @@ process detect_naming_convention {
mkdir -p naming_convention_check
export PYTHONPATH="$params.executable.python.script_path"
$params.executable.python.interpreter -m eva_submission.steps.detect_contigs_naming_convention.py \
--vcf_files $vcf_file --assembly_accession $accession --output_yaml naming_convention_check/${vcf_file.getSimpleName()}_naming_convention.yml
$params.executable.python.interpreter -m eva_submission.steps.detect_contigs_naming_convention \
--vcf_files $vcf_file --assembly_accession $accession --output_yaml naming_convention_check/${vcf_file.getBaseName()}_naming_convention.yml
"""
}
8 changes: 4 additions & 4 deletions eva_submission/steps/detect_contigs_naming_convention.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, assembly_accession):

def naming_convention_map_for_vcf(self, input_vcf):
"""Provides a set of contigs names present in the VCF file for each compatible naming convention"""
naming_convention_map = defaultdict(list)
naming_convention_map = defaultdict(set)
if input_vcf.endswith('.gz'):
vcf_in = gzip.open(input_vcf, mode="rt")
else:
Expand All @@ -50,8 +50,8 @@ def naming_convention_map_for_vcf(self, input_vcf):
if line.startswith("#"):
continue
contig_name = line.split('\t')[0]
naming_convention_map[self.get_contig_convention(contig_name)].append(contig_name)
return dict(naming_convention_map)
naming_convention_map[self.get_contig_convention(contig_name)].add(contig_name)
return dict((nc, list(sorted(set_contig))) for nc, set_contig in naming_convention_map.items())

@cached_property
def _contig_conventions_map(self):
Expand All @@ -76,7 +76,7 @@ def write_convention_map_to_yaml(self, vcf_files, output_yaml):
results = []
for input_vcf in vcf_files:
naming_convention_to_contigs = self.naming_convention_map_for_vcf(input_vcf)
if len(naming_convention_to_contigs) == 1:
if len(naming_convention_to_contigs) == 1 and 'Not found' not in naming_convention_to_contigs:
naming_convention = list(naming_convention_to_contigs)[0]
naming_convention_map = None
else:
Expand Down
5 changes: 1 addition & 4 deletions eva_submission/steps/structural_variant_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ def detect_structural_variant(vcf_file, output_vcf):
ctx = gzip.open(vcf_file, mode="rt")
else:
ctx = open(vcf_file, mode="r")
has_sv_in_vcf = False
with ctx as open_input, open(output_vcf, 'w') as open_output:
for line in open_input:
if line.startswith("#"):
Expand All @@ -33,10 +32,8 @@ def detect_structural_variant(vcf_file, output_vcf):
if re.search(sv_regex, alternate_allele):
open_output.write(line)
nb_sv += 1
has_sv_in_vcf = True
break
if has_sv_in_vcf:
print(f'{nb_sv} lines containing structural variants')
print(f'{nb_sv} lines containing structural variants')


def main():
Expand Down
4 changes: 2 additions & 2 deletions tests/nextflow-tests/run_tests_validation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ cd ${SCRIPT_DIR}
printf "\e[32m===== VALIDATION PIPELINE =====\e[0m\n"
nextflow run "${SOURCE_DIR}/validation.nf" -params-file test_validation_config.yaml

ls output/sv_check/test1_sv_check.log \
output/sv_check/test1_sv_list.vcf.gz \
ls output/sv_check/test1.vcf_sv_check.log \
output/sv_check/test1.vcf_sv_list.vcf.gz \
output/assembly_check/test1.vcf.gz.assembly_check.log \
output/assembly_check/test1.vcf.gz.text_assembly_report \
output/assembly_check/test1.vcf.gz.valid_assembly_report
Expand Down
7 changes: 3 additions & 4 deletions tests/resources/eloads/ELOAD_2/.ELOAD_2_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ validation:
pass: true
naming_convention: enaSequenceName
files:
- {
vcf_file: test.vcf,
naming_convention: enaSequenceName
}
test.vcf:
vcf_file: test.vcf
naming_convention: enaSequenceName

0 comments on commit 5e669ac

Please sign in to comment.