Skip to content

Commit

Permalink
add sv_vcf option for GroupedAnalysis
Browse files Browse the repository at this point in the history
  • Loading branch information
jluebeck committed Nov 26, 2024
1 parent a3ab956 commit 8d0df6c
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 5 deletions.
25 changes: 22 additions & 3 deletions GroupedAnalysisAmpSuite.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def create_AA_AC_cmds(tumor_lines, base_argstring, grouped_seeds, parent_odir):
curr_argstring = "{} --run_AA --run_AC -s {} --bam {} --bed {} -o {}".format(base_argstring, tf[0], tf[1],
curr_seeds, odir)

optionals = zip(["--sample_metadata", ], tf[4:])
optionals = zip(["--sample_metadata", "--sv_vcf"], tf[4:])
for k, v in optionals:
if v:
curr_argstring += " {} {}".format(k, v)
Expand Down Expand Up @@ -144,7 +144,7 @@ def create_CNV_cmds(tumor_lines, normal_lines, base_argstring, cnvkit_dir, paren
if normalbam:
curr_argstring += " --normal_bam {}".format(normalbam[1])

optionals = zip(["--cnv_bed", "--sample_metadata"], tf[3:])
optionals = zip(["--cnv_bed", "--sample_metadata", "--sv_vcf"], tf[3:])
for k, v in optionals:
if v:
curr_argstring += " {} {}".format(k, v)
Expand Down Expand Up @@ -188,8 +188,9 @@ def read_group_data(input_file):
sample_name bam_file sample_type
where 'sample_type' is either 'tumor' or 'normal'
additional optional fields are as follows:
cnv_calls sample_metadata_json
cnv_calls sample_metadata_json sv_calls
"""
data_len = 6
tumor_lines = []
normal_lines = []
seen_names = set ()
Expand All @@ -211,6 +212,9 @@ def read_group_data(input_file):
if v.upper() == "NA" or v.upper() == "NONE" or v.upper() == "":
fields[ind] = None

if len(fields) < data_len:
fields.extend([None] * (data_len - len(fields)))

if fields[2].lower() == "tumor":
tumor_lines.append(fields)

Expand All @@ -222,6 +226,21 @@ def read_group_data(input_file):
"group input formatting instructions.\n\n")
sys.exit(1)

if fields[3] and not any(fields[3].endswith(x) for x in ['.bed', '.cns']):
sys.stderr.write("Input formatting error! Column 4 (CNV calls) must either be 'NA' or a .bed or .cns file.\nSee README for "
"group input formatting instructions.\n\n")
sys.exit(1)

elif fields[4] and not fields[4].endswith('.json'):
sys.stderr.write("Input formatting error! Column 5 (Sample metadata json) must either be 'NA' or a .json file.\nSee README for "
"group input formatting instructions.\n\n")
sys.exit(1)

elif fields[5] and not fields[5].endswith('.vcf'):
sys.stderr.write("Input formatting error! Column 6 (external SV calls) must either be 'NA' or a .vcf file.\nSee README for "
"group input formatting instructions.\n\n")
sys.exit(1)

if fields[0] in seen_names:
sys.stderr.write("Duplicate sample name {} in .input file! Sample names must be unique.\n".format(fields[0]))
sys.exit(1)
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,10 @@ Otherwise, you will instead need these arguments below:
- `--samtools_path`: Path to a specific samtools binary for use (e.g., /path/to/my/samtools). Uses samtools on system path by default.
- `--sv_vcf`: Provide a VCF file of externally-called SVs to augment SVs identified by AA internally.
- `--sv_vcf_no_filter`: Use all external SV calls from the --sv_vcf arg, even those without 'PASS' in the FILTER column.
## Interpreting classification outputs
- Information about the amplicon classification files produced at the end of the workflow is available [here](https://github.com/AmpliconSuite/AmpliconClassifier?tab=readme-ov-file#3-outputs).
Expand Down Expand Up @@ -319,9 +323,9 @@ For samples derived from a common origin (longitudinal, multiregional sampling f
between runs. We provide a script `GroupedAnalysisAmpSuite.py` which automates this analysis. `GroupedAnalysisAmpSuite.py` takes almost all the same arguments as `PrepareAA.py`,
however, it requires an additional input file listing the inputs. This file is to be formatted as follows:
`sample_name` `bamfile` `"tumor"/"normal"` `[CNV_calls]` `[sample_metadata_json]`
`sample_name` `bamfile` `"tumor"|"normal"` `[CNV_calls.bed]` `[sample_metadata.json]` `[SV_calls.vcf]`
Where `CNV_calls` and `sample_metadata_json` are optional. All samples listed in each file should be uniquely named and from the same group of related samples. Do not include different collections of related samples in the same table - make different tables. However, they are positional, so if `CNV_calls` is skipped, it should be set as either `NA` or `None`.
Where `CNV_calls.bed`, `sample_metadata.json`, and `SV_calls.vcf` are all optional. All samples listed in each file should be uniquely named and from the same group of related samples. Do not include different collections of related samples in the same table - make different tables. However, the optional fields are positional, so if an optional field (e.g. `CNV_calls.bed`) is skipped, it should be set as either `NA` or `None`.
AA and AC will be run by default, but can be disabled with `--no_AA`.
Expand Down

0 comments on commit 8d0df6c

Please sign in to comment.