add sv_vcf option for GroupedAnalysis

AmpliconSuite · Nov 26, 2024 · 8d0df6c · 8d0df6c
1 parent a3ab956
commit 8d0df6c
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 5 deletions.
diff --git a/GroupedAnalysisAmpSuite.py b/GroupedAnalysisAmpSuite.py
@@ -116,7 +116,7 @@ def create_AA_AC_cmds(tumor_lines, base_argstring, grouped_seeds, parent_odir):
         curr_argstring = "{} --run_AA --run_AC -s {} --bam {} --bed {} -o {}".format(base_argstring, tf[0], tf[1],
                                                                                curr_seeds, odir)
 
-        optionals = zip(["--sample_metadata", ], tf[4:])
+        optionals = zip(["--sample_metadata", "--sv_vcf"], tf[4:])
         for k, v in optionals:
             if v:
                 curr_argstring += " {} {}".format(k, v)
@@ -144,7 +144,7 @@ def create_CNV_cmds(tumor_lines, normal_lines, base_argstring, cnvkit_dir, paren
         if normalbam:
             curr_argstring += " --normal_bam {}".format(normalbam[1])
 
-        optionals = zip(["--cnv_bed", "--sample_metadata"], tf[3:])
+        optionals = zip(["--cnv_bed", "--sample_metadata", "--sv_vcf"], tf[3:])
         for k, v in optionals:
             if v:
                 curr_argstring += " {} {}".format(k, v)
@@ -188,8 +188,9 @@ def read_group_data(input_file):
     sample_name  bam_file  sample_type
     where 'sample_type' is either 'tumor' or 'normal'
     additional optional fields are as follows:
-    cnv_calls  sample_metadata_json
+    cnv_calls  sample_metadata_json  sv_calls
     """
+    data_len = 6
     tumor_lines = []
     normal_lines = []
     seen_names = set ()
@@ -211,6 +212,9 @@ def read_group_data(input_file):
                 if v.upper() == "NA" or v.upper() == "NONE" or v.upper() == "":
                     fields[ind] = None
 
+            if len(fields) < data_len:
+                fields.extend([None] * (data_len - len(fields)))
+
             if fields[2].lower() == "tumor":
                 tumor_lines.append(fields)
 
@@ -222,6 +226,21 @@ def read_group_data(input_file):
                                  "group input formatting instructions.\n\n")
                 sys.exit(1)
 
+            if fields[3] and not any(fields[3].endswith(x) for x in ['.bed', '.cns']):
+                sys.stderr.write("Input formatting error! Column 4 (CNV calls) must either be 'NA' or a .bed or .cns file.\nSee README for "
+                                 "group input formatting instructions.\n\n")
+                sys.exit(1)
+
+            elif fields[4] and not fields[4].endswith('.json'):
+                sys.stderr.write("Input formatting error! Column 5 (Sample metadata json) must either be 'NA' or a .json file.\nSee README for "
+                                 "group input formatting instructions.\n\n")
+                sys.exit(1)
+
+            elif fields[5] and not fields[5].endswith('.vcf'):
+                sys.stderr.write("Input formatting error! Column 6 (external SV calls) must either be 'NA' or a .vcf file.\nSee README for "
+                                 "group input formatting instructions.\n\n")
+                sys.exit(1)
+
             if fields[0] in seen_names:
                 sys.stderr.write("Duplicate sample name {} in .input file! Sample names must be unique.\n".format(fields[0]))
                 sys.exit(1)

diff --git a/README.md b/README.md
@@ -281,6 +281,10 @@ Otherwise, you will instead need these arguments below:
 
 - `--samtools_path`: Path to a specific samtools binary for use (e.g., /path/to/my/samtools). Uses samtools on system path by default.
 
+- `--sv_vcf`: Provide a VCF file of externally-called SVs to augment SVs identified by AA internally.
+
+- `--sv_vcf_no_filter`: Use all external SV calls from the `--sv_vcf` argument, even those without 'PASS' in the FILTER column.
+
 
 ## Interpreting classification outputs
 - Information about the amplicon classification files produced at the end of the workflow are available [here](https://github.com/AmpliconSuite/AmpliconClassifier?tab=readme-ov-file#3-outputs).
@@ -319,9 +323,9 @@ For samples derived from a common origin (longitudinal, multiregional sampling f
 between runs. We provide a script `GroupedAnalysisAmpSuite.py` which automates this analysis. `GroupedAnalysisAmpSuite.py` takes almost all the same arguments as `PrepareAA.py`, 
 however it requires an additional input file, listing the inputs. This file is to be formatted as follows
 
-`sample_name` `bamfile` `"tumor"/"normal"` `[CNV_calls]` `[sample_metadata_json]`
+`sample_name` `bamfile` `"tumor"|"normal"` `[CNV_calls.bed]` `[sample_metadata.json]` `[SV_calls.vcf]`
 
-Where `CNV_calls` and `sample_metadata_json` are optional. All samples listed in each file should be uniquely named and from the same group of related samples. Do not include different collections of related samples in the same table - make different tables. However, they are positional, so if `CNV_calls` is skipped, it should be set as either `NA` or `None`.
+Where `CNV_calls.bed`, `sample_metadata.json`, and `SV_calls.vcf` are all optional. All samples listed in each file should be uniquely named and from the same group of related samples. Do not include different collections of related samples in the same table - make different tables. However, the optional fields are positional, so if an optional field (e.g. `CNV_calls.bed`) is skipped but a later optional field is provided, the skipped field should be set as either `NA` or `None`.
 
 AA and AC will be run by default, but can be disabled with `--no_AA`.