- Change static data structure in config to separate global and array…

… specific files - move all default global files to the cache dir - allow multiple array definitions - Add Array_Name column to sampletable - Add xlsx support for sampletable
bihealth · Oct 21, 2024 · 08e351f · 08e351f
1 parent 3dd881d
commit 08e351f
Show file tree

Hide file tree

Showing 38 changed files with 1,267 additions and 825 deletions.
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ For now, only installation 'from source' is possible:
 
 1. Clone this git repository
 2. *optional, but recommended* Create a new environment, i.e. `conda create -n stemcnv-check python=3.12`, then activate it
+   - Note: on some systems like WSL you may also need: `apptainer` and `gcc_linux-64` (<14 for recent datrie issue)
 3. Install dependencies and the stemcnv-check script using pip `pip install -e .`. For development, use `pip install -e .[all]`
 4. All further runtime dependencies (conda environments and docker containers) will be pulled automatically by snakemake when running the analysis
 
@@ -23,12 +24,12 @@ For now, only installation 'from source' is possible:
 StemCNV-check requires a sample table and a config file to run. Example files can be created using `stemcnv-check setup-files`.
 
 The sample table (default: sample_table.tsv) is a tab-separated file describing all samples to be analyzed:
-- Required columns: Sample_ID, Chip_Name, Chip_Pos, Sex, Reference_Sample
+- Required columns: Sample_ID, Chip_Name, Chip_Pos, Array_Name, Sex, Reference_Sample
 - Optional columns (reserved): Sample_Name, Regions_of_Interest
 - See the `sample_table_example.tsv` file (or the sample_table.tsv created by the setup-files command) for a description of individual columns
 
 The config file (default: config.yaml) defines all settings for the analysis and inherits from the inbuilt default.  
-Required settings that are not defined by default include static files specific to the used array platform and genome build:
+Required settings that are not defined by default include array definition files specific to the used array platform and genome build:
 - egt_cluster_file: the illumina cluster file (.egt) for the array platform, available from Illumina or the provider running the array 
 - bpm_manifest_file: the beadpool manifest file (.bpm) for the array platform, available from Illumina or the provider running the array
 - csv_manifest_file (optional): the manifest file in csv format, available from Illumina or the provider running the array
@@ -42,7 +43,7 @@ Additionally, the config file needs to define the following paths:
 ## Usage
 
 Before the first analysis, the sample table and config file need to be set up (see above). 
-Unless otherwise specified, stemcnv-check defaults to look for a "sample_table.tsv" and "config.yaml" file.  
+Unless otherwise specified, stemcnv-check defaults to looking for a "sample_table.tsv" (or .xlsx) and a "config.yaml" file.  
 
 Automatic generation of the additional array & genome-build specific static files can only be done if sample data for 
 that array is available.  
@@ -77,8 +78,6 @@ Run the example data:
 StemCNV-check will produce the following output files for each sample, when run with default settings:
 - `data_path/{sample}/{sample}.annotated-SNP-data.{filter}-filter.vcf.gz`  
   The filtered, processed and annotated SNP data of the array in vcf format
-- `data_path/{sample}/{sample}.stats.txt`  
-  The CNV calls for the sample GenCall stat
 - `data_path/{sample}/{sample}.CNV_calls.CBS.vcf.gz`  
   The CNV calls for the sample from the CBS (Circular Binary Segmentation) algorithm in vcf format
 - `data_path/{sample}/{sample}.CNV_calls.PennCNV.vcf.gz`  

diff --git a/example_data/config.yaml b/example_data/config.yaml
@@ -1,16 +1,13 @@
-static_data:
-    bpm_manifest_file: 'static-data/GSAMD-24v3-0-EA_20034606_A1.bpm'
-    egt_cluster_file: 'static-data/GSAMD-24v3-0-EA_20034606_A1.egt'
-    csv_manifest_file: 'static-data/GSAMD-24v3-0-EA_20034606_A1.csv.gz'
-    genome_gtf_file: static-data/gencode.hg19.v45.gtf.gz
-    penncnv_GCmodel_file: static-data/PennCNV-GCmodel_hg19_ExampleArray.gcmodel
-    penncnv_pfb_file: static-data/PennCNV-PFB_hg19_ExampleArray.pfb
-    array_density_file: static-data/density_hg19_ExampleArray.bed
-    array_gaps_file: static-data/gaps_hg19_ExampleArray.bed
-    genomeInfo_file: static-data/UCSC_hg19_chromosome-info.tsv
-
-genome_version: 'hg19'
-array_name: 'ExampleArray'
+array_definition:
+  ExampleArray:
+    genome_version: 'hg19'
+    bpm_manifest_file: 'static-data/ExampleArray/GSAMD-24v3-0-EA_20034606_A1.bpm'
+    egt_cluster_file: 'static-data/ExampleArray/GSAMD-24v3-0-EA_20034606_A1.egt'
+    csv_manifest_file: 'static-data/ExampleArray/GSAMD-24v3-0-EA_20034606_A1.csv.gz'
+    penncnv_GCmodel_file: static-data/ExampleArray/PennCNV-GCmodel_hg19_ExampleArray.gcmodel
+    penncnv_pfb_file: static-data/ExampleArray/PennCNV-PFB_hg19_ExampleArray.pfb
+    array_density_file: static-data/ExampleArray/density_hg19_ExampleArray.bed
+    array_gaps_file: static-data/ExampleArray/gaps_hg19_ExampleArray.bed
 
 raw_data_folder: 'RAW'
 data_path: data

diff --git a/example_data/sample_table.tsv b/example_data/sample_table.tsv
@@ -1,7 +1,7 @@
-Sample_Name	Chip_Name	Chip_Pos	Sample_ID	Sex	Reference_Sample
-HG001/NA12878	207521920117	R09C02	HG001	female	
-HG002/NA24385	207521920117	R05C02	HG002	male	
-HG004/NA24143	207521920117	R07C02	HG004	female	
-HG005/NA24631	207521920117	R01C02	HG005	male	HG006
-HG006/NA24694	207521920117	R03C02	HG006	male	
-HG007/NA24695	207521920117	R11C02	HG007	female	
+Sample_ID	Chip_Name	Chip_Pos	Array_Name	Sex	Reference_Sample	Sample_Name
+HG001	207521920117	R09C02	ExampleArray	female		HG001/NA12878
+HG002	207521920117	R05C02	ExampleArray	male		HG002/NA24385
+HG004	207521920117	R07C02	ExampleArray	female		HG004/NA24143
+HG005	207521920117	R01C02	ExampleArray	male	HG006	HG005/NA24631
+HG006	207521920117	R03C02	ExampleArray	male		HG006/NA24694
+HG007	207521920117	R11C02	ExampleArray	female		HG007/NA24695
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
     "loguru ~=0.7.2",
     # native excel & tsv support
     "pandas ~=2.2",
+    "XlsxWriter ~=3.2",
     # Config validation & comparison
     "pydantic ~=2.8"  , 
     "deepdiff ~=8.0"

diff --git a/stemcnv_check/__main__.py b/stemcnv_check/__main__.py
@@ -31,38 +31,48 @@ def setup_argparse():
     group_basic = parser.add_argument_group("General", "General pipeline arguments")
 
     group_basic.add_argument('--config', '-c', default='config.yaml', help="Filename of config file. Default: %(default)s")
-    group_basic.add_argument('--sample-table', '-s', default='sample_table.tsv', help="Filename of sample table. Default: %(default)s")
+    group_basic.add_argument('--sample-table', '-s', default=None,
+                             help="Filename of sample table, can be tsv or xlsx format (1st sheet is read). "
+                                  "Default: sample_table.tsv or sample_table.xlsx")
+    group_basic.add_argument('--column-remove-regex', nargs='?', const=r'\s.*', type=str,
+                             help="Regex to remove text from sample table column names (before looking for required columns)."
+                                  " Not used by default, default if not regex given ' .*' (remove spaces and everything following a space)")
+    # group_basic.add_argument('--required-col-indices', default=None, nargs=6,
+    #                             help="Indexes of the required columns in the sample table. "
+    #                                  "Required columns are: Sample_ID, Chip_Name, Chip_Pos, Array_Name, Sex, Reference_Sample. "
+    #                                  "This will set the corresponding names to the specified columns")
     group_basic.add_argument('--directory', '-d', default=None,
                              help="Directory to run pipeline in. Default: current directory")
     group_basic.add_argument('--verbose', '-v', action='count', default=0,
                              help="More verbose output, maximum verbosity at -vv")
 
     group_setupfiles = parser.add_argument_group("setup-files", "Details for setup-files")
     group_setupfiles.add_argument('--config-details', default='minimal', choices=('minimal', 'medium', 'advanced', 'complete'), help="Level of detail for the config file. Default: %(default)s")
+    group_setupfiles.add_argument('--sampletable-format', default='tsv', choices=('tsv', 'xlsx'), help="Format of the sample table. Default: %(default)s")
     group_setupfiles.add_argument('--overwrite', action='store_true', help="Allow overwriting of existing files")
 
     group_static = parser.add_argument_group("make-staticdata", "Details and file naming for make-staticdata")
-    group_static.add_argument('--edit-config-inplace', action='store_true', help = "Edit the config file in place with updated static-data entries")
-    group_static.add_argument('--penncnv-pfb-file', default='static-data/PennCNV-PFB_{genome}_{array}.pfb',
-                               help="Filename for generated PFB file. Default: %(default)s")
-    group_static.add_argument('--penncnv-gcmodel-file', default='static-data/PennCNV-GCmodel_{genome}_{array}.gcmodel',
-                               help="Filename for generated GCmodel file. Default: %(default)s")
-    group_static.add_argument('--array-density-file', default='static-data/density_{genome}_{array}.bed',
-                               help="Filename for generated bed file with probe density. Default: %(default)s")
-    group_static.add_argument('--array-gaps-file', default='static-data/gaps_{genome}_{array}.bed',
-                              help="Filename for generated bed file with probe gaps. Default: %(default)s")
-    group_static.add_argument('--genomeinfo-file', default='static-data/UCSC_{genome}_chromosome-info.tsv',
-                               help="Filename for generated chromosome info file. Default: %(default)s")
-    group_static.add_argument('--genome-gtf-file', default='static-data/gencode.{genome}.v45.gtf.gz',
-                               help="Filename for generated chromosome info file. Default: %(default)s")
+    group_static.add_argument('--no-edit-inplace', action='store_true', help = "Do not edit the config file in place with updated static-data entries")
+    # group_static.add_argument('--penncnv-pfb-file', default='static-data/PennCNV-PFB_{genome}_{array}.pfb',
+    #                            help="Filename for generated PFB file. Default: %(default)s")
+    # group_static.add_argument('--penncnv-gcmodel-file', default='static-data/PennCNV-GCmodel_{genome}_{array}.gcmodel',
+    #                            help="Filename for generated GCmodel file. Default: %(default)s")
+    # group_static.add_argument('--array-density-file', default='static-data/density_{genome}_{array}.bed',
+    #                            help="Filename for generated bed file with probe density. Default: %(default)s")
+    # group_static.add_argument('--array-gaps-file', default='static-data/gaps_{genome}_{array}.bed',
+    #                           help="Filename for generated bed file with probe gaps. Default: %(default)s")
+    # group_static.add_argument('--genomeinfo-file', default='static-data/UCSC_{genome}_chromosome-info.tsv',
+    #                            help="Filename for generated chromosome info file. Default: %(default)s")
+    # group_static.add_argument('--genome-gtf-file', default='static-data/gencode.{genome}.v45.gtf.gz',
+    #                            help="Filename for generated chromosome info file. Default: %(default)s")
 
     group_snake = parser.add_argument_group("Snakemake Settings", "Arguments for Snakemake (also affects make-staticdata)")
     group_snake.add_argument('--cache-path', default=None,
-                             help="Override auto-selection of the cache path to a specific directory."
+                             help="Override auto-selection of the cache path to a specific directory. The default cache path is defined in the conifg file."
                              )
     group_snake.add_argument('--no-cache', action='store_true',
-                             help="Do not use the a chache directory. The cache is used for workflow created metadata "
-                             "(conda envs, singularity images, and VEP data). The default cache path is ~/.")
+                             help="Do not use a chache directory. The cache is used for workflow created metadata "
+                             "(conda envs, singularity images, and VEP data). The default cache path is defined in the conifg file.")
 
     group_snake.add_argument('--target', '-t', default='complete',
                              choices=('complete', 'report', 'summary-tables', 'combined-cnv-calls', 'PennCNV', 'CBS', 'SNP-data'),
@@ -90,13 +100,25 @@ def main(argv=None):
                 diagnose=args.verbose > 1,
                 )
 
+    if args.sample_table is None and args.action != 'setup-files':
+        if os.path.isfile('sample_table.tsv'):
+            args.sample_table = 'sample_table.tsv'
+        elif os.path.isfile('sample_table.xlsx'):
+            args.sample_table = 'sample_table.xlsx'
+        else:
+            logging.error("No default sample table found (sample_table.tsv or sample_table.xlsx). "
+                          "Please create a sample table (i.e. stemcnv-check setup-files) or specify one with --sample-table")
+            raise FileNotFoundError("No sample table found")
+
     if args.action == 'run':
-        check_sample_table(args.sample_table, args.config)
-        check_config(args.config, args.sample_table)
+        check_sample_table(args.sample_table, args.config, args.column_remove_regex)
+        check_config(args.config, args.sample_table, args.column_remove_regex)
         if args.directory is not None and not os.path.isdir(args.directory):
             os.makedirs(args.directory)
         ret = run_stemcnv_check_workflow(args)
     elif args.action == 'setup-files':
+        if not args.sample_table:
+            args.sample_table = 'sample_table.tsv' if args.sampletable_format == 'tsv' else 'sample_table.xlsx'
         ret = setup_control_files(args)
     elif args.action == 'make-staticdata':
         if args.directory is not None and not os.path.isdir(args.directory):