feat: working bclconvert rule
Ryan Routsong committed Dec 6, 2023
1 parent 01abf1f commit fb080cf
Showing 8 changed files with 109 additions and 45 deletions.
2 changes: 1 addition & 1 deletion scripts/config.py
@@ -93,7 +93,7 @@ def get_resource_config():

def base_config(keys=None, qc=True):
base_keys = ('runs', 'run_ids', 'project', 'rnums', 'bcl_files', \
'sample_sheet', 'samples', 'sids', 'out_to', 'demux_input_dir')
'sample_sheet', 'samples', 'sids', 'out_to', 'demux_input_dir', 'bclconvert')
this_config = {k: [] for k in base_keys}
this_config['resources'] = get_resource_config()
this_config['runqc'] = qc
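The practical effect of adding 'bclconvert' to base_keys is that every base config now carries an empty per-run list which weave fills with one boolean per run directory. A minimal sketch of that shape (illustrative only, not the repository's code):

    # Sketch: base_config() now seeds 'bclconvert' alongside the other per-run lists.
    base_keys = ('runs', 'run_ids', 'project', 'rnums', 'bcl_files',
                 'sample_sheet', 'samples', 'sids', 'out_to', 'demux_input_dir', 'bclconvert')
    this_config = {k: [] for k in base_keys}
    this_config['bclconvert'].append(True)      # weave appends one is_bclconvert(...) result per run
    assert this_config['bclconvert'] == [True]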
11 changes: 9 additions & 2 deletions scripts/files.py
@@ -89,8 +89,15 @@ def is_dir_staged(server, run_dir):
"""
    filter check for whether a directory has the appropriate breadcrumbs
RTAComplete.txt - file transfer from instrument breadcrumb, CSV file with values:
Run Date, Run time, Instrument ID
CopyComplete.txt - file transfer from instrument breadcrumb, blank (won't be there on instruments != NextSeq2k)
RTAComplete.txt - sequencing breadcrumb, CSV file with values:
Run Date, Run time, Instrument ID
RunInfo.xml - XML metainformation
Fastq Markers for DRAGEN demultiplexing
<run directory>/Analysis/*/Data/fastq/*.fastq.gz
"""
analyzed_checks = [
Path(run_dir, 'RTAComplete.txt').exists(),
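The diff truncates the body of is_dir_staged after the first check; below is a minimal sketch of how the breadcrumb test described in the docstring might be completed. The function name, exact check list, and glob pattern are assumptions for illustration, not the repository's code.

    from pathlib import Path

    def is_dir_staged_sketch(run_dir):
        """Hypothetical check for the breadcrumbs listed in the docstring above."""
        run_dir = Path(run_dir)
        has_breadcrumbs = all([
            (run_dir / 'RTAComplete.txt').exists(),   # sequencing breadcrumb (CSV: Run Date, Run time, Instrument ID)
            (run_dir / 'RunInfo.xml').exists(),       # XML metainformation
        ])
        # CopyComplete.txt is only written by some instruments (e.g. NextSeq 2000), so it is not required here.
        # DRAGEN fastq markers, if present, indicate on-instrument demultiplexing already ran.
        has_dragen_fastqs = any(run_dir.glob('Analysis/*/Data/fastq/*.fastq.gz'))
        return has_breadcrumbs, has_dragen_fastqs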
38 changes: 25 additions & 13 deletions scripts/samplesheet.py
@@ -1,5 +1,6 @@
from csv import DictReader
from io import StringIO
from pathlib import Path
from datetime import datetime
from operator import itemgetter
from dateutil import parser as dateparser
@@ -22,6 +23,7 @@ class IllumniaSampleSheet():
"""
def __init__(self, samplesheet, end=None):
self.path = Path(samplesheet).absolute()
self.sheet = self.parse_sheet(samplesheet)
self.force_endedness = end

@@ -47,14 +49,19 @@ def parse_sheet(self, sheet):
if 'Reads' in sheet_sections:
self.process_simple_section(sheet_sections['Reads'])

if 'BCLConvert_Settings' in sheet_sections:
norm_names = {'AdapterRead1': 'Read01', 'AdapterRead2': 'Read02'}
self.process_simple_section(sheet_sections['BCLConvert_Settings'], rename=norm_names)

if 'Data' not in sheet_sections and 'BCLConvert_Data' in sheet_sections:
sheet_sections['Data'] = sheet_sections['BCLConvert_Data']

assert 'Data' in sheet_sections, 'No sample data within this sample sheet'
self.process_csv_section(sheet_sections['Data'])
filtered_data = list(filter(lambda x: len(set(x)) > 1, sheet_sections['Data']))
self.process_csv_section(filtered_data)


def process_simple_section(self, section):
def process_simple_section(self, section, rename=None):
"""Simple section processing for Illumnia sample sheet.
Objective:
@@ -76,6 +83,8 @@ def process_simple_section(self, section):
if index == 'Date':
setattr(self, index, dateparser.parse(second))
else:
if rename and index in rename:
index = rename[index]
setattr(self, index, second)
return

@@ -99,17 +108,24 @@ def project(self):
def instrument(self):
return getattr(self, 'Instrument', None)

@property
def platform(self):
return getattr(self, 'InstrumentPlatform', None)

@staticmethod
def intorlen(s):
"Cast to int if possible, otherwise get length"
try:
v = int(s)
except ValueError:
v = len(s)
return v

@property
def adapters(self):
r1 = getattr(self, 'Read01', None)
r2 = getattr(self, 'Read02', None)
return list(map(int, filter(None, [r1, r2])))

@property
def indexes(self):
i1 = getattr(self, 'Index01', None)
i2 = getattr(self, 'Index02', None)
return list(map(int, filter(None, [i1, i2])))
return list(map(self.intorlen, filter(None, [r1, r2])))

@property
def is_paired_end(self):
@@ -128,7 +144,3 @@ def is_single_end(self):
return False
else:
raise ValueError('Unknown endedness from sample sheet')




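Taken together, the samplesheet changes map v2 (BCLConvert) section keys onto the v1-style attribute names and let the adapters property handle adapter sequences as well as numbers. A small usage sketch of the rename mapping and the new intorlen helper (the adapter sequence below is just an example value, not taken from the repository):

    # Sketch of the behaviour added above; not the repository's test code.
    norm_names = {'AdapterRead1': 'Read01', 'AdapterRead2': 'Read02'}

    def intorlen(s):
        "Cast to int if possible, otherwise get length (mirrors IllumniaSampleSheet.intorlen)."
        try:
            return int(s)
        except ValueError:
            return len(s)

    row = ('AdapterRead1', 'CTGTCTCTTATACACATCT')
    attr = norm_names.get(row[0], row[0])        # stored as 'Read01', the v1-style attribute
    print(attr, intorlen(row[1]))                # Read01 19  (sequence -> its length)
    print(intorlen('151'))                       # numeric Reads entry -> 151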
16 changes: 16 additions & 0 deletions scripts/utils.py
@@ -174,6 +174,8 @@ def get_mounts(*extras):

if extras:
for extra in extras:
if ':' in str(extra):
extra = str(extra).split(':')[0]
if not Path(extra).exists():
raise FileNotFoundError(f"Can't mount {str(extra)}, it doesn't exist!")
mount_binds.extend(extras)
@@ -233,7 +235,13 @@ def exec_pipeline(configs, dry_run=False, local=False):
for i in range(0, len(configs['run_ids'])):
this_config = {k: (v[i] if k not in skip_config_keys else v) for k, v in configs.items() if v}
this_config.update(profile_config)

extra_to_mount = [this_config['out_to'], this_config['demux_input_dir']]
if this_config['bclconvert']:
bclcon_log_dir = Path(this_config['out_to'], "logs", "bclconvert_demux")
if not bclcon_log_dir.exists():
bclcon_log_dir.mkdir(mode=0o755, parents=True)
extra_to_mount.append(str(bclcon_log_dir) + ":" + "/var/log/bcl-convert:rw")
singularity_binds = get_mounts(*extra_to_mount)
config_file = Path(this_config['out_to'], '.config', f'config_job_{str(i)}.json').absolute()
json.dump(this_config, open(config_file, 'w'), cls=PathJSONEncoder, indent=4)
@@ -269,3 +277,11 @@ def exec_pipeline(configs, dry_run=False, local=False):
print(' '.join(map(str, this_cmd)))
exec_snakemake(this_cmd, local=local, dry_run=dry_run, env=top_env, cwd=str(Path(this_config['out_to']).absolute()))


def is_bclconvert(samplesheet):
BCLCONVERT_INSTRUMENTS = ('VH01716',)
BCLCONVERT_PLATFORMS = ('NextSeq1k2k',)
check = False
if samplesheet.instrument in BCLCONVERT_INSTRUMENTS or samplesheet.platform in BCLCONVERT_PLATFORMS:
check = True
return check
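With the get_mounts change, callers may pass full Singularity bind specs of the form src:dest:mode; only the host-side src is checked for existence before the spec is handed to the container runtime, which is what lets exec_pipeline bind the bclconvert log directory onto /var/log/bcl-convert. A rough sketch of that handling (the helper name and the /tmp paths below are made up):

    from pathlib import Path

    def check_bind_specs(*extras):
        """Hypothetical stand-in for the existence check inside get_mounts()."""
        for extra in extras:
            src = str(extra).split(':')[0]       # 'src:dest:mode' -> validate only the host path
            if not Path(src).exists():
                raise FileNotFoundError(f"Can't mount {src}, it doesn't exist!")
        return list(map(str, extras))

    out_to = Path('/tmp/weave_out')              # made-up output directory
    bclcon_log_dir = out_to / 'logs' / 'bclconvert_demux'
    bclcon_log_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
    print(check_bind_specs(f"{bclcon_log_dir}:/var/log/bcl-convert:rw"))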
12 changes: 7 additions & 5 deletions weave
@@ -14,19 +14,20 @@ def run(args):

for (rundir, run_infos) in runs:
sample_sheet = run_infos['samplesheet']
import ipdb; ipdb.set_trace()
sample_list = [
dict(sid=sample.Sample_ID+'_S'+str(i), r1_adapter=sample.Index, r2_adapter=sample.Index2)
for i, sample in enumerate(sample_sheet.samples, start=1)
]
project_list = list(set([_sample.Sample_Project for _sample in sample_sheet.samples]))

if len(project_list) > 1:
raise NotImplementedError("Unable to process multiple projects currently.\n" +
"Please file issue if this message is blocking: https://github.com/OpenOmics/weave/issues")

pairs = ['1', '2'] if sample_sheet.is_paired_end else ['1']

# ~~~ general run configuration ~~~
exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet))
exec_config['run_ids'].append(rundir.name)
exec_config['demux_input_dir'].append(rundir.resolve())
exec_config['sids'].append([x['sid'] for x in sample_list])
@@ -45,11 +46,12 @@ def run(args):
exec_config['samples'].append(sample_list)

# ~~~ output verification ~~~
files.valid_run_output(args.output, dry_run=args.dry_run)
exec_config['out_to'].append(str(Path(args.output).absolute()))

opdir = Path(args.output, rundir.name).absolute() \
if args.output is not None \
else Path(Path.cwd(), 'output').absolute()
files.valid_run_output(opdir, dry_run=args.dry_run)
exec_config['out_to'].append(opdir)

import ipdb; ipdb.set_trace()
utils.exec_pipeline(exec_config, dry_run=args.dry_run, local=args.local)


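For orientation, every exec_config key is a per-run list, so after this loop index i in each list refers to the same run directory and exec_pipeline can slice the lists back into one config per run. A compact sketch of that pairing (the run names and paths are invented):

    # Illustrative pairing of per-run lists; not the weave CLI itself.
    exec_config = {'run_ids': [], 'bclconvert': [], 'out_to': []}
    runs = [('230101_VH01716_0001', True), ('230102_A01234_0002', False)]

    for run_id, uses_bclconvert in runs:
        exec_config['run_ids'].append(run_id)
        exec_config['bclconvert'].append(uses_bclconvert)   # result of utils.is_bclconvert(sample_sheet)
        exec_config['out_to'].append(f'/data/output/{run_id}')

    for i in range(len(exec_config['run_ids'])):
        this_config = {k: v[i] for k, v in exec_config.items()}
        print(this_config['run_ids'], this_config['bclconvert'], this_config['out_to'])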
4 changes: 2 additions & 2 deletions workflow/Snakefile
@@ -104,7 +104,7 @@ bcl2fastq_outputs = flatten(
bclconvert_outputs = flatten(
[
# ~~ NextSeq2k demultiplexing ~~
expand("{out_to}/{rid}/{project}/{sids}_R{rnums}_001.fastq.gz", **demux_expand_args),
expand("{out_to}/demux/{rid}/{project}/{sids}_R{rnums}_001.fastq.gz", **demux_expand_args),
expand("{out_to}/demux/Reports/Demultiplex_Stats.csv", **demux_expand_args),
expand("{out_to}/demux/Reports/Adapter_Metrics.csv", **demux_expand_args),
expand("{out_to}/demux/.BC_DEMUX_COMPLETE", **demux_expand_args),
@@ -114,7 +114,7 @@ bclconvert_outputs = flatten(
if config['bclconvert']:
all_outputs = bclconvert_outputs
else:
all_outputs = bcl2fastq
all_outputs = bcl2fastq_outputs


if config["runqc"]:
66 changes: 46 additions & 20 deletions workflow/demux.smk
@@ -5,7 +5,7 @@ demux_expand_args = {
"rnums": config["rnums"],
"sids": config['sids'],
}

demux_noop_args = dict.fromkeys(demux_expand_args.keys(), [])

rule bcl2fastq:
"""
@@ -15,13 +15,13 @@ rule bcl2fastq:
"""
input:
run_dir = config['demux_input_dir'],
binary_base_calls = expand("{files}", files=config['bcl_files']),
samplesheets = expand("{run}/SampleSheet.csv", run=config['demux_input_dir'])
binary_base_calls = expand("{files}", files=config['bcl_files'] if not config['bclconvert'] else demux_noop_args),
samplesheets = expand("{run}/SampleSheet.csv", run=config['demux_input_dir'] if not config['bclconvert'] else demux_noop_args)
output:
seq_data = expand("{out_to}/demux/{rid}/{project}/{sids}_R{rnums}_001.fastq.gz", **demux_expand_args),
undetermined = expand("{out_to}/demux/Undetermined_S0_R{rnums}_001.fastq.gz", **demux_expand_args),
stats = expand("{out_to}/demux/Stats/Stats.json", **demux_expand_args),
breadcrumb = expand("{out_to}/demux/.B2F_DEMUX_COMPLETE", **demux_expand_args),
seq_data = expand("{out_to}/demux/{rid}/{project}/{sids}_R{rnums}_001.fastq.gz", **demux_expand_args if not config['bclconvert'] else demux_noop_args),
undetermined = expand("{out_to}/demux/Undetermined_S0_R{rnums}_001.fastq.gz", **demux_expand_args if not config['bclconvert'] else demux_noop_args),
stats = expand("{out_to}/demux/Stats/Stats.json", **demux_expand_args if not config['bclconvert'] else demux_noop_args),
breadcrumb = expand("{out_to}/demux/.B2F_DEMUX_COMPLETE", **demux_expand_args if not config['bclconvert'] else demux_noop_args),
params:
out_dir = config["out_to"] + "/demux/",
run = config["run_ids"],
@@ -46,33 +46,59 @@ rule bcl2fastq:

rule bclconvert:
"""
bcl-convert Version 00.000.000.3.6.3
Copyright (c) 2014-2018 Illumina, Inc.
bcl-convert Version 00.000.000.4.2.4
Run BCL Conversion (BCL directory to *.fastq.gz)
bcl-convert --bcl-input-directory <BCL_ROOT_DIR> --output-directory <PATH> [options]
--bcl-input-directory arg Input BCL directory for BCL conversion (must be specified)
--sample-sheet arg Path to SampleSheet.csv file (default searched for in --bcl-input-directory)
--first-tile-only arg Only convert first tile of input (for testing & debugging)
--bcl-sampleproject-subdirectories arg Output to subdirectories based upon sample sheet 'Sample_Project' column
--sample-name-column-enabled arg Use sample sheet 'Sample_Name' column when naming fastq files & subdirectories
--fastq-gzip-compression-level arg Set fastq output compression level 0-9 (default 1)
--bcl-num-parallel-tiles arg # of tiles to process in parallel (default 1)
--bcl-num-conversion-threads arg # of threads for conversion (per tile, default # cpu threads)
--bcl-num-compression-threads arg # of threads for fastq.gz output compression (per tile, default # cpu threads,
or HW+12)
--bcl-num-decompression-threads arg # of threads for bcl/cbcl input decompression (per tile, default half # cpu
threads, or HW+8. Only applies when preloading files)
--bcl-only-matched-reads arg For pure BCL conversion, do not output files for 'Undetermined' [unmatched]
reads (output by default)
--no-lane-splitting arg Do not split FASTQ file by lane (false by default)
"""
input:
run_dir = config['demux_input_dir'],
binary_base_calls = expand("{files}", files=config['bcl_files']),
samplesheets = expand("{run}/SampleSheet.csv", run=config['demux_input_dir'])
samplesheets = expand("{run}/RunInfo.xml", run=config['demux_input_dir'])
binary_base_calls = expand("{files}", files=config['bcl_files'] if config['bclconvert'] else demux_noop_args),
samplesheets = expand("{run}/SampleSheet.csv", run=config['demux_input_dir'] if config['bclconvert'] else demux_noop_args),
runinfo = expand("{run}/RunInfo.xml", run=config['demux_input_dir'] if config['bclconvert'] else demux_noop_args),
params:
out_dir = config["out_to"] + "/demux/",
run = config["run_ids"],
project = config["project"],
mv_dir = config["out_to"] + "/" + config["run_ids"] + "/" + config["project"]
mv_dir = config["out_to"] + "/demux/" + config["run_ids"] + "/" + config["project"]
output:
seq_data = expand("{out_to}/{rid}/{project}/{sids}_R{rnums}_001.fastq.gz", **demux_expand_args),
undetermined = expand("{out_to}/demux/Undetermined_S0_R{rnums}_001.fastq.gz", **demux_expand_args),
stats = expand("{out_to}/demux/Reports/Demultiplex_Stats.csv", **demux_expand_args),
metrics = expand("{out_to}/demux/Reports/Adapter_Metrics.csv", **demux_expand_args),
breadcrumb = expand("{out_to}/demux/.BC_DEMUX_COMPLETE", **demux_expand_args),
container: config["resources"]["sif"] + "weave_bclconvert_0.0.2.sif",
seq_data = expand("{out_to}/demux/{rid}/{project}/{sids}_R{rnums}_001.fastq.gz", **demux_expand_args if config['bclconvert'] else demux_noop_args),
undetermined = expand("{out_to}/demux/Undetermined_S0_R{rnums}_001.fastq.gz", **demux_expand_args if config['bclconvert'] else demux_noop_args),
stats = expand("{out_to}/demux/Reports/Demultiplex_Stats.csv", **demux_expand_args if config['bclconvert'] else demux_noop_args),
metrics = expand("{out_to}/demux/Reports/Adapter_Metrics.csv", **demux_expand_args if config['bclconvert'] else demux_noop_args),
breadcrumb = expand("{out_to}/demux/.BC_DEMUX_COMPLETE", **demux_expand_args if config['bclconvert'] else demux_noop_args),
container: config["resources"]["sif"] + "weave_bclconvert_0.0.3.sif",
threads: 24
shell:
"""
bcl-convert --bcl-input-directory {input.run_dir} --output-directory {params.out_dir}
bcl-convert \
--bcl-input-directory {input.run_dir} \
--force \
--output-directory {params.out_dir} \
--fastq-gzip-compression-level 9 \
--bcl-sampleproject-subdirectories true \
--bcl-num-conversion-threads 8 \
--bcl-num-compression-threads 8 \
--bcl-num-decompression-threads 8 \
--bcl-num-parallel-tiles 3 \
--no-lane-splitting true
mkdir -p {params.mv_dir}
mv {params.out_dir}/*.fastq.gz {params.mv_dir}
touch {output.breadcrumb}
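The **demux_expand_args if ... else demux_noop_args pattern used in the inputs and outputs above relies on expand() producing no targets when every wildcard list is empty, so whichever demultiplexing rule is not selected ends up with empty inputs and outputs and is never scheduled. A standalone sketch of the idea, using a small local stand-in for Snakemake's expand():

    from itertools import product

    def expand_sketch(template, **wildcards):
        """Tiny stand-in for snakemake.io.expand over keyword wildcard lists."""
        keys = list(wildcards)
        return [template.format(**dict(zip(keys, combo)))
                for combo in product(*(wildcards[k] for k in keys))]

    demux_expand_args = {'out_to': ['/data/out'], 'rnums': ['1', '2'], 'sids': ['Sample1', 'Sample2']}
    demux_noop_args = dict.fromkeys(demux_expand_args, [])

    template = "{out_to}/demux/{sids}_R{rnums}_001.fastq.gz"
    print(expand_sketch(template, **demux_expand_args))   # four fastq targets
    print(expand_sketch(template, **demux_noop_args))     # [] -> the rule is effectively a no-op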
5 changes: 3 additions & 2 deletions workflow/qc.smk
@@ -2,6 +2,7 @@ qc_expand_args = {
"rnums": config["rnums"],
"sids": config['sids'],
}
demux_stats = config['out_to'] + "/demux/Reports/Demultiplex_Stats.csv" if config['bclconvert'] else config['out_to'] + "/demux/Stats/Stats.json"


rule fastqc_untrimmed:
@@ -44,8 +45,8 @@ rule fastqc_trimmed:

rule multiqc_report:
input:
# bcl2fastq
config['out_to'] + "/demux/Stats/Stats.json",
# demux status
demux_stats,
# fastqc on untrimmed reads
expand(config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_001_fastqc.zip", **qc_expand_args),
# fastqc on trimmed reads
