diff --git a/README.md b/README.md
index baff5e3..498d8c2 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,22 @@
-<div align="center">
-    <h1>ScanNeo2 (working title)</h1>
+<div align="left">
+    <h1>ScanNeo2</h1>
+    <img src="https://img.shields.io/badge/snakemake-≥6.4.1-brightgreen.svg">
     <img src="https://github.com/ylab-hi/ScanNeo2/actions/workflows/linting.yml/badge.svg" alt="Workflow status badge">
 </div>
 
-
 ## What is ScanNeo2
-`Scanneo2` is a snakemake workflow for the prediction of neoantigens from 
-multiple sources. In its current state, this includes 
+`Scanneo2` is a Snakemake workflow for the prediction of neoantigens from multiple sources. In its current state, 
+it covers canonical splicing, exitron splicing, gene fusions, indels and SNVs.
 
 ## Getting Started
 
-In principle, Scanneo2 aims to 
-
+In principle, Scanneo2 aims to resolve its dependencies automatically and requires only snakemake and snakedeploy.
 
 ## Quickstart
 
 Install `snakemake` and `snakedeploy`
 ```
-mamba env create --file https://
+mamba env create --file https://github.com/ylab-hi/ScanNeo2/blob/devel/environment.yml
 mamba activate scanneo2
 ```
 Deploy Scanneo2
@@ -26,17 +25,19 @@ mkdir -p /path/to/working/directory/
 cd /path/to/working/directory/
 snakedeploy deploy-workflow https://github.com/ylab-hi/scanneo2 . --tag v0.1.0
 ```
+Configure ScanNeo2 by modifying `config/config.yaml`
+
 Run the workflow
 ```
+cd scanneo
 snakemake --cores all --use-conda
 ```
-
-Please consult the wiki for detailed instruction and explanations.
-
+Please consult the [wiki](https://github.com/ylab-hi/ScanNeo2/wiki) for detailed instructions and explanations.
 
 ### Docker
 
-We also provide a ready to use Docker Container that can be used 
+We also provide a ready-to-use [Docker Container](https://hub.docker.com/r/yanglabinfo/scanneo2) 
+that can be used to run Scanneo2.
 
 
 
diff --git a/config/config.yaml b/config/config.yaml
index 65e1682..da9d3ad 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,11 +1,30 @@
-refgen: testdata/GRCh38.p13.genome.fa
-annotation: testdata/GRCh38_chr1.gtf  
+data:
+  name:  patient2
+  dnaseq: 
+    rep1: TESLA_testdata/patient2/WES/TESLA_9_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_9_2.fastq.gz
+    rep2: TESLA_testdata/patient2/WES/TESLA_10_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_10_2.fastq.gz
+  rnaseq:
+    rep1: TESLA_testdata/patient2/RNA/TESLA_11_1.fastq.gz TESLA_testdata/patient2/RNA/TESLA_11_2.fastq.gz
 
+
+
+
+sample: patient2  # naming of the sample (results/sample)
 dnaseq:
 rnaseq: LNCAP_bam/G28033.LNCaP_clone_FGC.1.bam
 
 
-preproc: true
+### pre-processing (only applied on fastq reads)
+preproc: 
+  activate: true  # whether (=true) or not (=false) to include pre-processing
+  minlen: 10  # discard reads that are shorter than <minlen> bases
+  leading: 3  # remove leading low quality or N bases
+  trailing: 3  # remove trailing low quality or N bases
+  slidingwindow:
+    activate: true  # whether (=true) or not (=false) to apply sliding-window trimming
+    windowsize: 3  # number of bases to average across
+    quality: 20  # the average quality (Phred) required
+  adapters: TruSeq2-PE.fa    # path to fasta file containing adapters to be trimmed
 
 
 # if no readgroups file are provided 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 799764a..6ebeb62 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,61 +1,251 @@
 import os
-import re
 import glob
+from icecream import ic
 from pathlib import Path
 
+def data_structure(data):
+  config['data']['dnaseq'], filetype, readtype  = handle_seqfiles(config['data']['dnaseq'])
+  config['data']['dnaseq_filetype'] = filetype
+  config['data']['dnaseq_readtype'] = readtype
+  config['data']['rnaseq'], filetype, readtype  = handle_seqfiles(config['data']['rnaseq'])
+  config['data']['rnaseq_filetype'] = filetype
+  config['data']['rnaseq_readtype'] = readtype
+  return config['data']
+
+  
+def handle_seqfiles(seqdata):
+  readtype = []
+  filetype = []
+
+  # iterate over replicates
+  for rpl in seqdata.keys():
+    files = [Path(file) for file in seqdata[rpl].split(' ')]
+    ic(files)
+    if len(files) == 1:  # SE
+      f1_ext = get_file_extension(files[0])
+      if f1_ext in ['.fq', '.fastq', '.bam']:
+        seqdata[rpl] = files[0]
+        filetype.append(f1_ext)
+        readtype.append('SE')
+      else:
+        print('{} is not a valid file'.format(files[0]))
+    elif len(files) == 2:  # PE
+      f1_ext = get_file_extension(files[0])
+      f2_ext = get_file_extension(files[1])
+      # check if file extensions are the same
+      if f1_ext == f2_ext:
+        if(valid_paired_end(files[0], files[1])):
+          seqdata[rpl] = files
+          filetype.append(f1_ext)
+          readtype.append('PE')
+        else:
+          print('files not in valid PE format')
+      else:
+        print('files do not have the same extension')
+
+  # check if filetype and readtype are the same
+  if all_identical(filetype) and all_identical(readtype):
+    return seqdata, filetype[0], readtype[0]
+  else:
+    print('filetypes are not the same')
+    return seqdata, None, None
+
+# pre-processing for RNAseq data
+def get_raw_reads(wildcards):
+  if config['data']['rnaseq_readtype'] == 'SE':
+    return config['data']['rnaseq'][wildcards.sample]
+  else:  # PE
+    return dict(
+        zip(
+            ["r1", "r2"],
+            [config['data'][wildcards.seqtype][wildcards.replicate][0],
+             config['data'][wildcards.seqtype][wildcards.replicate][1]],
+        )
+    )
+
+# returns the reads (raw/preprocessed) for a given sample
+def get_reads(wildcards):
+  if config['preproc']['activate']:
+    if config['data'][wildcards.seqtype+'_readtype'] == 'SE':
+      return config['data'][wildcards.seqtype][wildcards.replicate]
+    elif config['data'][wildcards.seqtype+'_readtype'] == 'PE':
+      print("yes")
+      return {"r1": "results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz",
+              "r2": "results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz"}
+
+
+# determines the file extension for a given file - excludes .gz
+def get_file_extension(path):
+  filename = path.name
+  extpat = r'\.(fastq|fq|bam)(\.gz)?$'
+  res = re.search(extpat, filename)
+  file_ext = ''
+  if res is not None:
+      if res.group(0).endswith('.gz'):
+          file_ext = filename[res.start():-3]
+      else:
+          file_ext = filename[res.start():]
+  return file_ext
+
 # check if files are a valid paired-end pair
-def valid_paired_end(file1, file2):
-    # check if both files are in FASTA format
-    if Path(file1).suffix not in ['.fastq', '.fq'] and Path(file2).suffix not in ['.fastq', '.fq']:
-        return False
-
-    # check if first file contains _R1 or _fwd
-    if not ("_R1" in file1 or "_fwd" in file1):
-        return False
-    # check if second file contains _R2 or _rev
-    if not ("_R2" in file2 or "_rev" in file2):
-        return False
-
-    # check if the substrings until occurrence of either _R1/2 or _fwd/rev are equal
-    file1_indicator = "_R1" if "_R1" in file1 else "_fwd"  
-    file2_indicator = "_R2" if "_R2" in file2 else "_rev"
-    file1_idx = file1.find(file1_indicator)
-    file2_idx = file2.find(file2_indicator)
-
-    if file1_idx != file2_idx:
-        return False
-    
-    if file1[:file1_idx] != file2[:file2_idx]:
-        return False
+def valid_paired_end(path1, path2):
+  valid = False
+  
+  # only consider filename
+  file1 = path1.name 
+  file2 = path2.name
 
-    # check if first file contains _R1 and the secon file contaif "_R1" in string1 and "_R2" not in string2:
-        return False
-    
-    # check if first file contains _fwd and the secon file contains _rev
-    if "_fwd" in string1 and "_rev" not in string2:
-        return False
+  # match the read indicator (_R1/_R2, _1/_2 or _fwd/_rev) directly before the fastq extension
+  pattern = r'\_(R1|R2|1|2|fwd|rev)\.(fastq|fq){1}(\.gz)?$'
+  f1_se = re.search(pattern, file1)
+  f2_se = re.search(pattern, file1)
+
+  # pattern needs to be found in both files
+  if f1_se is not None and f2_se is not None:
+    if file1[:f1_se.start()] == file2[:f2_se.start()]:
+      valid = True
+    else:
+      print('{} and {} have different filestem '.format(file1, file2))
+  else:
+    print('{} and {} are not valid PE files'.format(file1, file2))
+
+  return valid
 
+# check if all elements of the list are identical
+def all_identical(l):
+  if l.count(l[0]) == len(l):
     return True
+  else:
+    return False
 
 
-rnaseq_input = {}
-rnaseq_filetype = None
-rnaseq_files = config['rnaseq'].split(' ')
-if len(rnaseq_files) == 1:
-    if Path(rnaseq_files[0]).suffix in ['.fq', '.fastq', '.bam']:
-        rnaseq_filetype = Path(rnaseq_files[0]).suffix # store file extension
-        rnaseq_input[Path(rnaseq_files[0]).stem] = rnaseq_files[0]
-    else:
-        print("no rnaseq files found")
-elif len(rnaseq_files) == 2:
-    # check if both files are in valid paired-end format
-    if valid_paired_end(rnaseq_files[0], rnaseq_files[1]):
-        rnaseq_filetype = Path(rnaseq_files[0]).suffix # store file extension
-        rnaseq_input.append((rnaseq_files[0], rnaseq_files[1]))
-    else:
-        print("no valid paired-end files found")
+config['data'] = data_structure(config['data'])
+ic(config['data'])
+
+
+rnaseq_filetype = ".bam"
+
+def get_splitfastq_input(wildcards):
+  if config['preproc']['activate']:
+    if config['data'][wildcards.seqtype] == 'SE':
+      return expand("results/{sample}/{seqtype}/preproc/reads.fq.gz", **wildcards)
+    elif config['data'][wildcards.seqtype] == 'PE':  # PE
+      return expand("results/{sample}/reads/{seqtype}/{replicate}_preproc_{readtype}.fq.gz",
+                    readtype=["r1", "r2"],
+                    seqtype=wildcards.seqtype,
+                    replicate = wildcards.replicate,
+                    sample = wildcards.sample)
+
+  else:   # no pre-processing has been performed
+    return rnaseq_input[wildcards.sample]
+
+
+def get_splitfastq_input_PE(wildcards):
+  if config['preproc']['activate']:
+    return expand("results/{sample}/rnaseq/reads/inputreads_{readtype}.fq.gz",
+        readtype=["r1", "r2"],
+        **wildcards
+    )
+  else:  # no pre-processing
+    return rnaseq_input[wildcards.sample]
+
+
+
+# input for alignment w/ DNAseq data
+def get_align_input_dnaseq(wildcards):
+  if config['preproc']['activate']:
+
 
 
+
+
+
+    if config['data']['dnaseq_readtype'] == 'SE':
+      return expand("results/{sample}/dnaseq/reads/inputreads.fq.gz", **wildcards)
+    else:  # PE
+      return dict(
+          zip(
+              ["fq1", "fq2"],
+              expand("results/{sample}/dnaseq/reads/inputreads_{readtype}.fq.gz",
+                     readtype=["r1", "r2"],
+                     **wildcards
+              )
+          )
+      )
+  else:  # no pre-processing
+    if config['data']['dnaseq_readtype'] == 'SE':
+      return dnaseq_input[wildcards.sample]
+    else:  # PE
+      return dict(
+          zip(
+              ["fq1", "fq2"],
+              dnaseq_input[wildcards.sample]
+          )
+      )
+
+
+
+def get_align_input(wildcards):
+  if config['preproc']['activate']:
+    if rnaseq_readtype == "se":
+      return expand("results/{sample}/rnaseq/reads/inputreads.fq.gz", **wildcards)
+    else:  # pe
+      ic("in here")
+      return dict(
+          zip(
+              ["fq1", "fq2"],
+              expand("results/{sample}/rnaseq/reads/inputreads_{readtype}.fq.gz",
+                     readtype=["r1", "r2"],
+                     **wildcards
+              )
+          )
+      )
+  else:  # no pre-processing
+    if rnaseq_readtype == "se":
+      return rnaseq_input[wildcards.sample]
+    else:  # pe
+      return dict(
+          zip(
+              ["fq1", "fq2"],
+              rnaseq_input[wildcards.sample]
+          )
+      )
+
+# determine the bamfiles that contain readgroups
+#def get_bams_readsgroups(wildcards):
+    
+#    fq1 = "results/{sample}/rnaseq/reads/{replicate}/r1/reads_{i}.fq.gz",
+#    aln = "results/{sample}/rnaseq/align/{replicate}/reads_{i}.bam",
+
+
+def get_readgroups_input(wildcards):
+  # return only bam from STAR align
+  if config['data']['rnaseq_filetype'] in ['.fq','.fastq']:
+    return ["results/{sample}/rnaseq/align/{replicate}_ready.bam".format(**wildcards)]
+  elif config['data']['rnaseq_readtype'] in ['.bam']:
+    val = []
+    val.append(config['data']['rnaseq'][wildcards.replicate][wildcards.sample])
+    val.append(extend("results/{sample}/rnaseq/align/{replicate}/ready.bam",
+        sample=wildcards.sample,
+        replicate=wildcards.replicate))
+
+
+
+def aggregate_alignments_fastq(wildcards):
+    # make sure that all samples are processed in checkpoint - split fastq file
+    checkpoint_output = checkpoints.splitfastq.get(**wildcards).output[0]
+    return expand("results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.bam",
+        sample=wildcards.sample,
+        replicate=wildcards.replicate,
+        i=glob_wildcards(os.path.join(checkpoint_output, "r1/reads_{i}.fq.gz")).i)
+
+def aggregate_alignments_pe(wildcards):
+    # make sure that all samples are processed in checkpoint - split fastq file
+    checkpoint_output = checkpoints.splitfastq_pe.get(**wildcards).output[0]
+    return expand("results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.bam",
+        sample=wildcards.sample,
+        i=glob_wildcards(os.path.join(checkpoint_output, "inputreads_{i}.fq.gz")).i)
+
 # aggregate results from STAR alignment
 def aggregate_alignments(wildcards):
     # make sure that all samples are processed in checkpoint - split fastq file
@@ -65,19 +255,16 @@ def aggregate_alignments(wildcards):
         readgroup=glob_wildcards(os.path.join(checkpoint_output, "{readgroup}.bam")).readgroup)
 
 
-def aggregate_bwa_alignments(wildcards):
-    split_bamfiles_output = checkpoints.bamfile_split.get(**wildcards).output[0]
-    return expand("results/{sample}/rnaseq/align/bamfiles/{file}.bam",
-        sample=wildcards.sample,
-        file=glob_wildcards(os.path.join(split_bamfiles_output, "{file}.bam")).file)
-    
-
+# getting input starting from align
 def get_rnaseq_data(wildcards):
+    print(rnaseq_input)
     if rnaseq_filetype == ".bam":
-        print(wildcards)
         return rnaseq_input[wildcards.sample]
     elif rnaseq_filetype == ".fastq" or rnaseq_filetype == ".fq":
-        return rnaseq_input[wildcards.sample]
+        if config['preproc']['activate']:  # preproc activated?
+            if len(rnaseq_input[wildcards.sample] == 2):  # PE?
+                return expand("results/{sample}/preproc/trimmed/trm.fq.gz",
+                    **wildcards)
     else:
         print("no rnaseq data found")
 
diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk
index 8ad79e9..01b4129 100644
--- a/workflow/rules/germline.smk
+++ b/workflow/rules/germline.smk
@@ -1,25 +1,6 @@
-rule picard_create_dict:
-    input:
-        "resources/refs/genome.fasta"
-    output:
-        "resources/refs/genome.dict"
-    log:
-        "logs/picard/create_dict.log"
-    params:
-        extra="",  # optional: extra arguments for picard.
-    # optional specification of memory usage of the JVM that snakemake will respect with global
-    # resource restrictions (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources)
-    # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`:
-    # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties
-    resources:
-        mem_mb=1024,
-    wrapper:
-        "v1.31.1/bio/picard/createsequencedictionary"
-
-
 # download training sets for calling high confidence variants
 # see https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR-
-rule gatk_vqsr_training_sets:
+rule get_gatk_vqsr_training_sets:
     output:
         snp_hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
         snp_hapmap_idx="resources/vqsr/hapmap_3.3.hg38.vcf.gz.tbi",
@@ -31,7 +12,8 @@ rule gatk_vqsr_training_sets:
         snp_dbSNP_idx="resources/vqsr/dbSNP_b150.vcf.gz.tbi",
         indel_mills="resources/vqsr/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
         indel_mills_idx="resources/vqsr/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
-
+    message:
+      "Downloading training sets for calling high confidence variants"
     shell:
         """
         curl -L https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/hapmap_3.3.hg38.vcf.gz -o resources/vqsr/hapmap_3.3.hg38.vcf.gz
@@ -47,16 +29,18 @@ rule gatk_vqsr_training_sets:
         """
 
 # do a first round of variant calling on original, unrecalibrated data
-rule htc_first:
+rule detect_indels_htc_1rd:
     input:
         # single or list of bam files
-        bam="results/{sample}/rnaseq/align/realigned.bam",
+        bam="results/{sample}/rnaseq/align/{replicate}_realigned.bam",
         ref="resources/refs/genome.fasta",
         known="resources/vqsr/dbSNP_b150.vcf.gz"  # optional
     output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf"
+        vcf="results/{sample}/variants/indel/htcaller/{replicate}_variants.1rd.vcf"
+    message:
+      "First round of variant calling on original, unrecalibrated data on sample:{wildcards.sample} with replicate:{wildcards.replicate}" 
     log:
-        "logs/{sample}/gatk/haplotypecaller/1rd.log",
+        "logs/{sample}/gatk/haplotypecaller/{replicate}_1rd.log",
     params:
         extra="", 
         java_opts="",
@@ -68,216 +52,223 @@ rule htc_first:
 
 
 # recalibrate variants (SNP)
-rule htc_first_snp_recal:
-    input:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
-        ref="resources/refs/genome.fasta",
-        hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
-        omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
-        g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
-        dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
-    output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf",
-        idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf.idx",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches"
-    log:
-        "logs/{sample}/gatk/vqsr/recal.first.snp"
+#rule recalibrate_variants_first_round:
+    #input:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_variants.1rd.vcf",
+        #ref="resources/refs/genome.fasta",
+        #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
+        #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
+        #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+        #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
+    #output:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf",
+        #idx="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf.idx",
+        #tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.tranches"
+    #message:
+      #"Recalibrate variants (SNP) on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
+    #log:
+        #"logs/{sample}/gatk/vqsr/{replicate}_1rd_snv_recal.log"
 
-    params:
-        mode="SNP", 
-        resources={
-            "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
-            "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
-            "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
-            "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
-        },
-        annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
-        extra=""
-    threads: config['threads']
-    resources:
-        mem_mb=1024,
-    wrapper:
-        "v1.31.1/bio/gatk/variantrecalibrator"
+    #params:
+        #mode="SNP", 
+        #resources={
+            #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
+            #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
+            #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
+            #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
+        #},
+        #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
+        #extra=""
+    #threads: config['threads']
+    #resources:
+        #mem_mb=1024,
+    #wrapper:
+        #"v1.31.1/bio/gatk/variantrecalibrator"
 
 
-rule htc_first_snp_apply_vqsr:
-    input:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
-        recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches",
-        ref="resources/refs/genome.fasta",
-    output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"
-    log:
-        "logs/{sample}/gatk/vqsr/apply.1rd.snp.log"
-    params:
-        mode="SNP",  # set mode, must be either SNP, INDEL or BOTH
-        extra="--truth-sensitivity-filter-level 99.5",  # optional
-    resources:
-        mem_mb=1024,
-    wrapper:
-        "v1.31.1/bio/gatk/applyvqsr"
+#rule apply_VQSR_SNVs_1rd:
+    #input:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_variants.1rd.vcf",
+        #recal="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf",
+        #tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.tranches",
+        #ref="resources/refs/genome.fasta",
+    #output:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.flt.vcf"
+    #message:
+      #"Apply VQSR (SNP) on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
+    #log:
+        #"logs/{sample}/gatk/vqsr/{replicate}_apply.1rd.snp.log"
+    #params:
+        #mode="SNP",  # set mode, must be either SNP, INDEL or BOTH
+        #extra="--truth-sensitivity-filter-level 99.5",  # optional
+    #resources:
+        #mem_mb=1024,
+    #wrapper:
+        #"v1.31.1/bio/gatk/applyvqsr"
 
 
-# repeat Variant Quality Score Recalibration for indels
-use rule htc_first_snp_recal as htc_first_indel_recal with:
-    output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf",
-        idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf.idx",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches"
-    log:
-        "logs/{sample}/gatk/vqsr/recal_first_snp.log"
-    params:
-        mode="INDEL", 
-        resources={
-            "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
-            "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
-            "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
-            "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
-        },
-        annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
-        extra=""
+### repeat Variant Quality Score Recalibration for indels
+#use rule recal_VQSE_SNVs_1rd as recal_VQSE_Indels_1rd with:
+  #output:
+      #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.recal.vcf",
+      #idx="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.recal.vcf.idx",
+      #tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.tranches"
+  #message:
+    #"Recalibrate variants (INDEL) on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
+  #log:
+    #"logs/{sample}/gatk/vqsr/{replicate}_1rd_indel_recal.log"
+  #params:
+    #mode="BOTH", 
+    #resources={
+      #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
+      #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
+      #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
+      #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
+      #},
+    #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
+    #extra=""
+        
 
 
-use rule htc_first_snp_apply_vqsr as htc_first_indel_apply_vsqr with:
-    input:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
-        recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches",
-        ref="resources/refs/genome.fasta",
-    output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf"
-    output:
-    log:
-        "logs/{sample}/gatk/vqsr/apply.final.indel.log"
-    params:
-        mode="INDEL", 
-        extra="--truth-sensitivity-filter-level 99.0",  # optional
+#use rule htc_first_snp_apply_vqsr as htc_first_indel_apply_vsqr with:
+    #input:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
+        #recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf",
+        #tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches",
+        #ref="resources/refs/genome.fasta",
+    #output:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf"
+    #output:
+    #log:
+        #"logs/{sample}/gatk/vqsr/apply.final.indel.log"
+    #params:
+        #mode="INDEL", 
+        #extra="--truth-sensitivity-filter-level 99.0",  # optional
 
 
-rule gatk_baserecalibrator:
-    input:
-        bam="results/{sample}/rnaseq/align/realigned.bam",
-        ref="resources/refs/genome.fasta",
-        dict="resources/refs/genome.dict",
-        known=["results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf", 
-            "results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"]
-    output:
-        recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
-    log:
-        "logs/gatk/baserecalibrator/{sample}.log",
-    params:
-        extra="",  # optional
-        java_opts="",  # optional
-    resources:
-        mem_mb=10240,
-    wrapper:
-        "v1.31.1/bio/gatk/baserecalibrator"
+#rule gatk_baserecalibrator:
+    #input:
+        #bam="results/{sample}/rnaseq/align/realigned.bam",
+        #ref="resources/refs/genome.fasta",
+        #dict="resources/refs/genome.dict",
+        #known=["results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf", 
+            #"results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"]
+    #output:
+        #recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
+    #log:
+        #"logs/gatk/baserecalibrator/{sample}.log",
+    #params:
+        #extra="",  # optional
+        #java_opts="",  # optional
+    #resources:
+        #mem_mb=10240,
+    #wrapper:
+        #"v1.31.1/bio/gatk/baserecalibrator"
 
 
-rule gatk_applybqsr:
-    input:
-        bam="results/{sample}/rnaseq/align/realigned.bam",
-        ref="resources/refs/genome.fasta",
-        dict="resources/refs/genome.dict",
-        recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
-    output:
-        bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam"
-    log:
-        "logs/gatk/gatk_applybqsr/{sample}.log",
-    params:
-        extra="",  # optional
-        java_opts="",  # optional
-        embed_ref=True,  # embed the reference in cram output
-    resources:
-        mem_mb=1024,
-    wrapper:
-        "v1.31.1/bio/gatk/applybqsr"
+#rule gatk_applybqsr:
+    #input:
+        #bam="results/{sample}/rnaseq/align/realigned.bam",
+        #ref="resources/refs/genome.fasta",
+        #dict="resources/refs/genome.dict",
+        #recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
+    #output:
+        #bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam"
+    #log:
+        #"logs/gatk/gatk_applybqsr/{sample}.log",
+    #params:
+        #extra="",  # optional
+        #java_opts="",  # optional
+        #embed_ref=True,  # embed the reference in cram output
+    #resources:
+        #mem_mb=1024,
+    #wrapper:
+        #"v1.31.1/bio/gatk/applybqsr"
 
-rule htcaller_main:
-    input:
-        # single or list of bam files
-        bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam",
-        ref="resources/refs/genome.fasta",
-        known="resources/vqsr/dbSNP_b150.vcf.gz"  # optional
-    output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf"
-    log:
-        "logs/gatk/haplotypecaller/{sample}.1rd.log",
-    params:
-        extra="",  # optional
-        java_opts="",  # optional
-    threads: config['threads']
-    resources:
-        mem_mb=1024,
-    wrapper:
-        "v1.31.1/bio/gatk/haplotypecaller"
+#rule htcaller_main:
+    #input:
+        ## single or list of bam files
+        #bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam",
+        #ref="resources/refs/genome.fasta",
+        #known="resources/vqsr/dbSNP_b150.vcf.gz"  # optional
+    #output:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf"
+    #log:
+        #"logs/gatk/haplotypecaller/{sample}.1rd.log",
+    #params:
+        #extra="",  # optional
+        #java_opts="",  # optional
+    #threads: config['threads']
+    #resources:
+        #mem_mb=1024,
+    #wrapper:
+        #"v1.31.1/bio/gatk/haplotypecaller"
 
 
-use rule htc_first_snp_recal as htc_final_snp_recal with:
-    input:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
-        ref="resources/refs/genome.fasta",
-        hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
-        omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
-        g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
-        dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
-    output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
-        idx="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf.idx",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches"
-    log:
-        "logs/{sample}/gatk/vqsr/recal.final.snp.log"
+#use rule htc_first_snp_recal as htc_final_snp_recal with:
+    #input:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+        #ref="resources/refs/genome.fasta",
+        #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
+        #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
+        #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+        #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
+    #output:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
+        #idx="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf.idx",
+        #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches"
+    #log:
+        #"logs/{sample}/gatk/vqsr/recal.final.snp.log"
 
 
-use rule htc_first_snp_apply_vqsr as htc_final_snp_apply_vsqr with:
-    input:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
-        recal="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches",
-        ref="resources/refs/genome.fasta",
-    output:
-        vcf="results/{sample}/variants/germ.snvs.vcf"
-    log:
-        "logs/{sample}/gatk/vqsr/apply.final.snp.log"
+#use rule htc_first_snp_apply_vqsr as htc_final_snp_apply_vsqr with:
+    #input:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+        #recal="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
+        #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches",
+        #ref="resources/refs/genome.fasta",
+    #output:
+        #vcf="results/{sample}/variants/germ.snvs.vcf"
+    #log:
+        #"logs/{sample}/gatk/vqsr/apply.final.snp.log"
 
 
-use rule htc_first_snp_recal as htc_final_indel_recal with:
-    input:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
-        ref="resources/refs/genome.fasta",
-        hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
-        omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
-        g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
-        dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
-    output:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
-        idx="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf.idx",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches"
-    log:
-        "logs/{sample}/gatk/vqsr/recal.final.indel.log"
-    params:
-        mode="INDEL",  
-        resources={
-            "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
-            "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
-            "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
-            "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
-        },
-        annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
-        extra=""
+#use rule htc_first_snp_recal as htc_final_indel_recal with:
+    #input:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+        #ref="resources/refs/genome.fasta",
+        #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
+        #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
+        #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+        #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
+    #output:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
+        #idx="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf.idx",
+        #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches"
+    #log:
+        #"logs/{sample}/gatk/vqsr/recal.final.indel.log"
+    #params:
+        #mode="INDEL",  
+        #resources={
+            #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
+            #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
+            #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
+            #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
+        #},
+        #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
+        #extra=""
         
 
-use rule htc_first_snp_apply_vqsr as htc_final_indel_apply_vsqr with:
-    input:
-        vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
-        recal="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
-        tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches",
-        ref="resources/refs/genome.fasta",
-    output:
-        vcf="results/{sample}/variants/germ.indel.vcf"
-    log:
-        "logs/{sample}/gatk/vqsr/apply.final.indel.log"
-    params:
-        mode="INDEL", 
-        extra="--truth-sensitivity-filter-level 99.0",  # optional
+#use rule htc_first_snp_apply_vqsr as htc_final_indel_apply_vsqr with:
+    #input:
+        #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+        #recal="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
+        #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches",
+        #ref="resources/refs/genome.fasta",
+    #output:
+        #vcf="results/{sample}/variants/germ.indel.vcf"
+    #log:
+        #"logs/{sample}/gatk/vqsr/apply.final.indel.log"
+    #params:
+        #mode="INDEL", 
+        #extra="--truth-sensitivity-filter-level 99.0",  # optional
diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk
index b73b824..6de2e1c 100644
--- a/workflow/rules/hlatyping.smk
+++ b/workflow/rules/hlatyping.smk
@@ -1,7 +1,11 @@
 rule get_hla_panel:
     output:
-        dna="misc/hla/hla_reference_dna.fasta",
-        rna="misc/hla/hla_reference_rna.fasta"
+        dna="resources/hla/hla_ref_dna.fasta",
+        rna="resources/hla/hla_ref_rna.fasta"
+    conda:
+        "../envs/basic.yml"
+    log:
+        "logs/hla_panel.log"
     shell:
         """
         curl -o {output.dna} https://raw.githubusercontent.com/FRED-2/OptiType/v1.3.5/data/hla_reference_dna.fasta
@@ -10,19 +14,27 @@ rule get_hla_panel:
 
 rule index_hla_panel:
     input:
-        dna="misc/hla/hla_reference_dna.fasta",
-        rna="misc/hla/hla_reference_rna.fasta",
+        dna="resources/hla/hla_ref_dna.fasta",
+        rna="resources/hla/hla_ref_rna.fasta"
     output:
-        "misc/hla/hla_dna.index.lf.drp", 
-        "misc/hla/hla_rna.index.lf.drp", 
+        dna=multiext("resources/hla/yara/idx/dna", 
+            ".lf.drp", ".lf.drs", ".lf.drv", 
+            ".lf.pst", ".rid.concat", ".rid.limits",
+            ".sa.ind", ".sa.len", ".sa.val", 
+            ".txt.concat", ".txt.limits", ".txt.size"),
+        rna=multiext("resources/hla/yara/idx/rna", 
+            ".lf.drp", ".lf.drs", ".lf.drv", 
+            ".lf.pst", ".rid.concat", ".rid.limits",
+            ".sa.ind", ".sa.len", ".sa.val", 
+            ".txt.concat", ".txt.limits", ".txt.size")
     log:
         "logs/yara_indexer.log"
     conda:
         "../envs/yara.yml"
     shell:
         """
-        yara_indexer -o misc/hla/hla.index {input.dna} > {log}
-        yara_indexer -o misc/hla/hla.index {input.rna} >> {log}
+        yara_indexer -o resources/hla/yara/idx/dna {input.dna} > {log}
+        yara_indexer -o resources/hla/yara/idx/rna {input.rna} >> {log}
         """
 
 rule prepare_bams:
@@ -37,6 +49,7 @@ rule prepare_bams:
     shell:
         "samtools merge -f - {input} | samtools fastq - > {output}"
 
+
 rule filter_hla:
     input:
         reads="results/hla/all.fastq",
diff --git a/workflow/rules/indel.smk b/workflow/rules/indel.smk
index 10aa24f..87a7a90 100644
--- a/workflow/rules/indel.smk
+++ b/workflow/rules/indel.smk
@@ -1,15 +1,17 @@
 import os
 from snakemake.remote import HTTP
 
-rule transindel_build:
+rule detect_long_indel_ti_build:
     input:
-        bam = "results/{sample}/rnaseq/align/realigned.bam",
-        idx = "results/{sample}/rnaseq/align/realigned.bam.bai"
+        bam = "results/{sample}/rnaseq/align/{replicate}_realigned.bam",
+        idx = "results/{sample}/rnaseq/align/{replicate}_realigned.bam.bai"
     output:
-        bam="results/{sample}/rnaseq/indel/transindel/build.bam",
-        idx="results/{sample}/rnaseq/indel/transindel/build.bam.bai"
+        bam="results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam",
+        idx="results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam.bai"
+    message:
+      "Building new BAM file with redefined CIGAR string using transindel build on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
     log:
-        "logs/transindel/build/{sample}.log"
+        "logs/{sample}/transindel/{replicate}_build.log"
     conda:
         "../envs/transindel.yml"
     shell:
@@ -18,18 +20,20 @@ rule transindel_build:
         -i {input.bam} \
         -o {output.bam} \
         -r resources/refs/genome.fasta \
-        -g resources/refs/genome.gtf > {log} 
-        samtools index {output.bam} -o {output.idx} >> {log} 
+        -g resources/refs/genome.gtf > {log} 2>&1
+        samtools index {output.bam} -o {output.idx} >> {log} 2>&1
         """
 
-rule transindel_call:
+rule detect_long_indel_ti_call:
     input:
-        bam = "results/{sample}/rnaseq/indel/transindel/build.bam",
-        bai = "results/{sample}/rnaseq/indel/transindel/build.bam.bai"
+        bam = "results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam",
+        bai = "results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam.bai"
     output:
-        "results/{sample}/rnaseq/indel/transindel/call.indel.vcf"
+        "results/{sample}/rnaseq/indel/transindel/{replicate}_call.indel.vcf"
+    message:
+      "Calling long indels using transindel on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
     log:
-        "logs/transindel/call/{sample}.log"
+        "logs/{sample}/transindel/{replicate}_call.log"
     conda:
         "../envs/transindel.yml"
     params:
@@ -39,59 +43,84 @@ rule transindel_call:
         python workflow/scripts/transIndel/transIndel_call.py \
         -i {input.bam} \
         -l 10 \
-        -o results/{wildcards.sample}/rnaseq/indel/transindel/call \
-        -m {params}
+        -o results/{wildcards.sample}/rnaseq/indel/transindel/{wildcards.replicate}_call \
+        -m {params} > {log} 2>&1
         """
 
 # resove alleles and remove PCR slippage
-rule slippage_removal:
+rule long_indel_slippage_removal:
     input:
-        "results/{sample}/rnaseq/indel/transindel/call.indel.vcf"
+        "results/{sample}/rnaseq/indel/transindel/{replicate}_call.indel.vcf"
     output:
-        "results/{sample}/variants/long.indel.vcf"
+        "results/{sample}/rnaseq/indel/transindel/{replicate}_sliprem.vcf"
+    message:
+      "Resolving alleles and removing PCR slippage using transindel on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
     log:
-        "logs/indel/sliprem{sample}.log"
+        "logs/{sample}/transindel/{replicate}_sliprem.log"
     conda:
         "../envs/transindel.yml"
     shell:
         """
             python3 workflow/scripts/slippage_removal.py \
-            resources/refs/genome.fasta {input} {output} > {log} 
+            resources/refs/genome.fasta {input} {output} > {log} 2>&1
         """
 
-rule gatk_mutect2:
+# combines the replicates into one vcf
+rule combine_longindels:
+  input:
+    expand("results/{sample}/rnaseq/indel/transindel/{replicate}_sliprem.vcf", 
+           sample=config['data']['name'],
+           replicate=config['data']['rnaseq'].keys())
+  output:
+    "results/{sample}/rnaseq/indel/long.indel.vcf" 
+  message:
+    "Combining long indels from replicates on sample:{wildcards.sample}"
+  log: 
+    "logs/{sample}/transindel/combine_replicates.log"
+  conda:
+    "../envs/manipulate_vcf.yml"
+  shell:
+    """
+      python workflow/scripts/combine_vcf.py '{input}' {output} > {log} 2>&1
+    """
+
+# detects short somatic variants (SNVs and indels) using mutect2
+rule detect_short_indels_m2:
     input:
         fasta="resources/refs/genome.fasta",
-        map="results/{sample}/rnaseq/align/realigned.bam"
+        map="results/{sample}/rnaseq/align/{replicate}_realigned.bam"
     output:
-        vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
-        bam="results/{sample}/rnaseq/indel/mutect2/variants.bam",
+        vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.vcf",
+        bam="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.bam",
     message:
-        "Detection of somatic SNVs/Indels with Mutect2 on sample {wildcards.sample}"
+      "Detection of somatic SNVs/Indels with Mutect2 on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
     threads: config['threads']
     resources:
         mem_mb=10024,
     params:
         extra="",
     log:
-        "logs/gatk/mutect2/{sample}.log",
+        "logs/{sample}/gatk/mutect2/{replicate}.log",
     wrapper:
         "v1.31.1/bio/gatk/mutect"
 
 
-rule gatk_filtermutectcalls:
+# filters short somatic variants (SNVs and indels) using FilterMutectCalls
+rule filter_short_indels:
     input:
-        vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
-        bam="results/{sample}/rnaseq/indel/mutect2/variants.bam",
+        vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.vcf",
+        bam="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.bam",
         ref="resources/refs/genome.fasta",
         # intervals="intervals.bed",
         # contamination="", # from gatk CalculateContamination
         # segmentation="", # from gatk CalculateContamination
         # f1r2="", # from gatk LearnReadOrientationBias
     output:
-        vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf"
+        vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.flt.vcf"
+    message:
+      "Filtering somatic SNVs/Indels with FilterMutectCalls on sample:{wildcards.sample} and replicate:{wildcards.replicate}"
     log:
-        "logs/gatk/filtermutect/{sample}.log",
+        "logs/{sample}/gatk/filtermutect/{replicate}.log",
     params:
         extra="--max-alt-allele-count 3",
         java_opts="",  # optional
@@ -100,15 +129,34 @@ rule gatk_filtermutectcalls:
     wrapper:
         "v1.31.1/bio/gatk/filtermutectcalls"
 
-
-rule gatk_select_SNPs:
+rule combine_short_indels_m2:
+  input:
+    expand("results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.flt.vcf", 
+           sample=config['data']['name'],
+           replicate=config['data']['rnaseq'].keys())
+  output:
+    "results/{sample}/rnaseq/indel/mutect2/variants.vcf" 
+  message:
+    "Combining somatic SNVs/Indels with Mutect2 on sample:{wildcards.sample}"
+  log: 
+    "logs/{sample}/gatk/combine/mutect2_replicates.log"
+  conda:
+    "../envs/manipulate_vcf.yml"
+  shell:
+    """
+      python workflow/scripts/combine_vcf.py '{input}' {output} > {log} 2>&1
+    """
+
+rule select_SNVs_m2:
     input:
-        vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf",
+        vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
         ref="resources/refs/genome.fasta",
     output:
-        vcf="results/{sample}/variants/snvs.vcf"
+        vcf="results/{sample}/rnaseq/indel/snvs.vcf"
+    message:
+      "Selecting somatic SNVs with SelectVariants on sample:{wildcards.sample}"
     log:
-        "logs/gatk/select/{sample}.snvs.log",
+        "logs/{sample}/gatk/select/somatic_snvs.log",
     params:
         extra="--select-type-to-include SNP",  # optional filter arguments, see GATK docs
         java_opts="",  # optional
@@ -117,13 +165,14 @@ rule gatk_select_SNPs:
     wrapper:
         "v1.31.1/bio/gatk/selectvariants"
 
-
-rule gatk_select_Indels:
+rule select_short_indels_m2:
     input:
-        vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf",
+        vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
         ref="resources/refs/genome.fasta",
     output:
-        vcf="results/{sample}/variants/short.indel.vcf"
+        vcf="results/{sample}/rnaseq/indel/short.indel.vcf"
+    message:
+      "Selecting short somatic indels with SelectVariants on sample:{wildcards.sample}"
     log:
         "logs/gatk/select/{sample}.indel.log",
     params:
@@ -134,85 +183,3 @@ rule gatk_select_Indels:
     wrapper:
         "v1.31.1/bio/gatk/selectvariants"
 
-
-
-
-# recalibrate variants (SNP)
-#rule gatk_variant_recal_snp2:
-#    input:
-#        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
-#        ref="resources/refs/genome.fasta",
-#        hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
-#        omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
-#        g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
-#        dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
-        
-#    output:
-#        vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf",
-#        idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf.idx",
-#        tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches"
-#    log:
-#        "logs/gatk/variantrecalibrator/{sample}.log",
-
-#    params:
-#        mode="SNP",  # set mode, must be either SNP, INDEL or BOTH
-#        resources={
-#            "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
-#            "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
-#            "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
-#            "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
-#        },
-#        annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
-#        extra=""
-#    threads: config['threads']
-#    resources:
-#        mem_mb=1024,
-#    wrapper:
-#        "v1.31.1/bio/gatk/variantrecalibrator"
-
-
-
-
-#rule gatk_applybqsr:
-#    input:
-#        bam="results/indel/realign/{sample}.bam",
-#        ref="resources/refs/genome.fasta",
-#        dict="resources/refs/genome.dict",
-#        recal_table="recal/{sample}.grp",
-#    output:
-#        bam="recal/{sample}.bam",
-#    log:
-#        "logs/gatk/gatk_applybqsr/{sample}.log",
-#    params:
-#        extra="",  # optional
-#        java_opts="",  # optional
-#        embed_ref=True,  # embed the reference in cram output
-#    resources:
-#        mem_mb=1024,
-#    wrapper:
-#        "v1.31.1/bio/gatk/applybqsr"
-
-
-
-#rule picard_split_vcfs:
-#    input:
-#        "results/indel/haplotypecaller/{sample}.vcf"
-#    output:
-#        snp="results/indel/{sample}.snp.vcf",
-#        indel="results/indel/{sample}.indel.vcf"
-#    log:
-#        "logs/spltvcfs{sample}.log"
-#    conda:
-#        "../envs/picard.yml"
-#    shell:
-#        """
-#            picard SplitVcfs I={input} \
-#            SNP_OUTPUT={output.snp} \
-#            INDEL_OUTPUT={output.indel}
-#            STRICT=false
-#        """
-
-
-
-
-
diff --git a/workflow/rules/preproc.smk b/workflow/rules/preproc.smk
index 7efb4c8..5dbace5 100644
--- a/workflow/rules/preproc.smk
+++ b/workflow/rules/preproc.smk
@@ -1,3 +1,160 @@
+rule trimmomatic_SE:
+  input:
+      unpack(get_raw_reads)
+  output:
+      "results/{sample}/rnaseq/preproc/reads.fq.gz"
+  log:
+      "logs/{sample}/trimmomatic.log"
+  params:
+      trimmer=["TRAILING:3"],
+      extra="",
+      compression_level="-9"
+  threads: config['threads']
+  resources:
+      mem_mb=1024
+  wrapper:
+      "v2.1.1/bio/trimmomatic/se"
+
+#  if rnaseq_readtype == "PE":
+rule trimmomatic_PE:
+  input:
+    unpack(get_raw_reads)
+  output:
+    r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz",
+    r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz",
+    r1_unpaired="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1_unpaired.fq.gz",
+    r2_unpaired="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2_unpaired.fq.gz"
+  params:
+    trimmer=[f"MINLEN:{config['preproc']['minlen']}"] 
+      + [f"TRAILING:{config['preproc']['trailing']}" if config['preproc']['trailing'] is not None else ""]
+      + [f"LEADING:{config['preproc']['leading']}" if config['preproc']['leading'] is not None else ""]
+      + [f"SLIDINGWINDOW:{config['preproc']['slidingwindow']['windowsize']}:{config['preproc']['slidingwindow']['quality']}" if config['preproc']['slidingwindow']['activate'] else ""]
+      + [f"ILLUMINACLIP:{config['preproc']['adapters']}:2:30:10" if config['preproc']['adapters'] is not None else ""],
+      extra=""
+  log:
+    "logs/{sample}/trimmomatic/{replicate}_{seqtype}.log"
+  threads: config['threads']
+  resources:
+      mem_mb=1024
+  wrapper:
+      "v2.1.1/bio/trimmomatic/pe"
+
+rule add_rg_fastq_PE:
+  input:
+#    r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz",
+#    r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz",
+    unpack(get_reads),
+  output:
+    r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_RG_r1.fq.gz",
+    r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_RG_r2.fq.gz"
+  message:
+    "Adding read group information to fastq files"
+  log:
+    "logs/{sample}/add_rg/{replicate}_{seqtype}.log"
+  conda:
+    "../envs/basic.yml"
+  shell:
+    """
+      bash workflow/scripts/addrgfq.sh {input.r1} 2> {log} | gzip -c - > {output.r1}
+      bash workflow/scripts/addrgfq.sh {input.r2} 2>> {log} | gzip -c - > {output.r2}
+    """
+
+
+
+checkpoint splitfastq:
+  input:
+    unpack(get_splitfastq_input)
+  output:
+    directory("results/{sample}/reads/rnaseq/{replicate}/")
+  log:
+    "logs/{sample}/splitfastq/{replicate}.log"
+  conda:
+    "../envs/splitfastq.yml"
+  threads: 0
+  shell:
+    """
+      python workflow/scripts/splitfastq.py '{input}' {output} 20000000
+    """
+
+rule star_align_fastq:
+  input:
+    fq1 = "results/{sample}/reads/rnaseq/{replicate}/r1/reads_{i}.fq.gz",
+    fq2 = "results/{sample}/reads/rnaseq/{replicate}/r2/reads_{i}.fq.gz",
+    idx = "resources/refs/star/",
+  output:
+    aln = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.bam",
+    log = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.log",
+    sj = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.tab"
+  log:
+    "logs/star_align/{sample}_{replicate}_{i}.log"
+  params:
+      extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:noRG"
+  threads: config['threads']
+  wrapper:
+      "v1.26.0/bio/star/align"
+
+rule merge_alignment_results_fastq:
+    input:
+      aggregate_alignments_fastq
+    output:
+        "results/{sample}/rnaseq/align/{replicate}_aligned.bam",
+    log:
+        "logs/samtools/merge/{sample}_{replicate}.log",
+    params:
+        extra="",  # optional additional parameters as string
+    threads: config['threads']
+    wrapper:
+        "v1.32.1/bio/samtools/merge"
+
+rule star_align_pe:
+    input:
+        fq1 = "results/{sample}/rnaseq/reads/fastqfiles/r1/inputreads_{i}.fq.gz",
+        fq2 = "results/{sample}/rnaseq/reads/fastqfiles/r2/inputreads_{i}.fq.gz",
+        idx = "resources/refs/star/",
+    output:
+        aln = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.bam",
+        log = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.log",
+        sj = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.tab"
+    log:
+        "logs/star_align/{sample}_{i}.log"
+    params:
+        extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:noRG"
+    threads: config['threads']
+    wrapper:
+        "v1.26.0/bio/star/align"
+
+rule merge_alignment_results_pe:
+    input:
+        aggregate_alignments_pe
+    output:
+        "results/{sample}/rnaseq/align/aligned.bam",
+    log:
+        "logs/samtools/merge/{sample}.log",
+    params:
+        extra="",  # optional additional parameters as string
+    threads: config['threads']
+    wrapper:
+        "v1.32.1/bio/samtools/merge"
+
+
+  #rule align_with_star_fq:
+    #input:
+        #unpack(get_align_input),
+        #idx="resources/refs/star/"
+    #output:
+        ## see STAR manual for additional output files
+        #aln="results/{sample}/rnaseq/align/aligned.bam",
+        #log="logs/{sample}/star/Log1.out",
+        #sj="results/{sample}/rnaseq/align/sj.out.tab"
+    #log:
+        #"logs/{sample}/star/Log.out"
+    #params:
+        #extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:xxx"
+    #threads: config['threads']
+    #wrapper:
+        #"v2.1.1/bio/star/align"
+
+
 if rnaseq_filetype == ".bam":
     checkpoint split_bamfile_RG:
         input:
@@ -17,7 +174,6 @@ if rnaseq_filetype == ".bam":
                 -h {input} -f {output}/%!.%. {input}
             """
 
-if rnaseq_filetype == ".bam":
     rule bam_to_fastq:
         input:
             "results/{sample}/rnaseq/reads/bamfiles/{readgroup}.bam"
@@ -34,7 +190,6 @@ if rnaseq_filetype == ".bam":
                 | samtools fastq -OT RG -@ {threads} - | gzip -c - > {output}
             """
 
-if rnaseq_filetype == ".bam":
     rule align_with_star:
         input:
             fq1 = "results/{sample}/rnaseq/reads/fastqfiles/{readgroup}.fq.gz",
@@ -51,7 +206,6 @@ if rnaseq_filetype == ".bam":
         wrapper:
             "v1.26.0/bio/star/align"
 
-
 if rnaseq_filetype == ".bam":
     rule merge_alignment_results:
         input:
@@ -69,13 +223,13 @@ if rnaseq_filetype == ".bam":
 
 rule samtools_postproc:
     input:
-        "results/{sample}/rnaseq/align/aligned.bam"
+        "results/{sample}/rnaseq/align/{replicate}_aligned.bam"
     output:
-        "results/{sample}/rnaseq/align/ready.bam"
+        "results/{sample}/rnaseq/align/{replicate}_ready.bam"
     conda:
         "../envs/samtools.yml"
     log:
-        "logs/samtools/postproc/{sample}.log"
+        "logs/samtools/postproc/{sample}_{replicate}.log"
     threads: 6  # more threads brings no significant increase
     shell:
         """ 
@@ -89,11 +243,11 @@ rule samtools_postproc:
 
 rule samtools_postproc_index:
     input:
-        "results/{sample}/rnaseq/align/ready.bam"
+        "results/{sample}/rnaseq/align/{replicate}_ready.bam"
     output:
-        "results/{sample}/rnaseq/align/ready.bam.bai"
+        "results/{sample}/rnaseq/align/{replicate}_ready.bam.bai"
     log:
-        "logs/samtools/index/{sample}.log"
+        "logs/samtools/index/postproc_{sample}_{replicate}.log"
     params:
         extra="",  # optional additional parameters as string
     threads: config['threads']
@@ -102,31 +256,37 @@ rule samtools_postproc_index:
 
 
 # retrieve readgroups from bam file
-if rnaseq_filetype == ".bam":
-    rule determine_readgroups:
-        input:
-            get_rnaseq_data
-        output:
-            "results/{sample}/rnaseq/reads/readgroups.txt"
-        log:
-            "logs/readgroups/{sample}.log"
-        shell:
-            """
-                python workflow/scripts/get_readgroups.py {input} \
-                {output} > {log} 2>&1
-            """
+rule get_readgroups:
+    input:
+      get_readgroups_input
+      #"results/{sample}/rnaseq/align/{replicate}_ready.bam"
+    output:
+        "results/{sample}/rnaseq/reads/{replicate}_readgroups.txt"
+    conda:
+      "../envs/basic.yml"
+    log:
+        "logs/{sample}/get_readgroups/{replicate}.log"
+    shell:
+        """
+            python workflow/scripts/get_readgroups.py '{input}' \
+            {output} > {log} 2>&1
+        """
+
+
+
+
 
 rule realign:
     input:
-        bam="results/{sample}/rnaseq/align/ready.bam",
-        rg="results/{sample}/rnaseq/reads/readgroups.txt"
+        bam="results/{sample}/rnaseq/align/{replicate}_ready.bam",
+        rg="results/{sample}/rnaseq/reads/{replicate}_readgroups.txt"
     output:
-        "results/{sample}/rnaseq/align/realigned.bam"
+        "results/{sample}/rnaseq/align/{replicate}_realigned.bam"
     threads: config['threads']
     shell:
         """
-            samtools collate -Oun128 {input.bam} \
-            | samtools fastq -OT RG,BC - \
+          samtools collate -Oun128 {input.bam} \
+            | samtools fastq -OT RG -@ {threads} - \
             | bwa mem -pt{threads} -CH <(cat {input.rg}) resources/refs/bwa/genome - \
             | samtools sort -@6 -m1g - > {output}
         """
@@ -134,11 +294,11 @@ rule realign:
 
 rule realign_index:
     input:
-        "results/{sample}/rnaseq/align/realigned.bam"
+        "results/{sample}/rnaseq/align/{replicate}_realigned.bam"
     output:
-        "results/{sample}/rnaseq/align/realigned.bam.bai"
+        "results/{sample}/rnaseq/align/{replicate}_realigned.bam.bai"
     log:
-        "logs/samtools/index/{sample}.log"
+        "logs/{sample}/realign_index/{sample}_{replicate}.log"
     params:
         extra="",  # optional additional parameters as string
     threads: config['threads']
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 4b7dff2..4c8c120 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -81,3 +81,20 @@ rule bwa_index:
         algorithm="bwtsw",
     wrapper:
         "v1.26.0/bio/bwa/index"
+
+
+rule create_sequence_dictionary:
+    input:
+        "resources/refs/genome.fasta"
+    output:
+        "resources/refs/genome.dict"
+    message:
+      "Create sequence dictionary of reference genome"
+    log:
+        "logs/picard/create_dict.log"
+    params:
+        extra="",  # optional: extra arguments for picard.
+    resources:
+        mem_mb=1024,
+    wrapper:
+        "v1.31.1/bio/picard/createsequencedictionary"