# w19_command_line_output-fastq_to_genebycell.txt

# analysis lines for 19W supplemental data

# assumes the following software packages are installed: umi_tools (0.5.4), STAR(2.5.4b), BBTools, samtools(1.6), Subread (1.6.0)

# 1. FIND CELL BARCODE WHITELIST ##################################
# One whitelist is built per run from its barcode read file (*_2.fastq.gz).
# Each entry is <run suffix>:<cell number passed to --set-cell-number>.
# The barcode pattern is 8 cell-barcode bases (C) followed by 8 UMI bases (N).
# --log2stderr keeps the log off stdout, so the redirect captures only the whitelist.
for run_cells in 25:24 26:25 27:48 28:25 29:50 30:24; do
  run_id=${run_cells%%:*}
  n_cells=${run_cells##*:}
  umi_tools whitelist --stdin "SRR41993${run_id}_2.fastq.gz" \
                      --bc-pattern=CCCCCCCCNNNNNNNN \
                      --set-cell-number="${n_cells}" \
                      --log2stderr > "${run_id}_2-whitelist.txt"
done

# the pipeline accepts read names of the form "SRR4199325.1 1/1" and "SRR4199325.1 1/2",
# where the digit after the / marks the fwd/rev read of the pair;
# this nomenclature doesn't cause hiccups down the line

# the pipeline rejects read names of the form "SRR4199330.1.1 1" and "SRR4199330.1.2 1",
# where the placement of the 1/2 fwd/rev digit inside the read name caused mistakes
# reads in SRR4199328, SRR4199329, SRR4199330 had this issue
# the trailing characters of these read names were trimmed, so the same read is named SRR4199330.1 in both the fwd and the rev read file

# Rewrite the read names of runs SRR4199328, SRR4199329 and SRR4199330.
#
# trim_fastq_read_names: filter from stdin to stdout.
#   - every line is reduced to its first whitespace-delimited field
#   - on odd-numbered lines (the "@name" header and the "+" line of each
#     4-line FASTQ record) the last two characters are dropped as well,
#     e.g. "@SRR4199330.1.1 1 ..." becomes "@SRR4199330.1"
#   - a line shorter than two characters (a bare "+") is left unchanged
trim_fastq_read_names() {
  awk '{ field = $1; if (NR % 2 == 1) sub(/..$/, "", field); print field }'
}

# Stream each file through the filter and write a compressed *p.fastq.gz copy.
# The original .fastq.gz files are only read, never modified, and no
# uncompressed intermediates are left behind. (A blanket "gzip SRR41993*"
# would also hit every already-compressed file and collide with the
# existing .gz originals.)
for run in SRR4199328 SRR4199329 SRR4199330; do
  for mate in 1 2; do
    gunzip -c "${run}_${mate}.fastq.gz" \
      | trim_fastq_read_names \
      | gzip > "${run}_${mate}p.fastq.gz"
  done
done

# from here down, these commands were batched on files 25,26,27,28,29,30
# I am only showing the command run on set 25 as an example

# 2. EXTRACT CB/UMIS AND FILTER CBS ##################################
# The barcode read (*_2) is passed as --stdin and the other mate (*_1) as
# --read2-in. --filter-cell-barcode together with --whitelist restricts the
# output to the cell barcodes listed in the whitelist file.
sample=25
extract_args=(
  --bc-pattern=CCCCCCCCNNNNNNNN
  --stdin "SRR41993${sample}_2.fastq.gz"
  --stdout "${sample}_2_extracted.fastq.gz"
  --read2-in "SRR41993${sample}_1.fastq.gz"
  --read2-out "${sample}_1_extracted.fastq.gz"
  --filter-cell-barcode
  --whitelist "${sample}_2-whitelist.txt"
)
umi_tools extract "${extract_args[@]}"

# Quality-trim the extracted reads to Q10 with the Phred algorithm.
# qtrim=rl trims from both the right and the left end.
# The "#" in the file names is passed to BBTools literally (quoted here for
# clarity); presumably BBTools substitutes 1 and 2 for it to address both
# mate files -- confirm against the BBTools usage notes.
bbduk.sh -Xmx15g \
  'in=25_#_extracted.fastq.gz' \
  'out=25_#_clean.fastq.gz' \
  qtrim=rl \
  trimq=10

# acquire genome alignment files - transcriptome
# Download the per-chromosome GRCh38 (Ensembl release 91) FASTA files; the
# {..} brace expansions are expanded by the shell into one URL per chromosome.
wget ftp://ftp.ensembl.org/pub/release-91/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.{1..22}.fa.gz
wget ftp://ftp.ensembl.org/pub/release-91/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.{MT,X,Y}.fa.gz
# Concatenate all downloaded chromosomes into one uncompressed FASTA
# (chromosomes appear in the glob's sort order, not numeric order).
gunzip -c Homo_sapiens.GRCh38.dna.chromosome.* > ensemblGRCh38_r91_unmasked.fa
# Download the matching gene annotation.
wget ftp://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz
# gunzip replaces the .gtf.gz with the uncompressed .gtf, so it is the .gtf
# (not the .gtf.gz, which no longer exists) that has to be renamed.
gunzip Homo_sapiens.GRCh38.91.gtf.gz
mv Homo_sapiens.GRCh38.91.gtf ensemblGRCh38_r91.gtf

# acquire genome alignment files - repeatome
# Fetch the UCSC hg38 assembly, unpack it and give it a source-tagged name.
wget ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
gunzip hg38.fa.gz
mv hg38.fa hg38_UCSC.fa

# hg38_UCSC_repeatmasker_sorted.gtf holds the repeatome annotations from
# repeatmasker's hg38.fa.out, converted to a GTF by Ping Ye.
# Drop the simple repeats. In grep's basic regex "(*" means "zero or more
# literal (", so this pattern discards every line containing ")n".
grep -v "(*)n" hg38_UCSC_repeatmasker_sorted.gtf > hg38_UCSC_repeatmasker_noSR.gtf
# Re-sort by sequence name (column 1), then numerically by start (column 4).
sort -k1,1 -k4,4n hg38_UCSC_repeatmasker_noSR.gtf > hg38_UCSC_repeatmasker_noSR_sorted.gtf

# generate the genomes to align cleaned/labeled reads with star/2.5.4b
# sjdbOverhang: should be readlength-1, but Alex Dobin says >100 doesn't improve accuracy
# limitGenomeGenerateRAM: Alex recommends setting this; without it, jobs fail
# limitSjdbInsertNsj: limit on the number of (collapsed) junctions to be inserted in the genome
#
# The options are kept in an array instead of backslash-continued lines:
# any whitespace after a trailing backslash ends the command at that line
# and the remaining options are then run as separate (failing) commands.
# The FASTA name matches the file produced by "mv hg38.fa hg38_UCSC.fa".
# NOTE(review): --sjdbGTFfile is the repeatmasker GTF that still contains the
# simple repeats, while featureCounts is run with the _noSR_sorted GTF --
# confirm this difference is intended.
repeatome_index_args=(
  --runThreadN 3
  --runMode genomeGenerate
  --genomeDir repeatome_hg38_100overhang
  --genomeFastaFiles hg38_UCSC.fa
  --sjdbGTFfile hg38_UCSC_repeatmasker_sorted.gtf
  --sjdbOverhang 100
  --limitGenomeGenerateRAM 250000000000
  --limitSjdbInsertNsj 6000000
)
# STAR 2.5.x expects the genome directory to exist before genomeGenerate runs.
mkdir -p repeatome_hg38_100overhang
STAR "${repeatome_index_args[@]}"

# Build the STAR index for the transcriptome alignment from the concatenated
# Ensembl FASTA and the Ensembl GTF.
# NOTE(review): limitGenomeGenerateRAM here is 25000000000, ten times smaller
# than the 250000000000 used for the repeatome index -- confirm it is intended.
transcriptome_index_args=(
  --runThreadN 3
  --runMode genomeGenerate
  --genomeDir transcriptome_CRCh38_91_100overhang
  --genomeFastaFiles ensemblGRCh38_r91_unmasked.fa
  --sjdbGTFfile ensemblGRCh38_r91.gtf
  --sjdbOverhang 100
  --limitGenomeGenerateRAM 25000000000
  --limitSjdbInsertNsj 3400000
)
# STAR 2.5.x expects the genome directory to exist before genomeGenerate runs.
mkdir -p transcriptome_CRCh38_91_100overhang
STAR "${transcriptome_index_args[@]}"

# 3. MAP READS ##################################
# map to transcriptome = txn
# --outFilterMultimapNmax 1 is the multimapper limit used for this alignment
# (the repeatome alignment uses 100 instead).
# outFilterScoreMinOverLread and outFilterMatchNminOverLread are set to 0
# because the fastq files were unsorted: rather than an arduous sort of these
# 30gb (when zipped) files, these parameters were tweaked.
txn_map_args=(
  --runThreadN 6
  --genomeDir transcriptome_CRCh38_91_100overhang/
  --readFilesIn 25_1_clean.fastq.gz 25_2_clean.fastq.gz
  --readFilesCommand zcat
  --outFilterMultimapNmax 1
  --outSAMtype BAM SortedByCoordinate
  --outFilterScoreMinOverLread 0
  --outFilterMatchNminOverLread 0
)
STAR "${txn_map_args[@]}"

# Post-process the transcriptome alignment.
txn_bam=25_txn_aligned.sortedbycoord.out.bam
txn_pairs_bam=25_txn_aligned.sortedbycoord.f2.bam

# give STAR's generic output name a sample/reference-specific one
mv Aligned.sortedByCoord.out.bam "${txn_bam}"
# tabulate the SAM flags (column 2) present in the alignment
samtools view "${txn_bam}" | cut -f 2 | sort | uniq -c
# keep the header (-h) and only reads flagged as properly paired (-f 0x2), as BAM (-b)
samtools view -h -f 0x2 -b "${txn_bam}" > "${txn_pairs_bam}"
# tabulate the flags again to verify only proper-pair flags remain
samtools view "${txn_pairs_bam}" | cut -f 2 | sort | uniq -c
# map to repeatome = rptm
# outFilterMultimapNmax 100, winAnchorMultimapNmax 100, outSAMmultNmax 100 and
# outFilterMismatchNmax 3 were modified to maintain multimappers.
# The two *OverLread filters are set to 0 exactly as in the transcriptome run.
rptm_map_args=(
  --runThreadN 6
  --genomeDir repeatome_hg38_100overhang/
  --readFilesIn 25_1_clean.fastq.gz 25_2_clean.fastq.gz
  --readFilesCommand zcat
  --outFilterMultimapNmax 100
  --winAnchorMultimapNmax 100
  --outSAMmultNmax 100
  --outFilterMismatchNmax 3
  --outSAMtype BAM SortedByCoordinate
  --outFilterScoreMinOverLread 0
  --outFilterMatchNminOverLread 0
)
STAR "${rptm_map_args[@]}"

# Post-process the repeatome alignment.
# rename STAR's generic output so it is not overwritten by another run
mv Aligned.sortedByCoord.out.bam 25_rptm_aligned.sortedByCoord.out.bam
# lists the unique sam flags contained in the alignment
samtools view 25_rptm_aligned.sortedByCoord.out.bam | cut -f 2 | sort | uniq -c
# keep the header and the properly paired reads; note that multi-mappers are kept.
# -b is required so that the .bam file really is BAM: without it samtools view
# writes SAM text (the transcriptome filter step already passes -b).
samtools view -h -b -f 2 25_rptm_aligned.sortedByCoord.out.bam > 25_rptm_aligned.sortedbycoord.f2.bam
# verifies that the flags maintained are ones which are associated with properly paired reads, single or multimappers
samtools view 25_rptm_aligned.sortedbycoord.f2.bam | cut -f 2 | sort | uniq -c

# 4. ASSIGN READS TO GENES ##################################

# Assign the transcriptome alignments to annotated genes (grouped by the
# gene_name attribute) in paired-end mode (-p) with 8 threads.
# -R BAM makes featureCounts write the per-read assignments to a BAM with the
# aggressively long name 25_txn_aligned.sortedbycoord.f2.bam.featureCounts.bam
featureCounts -p -T 8 \
  -a ensemblGRCh38_r91.gtf \
  -g gene_name \
  -R BAM \
  -o 25_genes_assigned \
  25_txn_aligned.sortedbycoord.f2.bam
# sort that BAM under a shorter name
samtools sort -o 25_txn_featureassigned_sorted.bam 25_txn_aligned.sortedbycoord.f2.bam.featureCounts.bam

# Assign the repeatome alignments to repeat features (grouped by gene_id).
# -M -O added to continue maintaining multimappers (-M counts multi-mapping
# reads, -O allows a read to be assigned to more than one overlapping feature).
# The input is the proper-pair BAM 25_rptm_aligned.sortedbycoord.f2.bam; with
# -R BAM the per-read assignments go to <input>.featureCounts.bam.
featureCounts -a hg38_UCSC_repeatmasker_noSR_sorted.gtf -o 25_rptm_assigned -R BAM 25_rptm_aligned.sortedbycoord.f2.bam -T 8 -p -g gene_id -M -O
samtools sort 25_rptm_aligned.sortedbycoord.f2.bam.featureCounts.bam -o 25_rptm_featureassigned_sorted.bam

# 5. COUNT UNIQUE READS PER GENES PER CELL ##################################

# umi_tools count reads the gene assignment from the XT tag, which is only
# present in the featureCounts-assigned BAMs, so both runs take the
# *_featureassigned_sorted.bam files as input (not the plain aligned f2 BAM).
# The BAMs are indexed first, as umi_tools needs an index for its input BAM.
samtools index 25_txn_featureassigned_sorted.bam
samtools index 25_rptm_featureassigned_sorted.bam

# --per-gene --gene-tag=XT --per-cell: count per gene and per cell barcode;
# --wide-format-cell-counts: write the result as a gene-by-cell table.
umi_tools count --per-gene --gene-tag=XT --per-cell --wide-format-cell-counts \
  -I 25_txn_featureassigned_sorted.bam \
  -S 25txn_counts.tsv.gz
# NOTE(review): the repeatome table is written to an absolute scratch path
# while the transcriptome table goes to the working directory -- confirm.
umi_tools count --per-gene --gene-tag=XT --per-cell --wide-format-cell-counts \
  -I 25_rptm_featureassigned_sorted.bam \
  -S /scratch/sf040090/UMIcountwide2/25rptm_counts.tsv.gz

#from here, I moved to R