# w19_command_line_output-fastq_to_genebycell.txt
# analysis lines for 19W supplemental data
# assumes the following software packages are installed: umi_tools (0.5.4), STAR(2.5.4b), BBTools, samtools(1.6), Subread (1.6.0)
# 1. FIND CELL BARCODE WHITELIST ##################################
# Builds one cell-barcode whitelist per run from the barcode read (_2 file).
# Each spec is "<run suffix>:<cell number>": the suffix completes the SRR41993NN
# accession and names the output; the cell number is passed to --set-cell-number.
# --bc-pattern: 8 cell-barcode bases (C) followed by 8 UMI bases (N).
for spec in 25:24 26:25 27:48 28:25 29:50 30:24; do
  suffix="${spec%%:*}"    # text before the colon, e.g. 25
  n_cells="${spec##*:}"   # text after the colon, e.g. 24
  umi_tools whitelist --stdin "SRR41993${suffix}_2.fastq.gz" \
    --bc-pattern=CCCCCCCCNNNNNNNN \
    --set-cell-number="${n_cells}" \
    --log2stderr > "${suffix}_2-whitelist.txt"
done
# READ-NAME CLEANUP FOR RUNS 28, 29, 30
# The pipeline accepts read names of the form "SRR4199325.1 1/1" and "SRR4199325.1 1/2",
# where the digit after the "/" marks the fwd/rev read; both mates share the name SRR4199325.1.
# It rejects the form "SRR4199330.1.1 1" and "SRR4199330.1.2 1", where the fwd/rev digit is
# appended to the read name itself, so the two mates no longer carry the same name.
# Reads in SRR4199328, SRR4199329 and SRR4199330 had this issue.
# The trailing ".1"/".2" is therefore trimmed so that both the fwd and rev files name the
# read SRR4199330.1.

# Reads FASTQ text on stdin and writes it to stdout with normalized read names:
#   - every line is cut down to its first whitespace-separated field
#   - on odd-numbered lines (1, 3, 5, ... = the "@name" and "+" lines of each 4-line record)
#     the last two characters are removed; a line shorter than two characters
#     (e.g. a bare "+") is left untouched
# Sequence and quality lines (even-numbered) pass through unchanged.
trim_read_names() {
  awk '{
    field = $1
    if (NR % 2 == 1) sub(/..$/, "", field)
    print field
  }'
}

# Stream each original archive through the trimmer straight into a new *p.fastq.gz file.
# The original .fastq.gz files are left in place and no uncompressed intermediates are
# written, so there is nothing to re-compress afterwards and no name collision with the
# existing archives.
# NOTE(review): downstream steps for runs 28-30 presumably need the *p.fastq.gz files
# as input instead of the originals - confirm.
for run in SRR4199328 SRR4199329 SRR4199330; do
  for mate in 1 2; do
    src="${run}_${mate}.fastq.gz"
    if [[ ! -f "${src}" ]]; then
      printf 'skipping %s: file not found\n' "${src}" >&2
      continue
    fi
    gunzip -c "${src}" | trim_read_names | gzip > "${run}_${mate}p.fastq.gz"
  done
done
# from here down, these commands were batched on files 25,26,27,28,29,30
# only the commands run on set 25 are shown, as an example
# 2. EXTRACT CB/UMIS AND FILTER CBS ##################################
sample=25                        # two-digit suffix of the run used in this example
accession="SRR41993${sample}"

# The barcode read (_2) goes in via --stdin; its mate (_1) goes in via --read2-in.
# --filter-cell-barcode together with --whitelist applies the whitelist from step 1.
extract_args=(
  --bc-pattern=CCCCCCCCNNNNNNNN
  --stdin "${accession}_2.fastq.gz"
  --stdout "${sample}_2_extracted.fastq.gz"
  --read2-in "${accession}_1.fastq.gz"
  --read2-out "${sample}_1_extracted.fastq.gz"
  --filter-cell-barcode
  --whitelist "${sample}_2-whitelist.txt"
)
umi_tools extract "${extract_args[@]}"

# quality-trim the extracted reads to Q10 with the Phred algorithm; qtrim=rl trims both the left and right ends
# the "#" is passed literally to bbduk (quoted, mid-word, so not a shell comment) and stands for the 1/2 of the paired files
bbduk.sh -Xmx15g "in=${sample}_#_extracted.fastq.gz" "out=${sample}_#_clean.fastq.gz" qtrim=rl trimq=10
# acquire genome alignment files - transcriptome
# Ensembl release 91 per-chromosome FASTA files: autosomes 1-22, then MT, X and Y
# (the {..} lists are expanded by bash into one URL per chromosome)
wget ftp://ftp.ensembl.org/pub/release-91/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.{1..22}.fa.gz
wget ftp://ftp.ensembl.org/pub/release-91/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.{MT,X,Y}.fa.gz
# concatenate every downloaded chromosome into a single FASTA
# (files are joined in the shell's sorted glob order, not in numeric chromosome order)
gunzip -c Homo_sapiens.GRCh38.dna.chromosome.* > ensemblGRCh38_r91_unmasked.fa
wget ftp://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz
# gunzip replaces the .gz with the decompressed .gtf, so the rename must act on the .gtf
gunzip Homo_sapiens.GRCh38.91.gtf.gz
mv Homo_sapiens.GRCh38.91.gtf ensemblGRCh38_r91.gtf
# acquire genome alignment files - repeatome
wget ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
gunzip hg38.fa.gz
mv hg38.fa hg38_UCSC.fa
# hg38_UCSC_repeatmasker_sorted.gtf - repeatome annotations from repeatmasker hg38.fa.out, converted to a GTF by Ping Ye
# remove the simple repeats: as a basic regex, "(*)n" matches any line containing ")n"
# (simple-repeat names presumably look like "(CA)n" - verify against the GTF)
grep -v "(*)n" hg38_UCSC_repeatmasker_sorted.gtf > hg38_UCSC_repeatmasker_noSR.gtf
# order by sequence name (column 1), then numerically by start coordinate (column 4)
sort -k1,1 -k4,4n hg38_UCSC_repeatmasker_noSR.gtf > hg38_UCSC_repeatmasker_noSR_sorted.gtf
# generate the genome indices used to align the cleaned/labeled reads with star/2.5.4b
# --sjdbOverhang             should be readlength-1, but Alex Dobin says values >100 don't improve accuracy
# --limitGenomeGenerateRAM   recommended by Alex Dobin; without it the jobs fail (value is in bytes)
# --limitSjdbInsertNsj       upper bound on the number of (collapsed) junctions inserted into the genome

# STAR writes the index into --genomeDir; create both directories up front so
# genomeGenerate does not fail on a missing output directory
mkdir -p repeatome_hg38_100overhang transcriptome_CRCh38_91_100overhang

# repeatome index: UCSC hg38 FASTA (saved above as hg38_UCSC.fa) + repeatmasker annotation
# NOTE(review): this uses the annotation that still contains simple repeats
# (hg38_UCSC_repeatmasker_sorted.gtf), while read assignment later uses the
# noSR_sorted version - confirm this is intended.
STAR --runThreadN 3 \
  --runMode genomeGenerate \
  --genomeDir repeatome_hg38_100overhang \
  --genomeFastaFiles hg38_UCSC.fa \
  --sjdbGTFfile hg38_UCSC_repeatmasker_sorted.gtf \
  --sjdbOverhang 100 \
  --limitGenomeGenerateRAM 250000000000 \
  --limitSjdbInsertNsj 6000000

# transcriptome index: Ensembl GRCh38 release 91 FASTA + gene annotation
# NOTE(review): the RAM limit here is 10x smaller than for the repeatome index
# (25000000000 vs 250000000000) - confirm this is not a typo.
STAR --runThreadN 3 \
  --runMode genomeGenerate \
  --genomeDir transcriptome_CRCh38_91_100overhang \
  --genomeFastaFiles ensemblGRCh38_r91_unmasked.fa \
  --sjdbGTFfile ensemblGRCh38_r91.gtf \
  --sjdbOverhang 100 \
  --limitGenomeGenerateRAM 25000000000 \
  --limitSjdbInsertNsj 3400000
# 3. MAP READS ##################################
# map to transcriptome = txn
txn_prefix=25_txn_aligned.sortedbycoord

# Prints how many alignments in the given BAM carry each SAM flag value (column 2).
count_sam_flags() {
  samtools view "$1" | cut -f 2 | sort | uniq -c
}

# outFilterScoreMinOverLread and outFilterMatchNminOverLread are set to 0 because the
# fastq files were unsorted; tweaking these parameters avoided an arduous sort of the
# 30gb (when zipped) files.
star_txn_opts=(
  --runThreadN 6
  --genomeDir transcriptome_CRCh38_91_100overhang/
  --readFilesIn 25_1_clean.fastq.gz 25_2_clean.fastq.gz
  --readFilesCommand zcat
  --outFilterMultimapNmax 1
  --outSAMtype BAM SortedByCoordinate
  --outFilterScoreMinOverLread 0
  --outFilterMatchNminOverLread 0
)
STAR "${star_txn_opts[@]}"

# rename STAR's fixed output name so this bam can be told apart from the other bams
mv Aligned.sortedByCoord.out.bam "${txn_prefix}.out.bam"
# list the unique sam flags contained in the alignment
count_sam_flags "${txn_prefix}.out.bam"
# keep the header (-h) and only the reads aligned in proper pairs (-f 0x2), written as BAM (-b)
samtools view -h -f 0x2 -b "${txn_prefix}.out.bam" > "${txn_prefix}.f2.bam"
# verify that the remaining flags are ones associated with properly paired reads
count_sam_flags "${txn_prefix}.f2.bam"
# map to repeatome = rptm
# outFilterMultimapNmax 100, winAnchorMultimapNmax 100, outSAMmultNmax 100 and
# outFilterMismatchNmax 3 are set so that multimappers are maintained.
# outFilterScoreMinOverLread / outFilterMatchNminOverLread are 0, as in the transcriptome mapping.
STAR --runThreadN 6 \
  --genomeDir repeatome_hg38_100overhang/ \
  --readFilesIn 25_1_clean.fastq.gz 25_2_clean.fastq.gz \
  --readFilesCommand zcat \
  --outFilterMultimapNmax 100 \
  --winAnchorMultimapNmax 100 \
  --outSAMmultNmax 100 \
  --outFilterMismatchNmax 3 \
  --outSAMtype BAM SortedByCoordinate \
  --outFilterScoreMinOverLread 0 \
  --outFilterMatchNminOverLread 0
# rename STAR's fixed output name
mv Aligned.sortedByCoord.out.bam 25_rptm_aligned.sortedByCoord.out.bam
# list the unique sam flags contained in the alignment
samtools view 25_rptm_aligned.sortedByCoord.out.bam | cut -f 2 | sort | uniq -c
# keep the header and the properly paired reads; note that multi-mappers are kept
# -b writes real BAM, matching the .bam file name and the transcriptome branch
# (without it samtools view emits SAM text)
samtools view -h -f 0x2 -b 25_rptm_aligned.sortedByCoord.out.bam > 25_rptm_aligned.sortedbycoord.f2.bam
# verify that the remaining flags are ones associated with properly paired reads, single or multimappers
samtools view 25_rptm_aligned.sortedbycoord.f2.bam | cut -f 2 | sort | uniq -c
# 4. ASSIGN READS TO GENES ##################################
# featureCounts with -R BAM writes a per-read assignment BAM named after the input:
#   <input>.featureCounts.bam
# That file is then coordinate-sorted under a shorter name for the counting step.

# transcriptome: features are grouped by the gene_name attribute of the Ensembl GTF
featureCounts -a ensemblGRCh38_r91.gtf -o 25_genes_assigned -R BAM 25_txn_aligned.sortedbycoord.f2.bam -T 8 -p -g gene_name
samtools sort 25_txn_aligned.sortedbycoord.f2.bam.featureCounts.bam -o 25_txn_featureassigned_sorted.bam

# repeatome: features are grouped by gene_id; -MO added to continue maintaining multimappers
# the input must be the f2.bam written by the repeatome mapping step, so that the
# .featureCounts.bam produced here is the file the following sort expects
featureCounts -a hg38_UCSC_repeatmasker_noSR_sorted.gtf -o 25_rptm_assigned -R BAM 25_rptm_aligned.sortedbycoord.f2.bam -T 8 -p -g gene_id -MO
samtools sort 25_rptm_aligned.sortedbycoord.f2.bam.featureCounts.bam -o 25_rptm_featureassigned_sorted.bam
# 5. COUNT UNIQUE READS PER GENES PER CELL ##################################
# umi_tools count reads the gene assignment from the XT tag (--gene-tag=XT). That tag is
# written by featureCounts, so the input has to be the featureCounts-assigned, sorted BAM
# from step 4 - the plain f2.bam from the aligner carries no XT tag.
# The BAMs are indexed first because umi_tools count needs an index alongside the BAM.
samtools index 25_txn_featureassigned_sorted.bam
samtools index 25_rptm_featureassigned_sorted.bam
# --per-cell with --wide-format-cell-counts writes a gene x cell table
umi_tools count --per-gene --gene-tag=XT --per-cell --wide-format-cell-counts -I 25_txn_featureassigned_sorted.bam -S 25txn_counts.tsv.gz
# NOTE(review): the repeatome table goes to a user-specific scratch directory that must
# already exist, while the transcriptome table is written to the working directory.
umi_tools count --per-gene --gene-tag=XT --per-cell --wide-format-cell-counts -I 25_rptm_featureassigned_sorted.bam -S /scratch/sf040090/UMIcountwide2/25rptm_counts.tsv.gz
# from here, the analysis moved to R