-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path06.eQTL_SV.sh
136 lines (132 loc) · 5.77 KB
/
06.eQTL_SV.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
### eQTL mapping
# RNA-seq read processing for one sample: download -> clean -> align -> quantify.
# NOTE(review): the download is keyed by ${SRAnumber} while every later step
# uses ${SAMPLE}; this assumes both hold the same run accession -- confirm.
# RNAseq
# --gzip makes fastq-dump write compressed files into ./data directly, so the
# former separate `gzip data/${SAMPLE}_N.fastq` calls (which pointed at
# uncompressed files that --gzip never leaves behind) are not needed.
fastq-dump "${SRAnumber}" --gzip --split-3 -O ./data
# fastp
# Inputs are taken from ./data, where the download step wrote them; the cleaned
# pair is written to the working directory with 16 worker threads.
fastp -i "data/${SAMPLE}_1.fastq.gz" -I "data/${SAMPLE}_2.fastq.gz" \
  -o "${SAMPLE}_1.fastqc.gz" -O "${SAMPLE}_2.fastqc.gz" -w 16
# STAR index
# Runs in the foreground: the mapping step below needs the finished index, so
# it must not be launched with a trailing `&`.
STAR --runThreadN 20 --runMode genomeGenerate --genomeDir star \
  --genomeFastaFiles Sus_scrofa.Sscrofa11.1.dna.toplevel.fa \
  --sjdbGTFfile Sus_scrofa.Sscrofa11.1.109.gtf --sjdbOverhang 149
# STAR mapping
# Produces ${SAMPLE}Aligned.sortedByCoord.out.bam (prefix + STAR's fixed suffix),
# which is the file both quantification steps below read.
STAR --runThreadN 20 --genomeDir star --quantMode TranscriptomeSAM \
  --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within \
  --readFilesCommand zcat --outFilterMismatchNmax 3 \
  --readFilesIn "${SAMPLE}_1.fastqc.gz" "${SAMPLE}_2.fastqc.gz" \
  --outFileNamePrefix "${SAMPLE}"
# stringtie
# -e restricts estimation to the reference annotation given with -G; abundance
# tables go to ${SAMPLE}.tsv (-A) and Ballgown tables are requested with -B.
stringtie -p 20 -e -B -G Sus_scrofa.Sscrofa11.1.109.gtf -o "${SAMPLE}.gtf" \
  -A "${SAMPLE}.tsv" "${SAMPLE}Aligned.sortedByCoord.out.bam"
# featurecounts
# Exon-level features summarised per gene_id for paired-end data.
# NOTE(review): in recent Subread releases -p only declares the input as
# paired-end and fragments are counted only with --countReadPairs -- confirm
# against the installed version.
featureCounts -T 20 -p -t exon -g gene_id -a Sus_scrofa.Sscrofa11.1.109.gtf \
  -o "${SAMPLE}.featureCounts.txt" "${SAMPLE}Aligned.sortedByCoord.out.bam"
# preprocess for eQTL mapping
# TSS position
python{
from io1 import gtf_to_tss_bed

# Build the TSS table from the Ensembl pig annotation and save it as a
# tab-separated file. gtf_to_tss_bed comes from the project-local io1 module;
# it is asked for 'gene' features keyed by gene_id with no chromosomes excluded.
# NOTE(review): the result is presumably a pandas DataFrame (it exposes
# to_csv); the R preprocessing step later reads this file with a header row.
GTF_PATH = "Sus_scrofa.Sscrofa11.1.109.gtf"
TSS_BED_PATH = "Sus11TSS.bed"

tss_table = gtf_to_tss_bed(
    GTF_PATH,
    feature="gene",
    exclude_chrs=[],
    phenotype_id="gene_id",
)
tss_table.to_csv(TSS_BED_PATH, sep="\t", index=False)
}
# preprocess for expression data
R{
library(edgeR)
library(RNOmni)
# 1. load sample metadata and expression matrices
# remove samples with RNA, without WGS
# snp.sample: sample IDs present in the genotype data (one ID per token).
snp.sample = scan("../../genotype/SNP.sample", what = "character")
info = read.table("../../preprocess/sampleInfo.txt", header = TRUE)
all.sample = unique(info$wgsID)
# WGS IDs listed in the sample sheet but absent from the genotype sample list.
miss.sample = setdiff(all.sample, snp.sample)
# %in% instead of `!=`: miss.sample is a vector, and `!=` would recycle it
# element-wise (and return zero rows when nothing is missing).
info1 = info[!(info$wgsID %in% miss.sample), ]
tissue = "M"
featuerCounts = read.table("../featureCounts.csv", header = TRUE, sep = ",")
tpm = read.table("../tpm_matirx.csv", header = TRUE, sep = ",")
# The gene filter later combines TPM- and count-based masks row by row, so both
# matrices must list the same genes in the same order; abort instead of
# printing a banner and carrying on with misaligned rows.
if (!identical(featuerCounts[, 1], tpm[, 1])) {
  stop("featureCounts and TPM matrices do not list the same genes in the same order",
       call. = FALSE)
}
# 2. remove samples without WGS
# NOTE(review): the hard-coded ID is presumably the single entry of
# miss.sample computed above -- confirm before adding new samples.
sample.info = read.table("../../preprocess/sampleInfo.txt", header = TRUE)
rnaID.rm = sample.info[sample.info[, 4] == "SAMEA111521213", 1]
featuerCounts = featuerCounts[, !(colnames(featuerCounts) %in% rnaID.rm)]
tpm = tpm[, !(colnames(tpm) %in% rnaID.rm)]
sample.info = sample.info[sample.info[, 4] != "SAMEA111521213", ]
# 3. extract WGS according the samples in RNA
# Column 4 is used as the row key and looked up with the genotype sample IDs;
# column 1 holds the matching column name in the expression matrices.
sample.tmp = sample.info[sample.info$tissue == tissue, ]
rownames(sample.tmp) = sample.tmp[, 4]
rnaID = sample.tmp[snp.sample, 1]
# A genotyped sample without a record for this tissue yields NA here, which
# would otherwise surface as an opaque "undefined columns selected" error.
if (anyNA(rnaID)) {
  stop("no tissue ", tissue, " RNA-seq record for genotyped sample(s): ",
       paste(snp.sample[is.na(rnaID)], collapse = ", "), call. = FALSE)
}
featuerCounts.temp = featuerCounts[, c("GeneID", rnaID)]
tpm.temp = tpm[, c("GeneID", rnaID)]
# 4. replace sampleID in RNAseq
# Lookup vector: names are the RNA-seq column names, values the column-4 IDs;
# "GeneID" maps to itself so the gene column keeps its name.
rnaID2wgsID = c(sample.tmp[, 4], "GeneID")
names(rnaID2wgsID) = c(sample.tmp[, 1], "GeneID")
colnames(featuerCounts.temp) = rnaID2wgsID[colnames(featuerCounts.temp)]
colnames(tpm.temp) = rnaID2wgsID[colnames(tpm.temp)]
# Move the gene IDs into the row names and drop the ID column; drop = FALSE
# keeps a data frame even when only one sample column remains.
rownames(featuerCounts.temp) = featuerCounts.temp[, 1]
featuerCounts.temp = featuerCounts.temp[, -1, drop = FALSE]
rownames(tpm.temp) = tpm.temp[, 1]
tpm.temp = tpm.temp[, -1, drop = FALSE]
# 5. gene filter
# A gene is kept when at least sample_frac_threshold of the samples reach BOTH
# tpm_threshold in the TPM matrix and count_threshold in the raw counts.
count_threshold = 6
tpm_threshold = 0.1
sample_frac_threshold = 0.2
nsamples = ncol(tpm.temp)
min_samples = sample_frac_threshold * nsamples
tpm_th = rowSums(tpm.temp >= tpm_threshold)
count_th = rowSums(featuerCounts.temp >= count_threshold)
ctrl1 = tpm_th >= min_samples
ctrl2 = count_th >= min_samples
mask = ctrl1 & ctrl2
if (sum(mask, na.rm = TRUE) == 0) {
  stop("no gene passes the expression filter; check thresholds and input matrices",
       call. = FALSE)
}
# 6. generate TMM exp matrix
# Normalisation factors are estimated on the full count matrix; the gene filter
# is applied to the normalised CPM values afterwards.
expr = DGEList(counts = featuerCounts.temp)
y = calcNormFactors(expr, method = "TMM")
TMM = cpm(y, normalized.lib.sizes = TRUE)
# drop = FALSE keeps a matrix even if a single gene passes, so the row-wise
# apply() that follows still works.
TMM_pass = TMM[mask, , drop = FALSE]
#expression values (TMM) were inverse normal transformed across samples.
# Rank-based inverse normal transform of a numeric vector.
#
# Each value is replaced by qnorm(rank / (n + 1)), where ties receive their
# average rank (the default of rank()) and n is the number of non-missing
# values. Missing values stay NA and do not take part in the ranking, so they
# neither receive a transformed value nor inflate the denominator. For input
# without NA the result is identical to qnorm(rank(x) / (length(x) + 1)).
#
# x: numeric vector (names, if any, are carried over by rank()).
# Returns a numeric vector of the same length as x.
inverse_normal_transform = function(x) {
  n_obs = sum(!is.na(x))
  qnorm(rank(x, na.last = "keep") / (n_obs + 1))
}
# Transform every gene (row) across samples. apply() over rows returns the
# per-gene results as columns, hence the transpose back to genes x samples.
TMM_inv = t(apply(TMM_pass, 1, inverse_normal_transform))
# 7. generate bed file for QTLtools
# Layout written below: #Chr, start, end, pid, gid, strand, then one column per
# sample. The TSS table is assumed to have exactly four columns, the last one
# named gene_id.
# NOTE(review): the mapping step refers to tissueM.bed.gz, so this file is
# presumably compressed and indexed outside this snippet -- confirm.
tss.bed = read.table("../Sus11TSS.bed", header = TRUE)
keep.gene = is.element(tss.bed$gene_id, rownames(TMM_inv))
tss.bed = tss.bed[keep.gene, ]
names(tss.bed) = c("#Chr", "start", "end", "pid")
tss.bed[["gid"]] = "."
tss.bed[["strand"]] = "+"
# Reorder the expression rows to follow the order of the TSS table.
expr.by.tss = TMM_inv[tss.bed$pid, ]
bed.out = cbind(tss.bed, expr.by.tss)
write.table(
  bed.out,
  file = paste0("tissue", tissue, ".bed"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
### COV file
# Writes cov.txt with one covariate per row and one sample per column; the
# first row holds the sample IDs.
# NOTE(review): this path is ../preprocess while the sample sheet above is read
# from ../../preprocess -- confirm which directory is intended.
cov.info = read.table("../preprocess/eQTLsampleInfo.txt", header = FALSE)
cov.header = c("id", "month", "sex", "breed")
if (ncol(cov.info) != length(cov.header)) {
  stop("eQTLsampleInfo.txt must have ", length(cov.header), " columns (",
       paste(cov.header, collapse = ", "), "), found ", ncol(cov.info),
       call. = FALSE)
}
# Name the columns up front (the original assigned them only after the output
# table had already been built, where it had no effect).
colnames(cov.info) = cov.header
# Translate the IDs in the first column: names come from column 2 of the
# tissue sample table, values from its column 4.
sample2id = sample.tmp[, 4]
names(sample2id) = sample.tmp[, 2]
cov.info[, 1] = sample2id[cov.info[, 1]]
n.unmapped = sum(is.na(cov.info[, 1]))
if (n.unmapped > 0) {
  # Kept as a warning: the file may legitimately list samples of other
  # tissues, but they end up with an NA id in the output.
  warning(n.unmapped, " covariate row(s) have no matching sample in tissue ",
          tissue, " and get an NA id", call. = FALSE)
}
# Put the header in as the first row, then transpose so that rows are
# covariates and columns are samples.
cov.out = rbind(cov.header, cov.info)
cov.out = t(cov.out)
write.table(cov.out, "cov.txt", row.names = FALSE, sep = "\t", quote = FALSE,
            col.names = FALSE)
}
# add PCs from SNPs
# VCF -> PLINK binary fileset -> genetic relationship matrix -> top 20 PCs
# (written with the prefix "pc").
plink2 --vcf SNP.FAANG.filter.vcf.gz --make-bed --out SNP
gcta64 --bfile SNP --autosome-num 18 --make-grm --make-grm-alg 1 --out kinship
gcta64 --grm kinship --pca 20 --out pc
# QTLtools for eQTL mapping
# NOTE(review): cov.pc.txt and tissueM.bed.gz are not produced by any command
# shown in this file; presumably cov.txt is merged with the PCs and the BED is
# compressed and indexed by hand -- confirm.
cov=cov.pc.txt
vcf=merged.FAANG.vcf.gz
bed=tissueM.bed.gz
n_chunks=10
# permutation pass
# The job script is truncated first so a rerun does not append duplicate jobs,
# and it ends with `wait` so the driver only exits once every chunk is done.
: > M_permutate.sh
for j in $(seq 1 "${n_chunks}")
do
echo "QTLtools cis --vcf ${vcf} --bed ${bed} --cov ${cov} \
--permute 1000 --normal --seed 123456 --silent --chunk $j ${n_chunks} --log M_permute_${j}.SV.log --out M_permute_${j}.SV.txt &" >> M_permutate.sh
done
echo "wait" >> M_permutate.sh
nohup bash M_permutate.sh &> M_permutate.log &
# Block until all permutation chunks have finished before merging them.
wait
# FDR correction
cat M_permute_*.SV.txt | gzip -c > M_permute.txt.gz
Rscript runFDR_cis.R M_permute.txt.gz 0.05 M_permute_all
# conditional pass
: > M_condition.sh
for j in $(seq 1 "${n_chunks}")
do
echo "QTLtools cis --vcf ${vcf} --bed ${bed} --cov ${cov} \
--mapping M_permute_all.thresholds.txt --silent --chunk $j ${n_chunks} --log M_conditional_${j}.log --out M_conditional_${j}.txt &" >> M_condition.sh
done
echo "wait" >> M_condition.sh
bash M_condition.sh &> M_condition.log &
wait
# Only the numbered chunk files are merged; a bare M_conditional_*.txt would
# also match M_conditional_full.txt itself on a rerun.
cat M_conditional_[0-9]*.txt > M_conditional_full.txt
# nominal pass
# A space is required before the line-continuation backslash; without it the
# covariate file name and --nominal are fused into a single argument.
# Runs in the foreground because the next step reads M_nominals.txt.
QTLtools cis --vcf ${vcf} --bed ${bed} --cov ${cov} \
  --nominal 1 --normal --silent --log M_nominals.log --out M_nominals.txt
# extract significant variants in nominal pass
python SummyNominalPass.py -n M_nominals.txt -t M_permute_all.thresholds.txt -p M_nominal