-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsim_Y.r
145 lines (103 loc) · 3.79 KB
/
sim_Y.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
### Function to simulate traits with causative markers for GWAS
# Genomic data are needed. A sample of 22k random SNPs from the Arabidopsis 1001G Project is available as an example for Arabidopsis traits.
# A file with the information on all 2029 accessions for which genotype data exist is also provided (lat_long_2029.csv). This information has been downloaded from the AraPheno database.
#A<-read.csv('~/git/GWAS/data/lat_long_2029.csv')
# If you use your own data, ensure that the ids are in a column called 'accession_id'.
#load('~/git/GWAS/data/X_Mix.rda')
# rownames(X) need to be the ids of the accessions.
# For models with multiple causative markers, it is recommended to have them ordered by the amount of variance explained (ve).
# The simulation creates random but reproducible (seed) phenotypes.
# To simulate only in specific accessions, provide an integer vector of accession ids in sp_acc (its length needs to be >1).
# To simulate distinct markers, the script can be modified manually.
# Simulate phenotypes with known causative SNPs.
#
# Every replicate is built in three steps:
#   1. a polygenic background: bk randomly chosen SNPs, each with an effect
#      drawn from N(0, sd = 0.1);
#   2. random noise whose variance is chosen so that the background accounts
#      for a fraction h2 of the variance (before causative SNPs are added);
#   3. no_snps causative SNPs, added one after another and scaled by ve.
#
# Arguments:
#   n        number of phenotypes to simulate (>= 1).
#   acc      data.frame with a column 'accession_id'; only read when
#            accessions are sampled at random.
#   sp_acc   vector of accession ids to simulate in. Used only if its length
#            is > 1; no_acc is then set to length(sp_acc).
#   no_acc   number of accessions to sample when sp_acc is not used.
#   fix_acc  TRUE: all replicates share the accessions of the first one.
#            FALSE: replicates 2..n each draw no_acc new accessions from
#            acc$accession_id (also when sp_acc was given).
#   SNPs     genotype matrix; rownames are accession ids, colnames SNP names.
#   no_snps  number of causative SNPs per phenotype (> 0).
#   ve       variance explained per causative SNP; length must equal no_snps,
#            each value in [0, 1).
#   mac      SNPs are kept if their column sum is > mac and < no_acc - mac.
#   h2       heritability of the polygenic background, in (0, 1].
#   seed     base seed; replicate u is simulated after set.seed(seed + u).
#   bk       number of background SNPs.
#
# Returns list(Y = , Caus = ), both lists of length n: Y[[u]] is a data.frame
# with columns ecot_id and value, Caus[[u]] holds the column names of the
# causative SNPs of replicate u, in the order of ve.
#
# Side effect: calls set.seed(), so the global RNG state is changed.
sim_Y <- function(n = 100, acc = A, sp_acc = 0, no_acc = 200, fix_acc = TRUE,
                  SNPs = X, no_snps = 1, ve = .1, mac = 5, h2 = 0.7,
                  seed = 42, bk = 1000) {
  stopifnot(n >= 1)
  stopifnot(no_snps > 0)
  stopifnot(length(ve) == no_snps)
  stopifnot(all(ve >= 0), all(ve < 1))
  stopifnot(h2 > 0, h2 <= 1)
  stopifnot(bk >= 1)

  # Genotypes of the accessions in `ids`, restricted to SNPs that pass the
  # allele-count filter. The filter bounds use the requested number of
  # accessions (n_ids), as the original implementation did.
  filter_snps <- function(ids, n_ids) {
    geno <- SNPs[rownames(SNPs) %in% ids, , drop = FALSE]
    if (nrow(geno) < 2) {
      stop("fewer than two of the selected accessions are present in ",
           "rownames(SNPs)", call. = FALSE)
    }
    af <- colSums(geno)
    geno[, which(af > mac & af < (n_ids - mac)), drop = FALSE]
  }

  # Simulate one phenotype from the filtered genotype matrix. The order of
  # the random draws (causative SNPs, background SNPs, effects, noise) is
  # fixed so that results stay reproducible for a given seed.
  simulate_one <- function(geno_ok) {
    if (ncol(geno_ok) < no_snps + 1) {
      stop("not enough SNPs pass the mac filter to draw the causative markers",
           call. = FALSE)
    }
    # One SNP more than needed is drawn (and kept out of the background);
    # only the first no_snps of them are used as causative markers.
    caus <- geno_ok[, sample.int(ncol(geno_ok), no_snps + 1), drop = FALSE]

    # generating polygenic background
    rest <- geno_ok[, !colnames(geno_ok) %in% colnames(caus), drop = FALSE]
    if (ncol(rest) < bk) {
      stop("not enough SNPs pass the mac filter to draw ", bk,
           " background markers", call. = FALSE)
    }
    back <- rest[, sample.int(ncol(rest), bk), drop = FALSE]
    betas <- rnorm(bk, mean = 0, sd = 0.1)
    background <- back %*% betas

    # set heritability: var_g / (var_g + var_e) = h2  =>  var_e = var_g/h2 - var_g
    var_g <- var(as.vector(background))
    noise <- rnorm(nrow(back), 0, sqrt(var_g / h2 - var_g))
    pheno <- data.frame(ecot_id = as.integer(rownames(back)),
                        value = background + noise)

    # Add the causative SNPs one by one. Each effect is scaled against the
    # variance of the phenotype built so far, so that the SNP contributes
    # about ve[t] of the variance afterwards (covariance is ignored).
    for (t in seq_along(ve)) {
      beta <- sqrt((ve[t] / (1 - ve[t])) * (var(pheno[, 2]) / var(caus[, t])))
      pheno$value <- pheno$value + beta * caus[, t]
    }
    list(sim = pheno, caus = colnames(caus)[seq_len(no_snps)])
  }

  # Accessions of the first replicate are chosen under the base seed.
  set.seed(seed)
  if (length(sp_acc) > 1) {
    a <- sp_acc
    no_acc <- length(a)
  } else {
    a <- sample(acc$accession_id, no_acc)
  }
  X_ok <- filter_snps(a, no_acc)

  Sim <- vector("list", n)
  Caus <- vector("list", n)
  for (u in seq_len(n)) {
    set.seed(seed + u)
    if (!fix_acc && u > 1) {
      # new accessions for this replicate, taken from the supplied arguments
      a <- sample(acc$accession_id, no_acc)
      X_ok <- filter_snps(a, no_acc)
    }
    rep_u <- simulate_one(X_ok)
    Sim[[u]] <- rep_u$sim
    Caus[[u]] <- rep_u$caus
  }
  return(list(Y = Sim, Caus = Caus))
}