Skip to content

Commit

Permalink
INFRA: Verification
Browse files Browse the repository at this point in the history
Statistical tests against external simulators
  • Loading branch information
daikitag committed Mar 6, 2024
1 parent 255539a commit d464469
Show file tree
Hide file tree
Showing 6 changed files with 917 additions and 258 deletions.
99 changes: 0 additions & 99 deletions data/simulate_AlphaSimR.R

This file was deleted.

106 changes: 106 additions & 0 deletions scripts/simulate_AlphaSimR.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
if (!require("AlphaSimR")) install.packages("AlphaSimR", repos='http://cran.us.r-project.org')
library(AlphaSimR)
if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org')
library(reticulate)

# This code is used from verification.py to simulate quantitative traits
# by using AlphaSimR.
#
# The basic simulation step is the following:
# 1. Use the tskit Python package through the R package tskit and load the tree
# sequence data as a founder population in AlphaSimR. The codes of this step are
# largely adapted from
# https://github.com/ ynorr/AlphaSimR_Examples/blob/master/misc/msprime.R
# 2. Simulate quantitative traits of the founder population in AlphaSimR

# The commandline input has 8 elements
# [num_causal, temporary_directory_name,
# corA, num_trait, h2, h2_2, num_rep, random_seed]

myArgs <- commandArgs(trailingOnly = TRUE)
# Convert to numerics
num_causal <- as.numeric(myArgs[1])
directory_name <- myArgs[2]
corA <- as.numeric(myArgs[3])
num_trait <- as.numeric(myArgs[4])
h2 <- as.numeric(myArgs[5])
h2_2 <- as.numeric(myArgs[6])
num_rep <- as.numeric(myArgs[7])
random_seed <- as.numeric(myArgs[8])

set.seed(random_seed)

tskit <- import("tskit")

tree_filename <- paste0(directory_name,"/tree_seq.tree")
ts <- tskit$load(tree_filename)

sites <- ts$tables$sites$asdict()
pos <- sites$position * 1e-8 # Convert to Morgans
pos <- pos - pos[1] # Set first position to zero

# Extract haplotypes
haplo <- t(ts$genotype_matrix())

# Create an AlphaSimR founder population
founderPop <- newMapPop(genMap=list(pos), haplotypes=list(haplo))

num_ind <- nrow(haplo) / 2

if (num_trait == 1){
mean <- 0
var <- 1
corA <- NULL
H2 <- h2
} else if (num_trait == 2){
mean <- c(0,0)
var <- c(1,1)
corA <- matrix(c(1,corA,corA,1),nrow=2,ncol=2)
H2 <- c(h2,h2_2)
}

phenotype_result <- c()
trait_result <- c()

for (i in 1:num_rep) {
SP <- SimParam$
new(founderPop)$
addTraitA(
nQtlPerChr=num_causal,
mean=mean,
var=var,
corA=corA
)$
setVarE(H2=H2)

individuals <- newPop(founderPop)

trait_df <- c()
phenotype_df <- c()

for (trait_id in 1:num_trait){
qtl_site <- SP$traits[[trait_id]]@lociLoc - 1
effect_size <- SP$traits[[trait_id]]@addEff
trait_id_df <- data.frame(
effect_size = effect_size,
site_id = qtl_site,
trait_id = rep(trait_id-1, length(effect_size))
)
trait_df <- rbind(trait_df, trait_id_df)
phenotype <- individuals@pheno[,trait_id]
phenotype_id_df <- data.frame(
phenotype=phenotype,
individual_id = 0:(num_ind-1),
trait_id = rep(trait_id-1, num_ind)
)
phenotype_df <- rbind(phenotype_df, phenotype_id_df)
}
phenotype_result <- rbind(phenotype_result, phenotype_df)
trait_result <- rbind(trait_result, trait_df)
}

phenotype_filename <- paste0(directory_name,"/phenotype_alphasimr.csv")
write.csv(phenotype_result, phenotype_filename, row.names=FALSE)

trait_filename <- paste0(directory_name,"/trait_alphasimr.csv")
write.csv(trait_result, trait_filename, row.names=FALSE)
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org')
library(simplePHENOTYPES)
if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org')
library(reticulate)

# This code is used from verification.py to simulate quantitative traits
# by using simplePHENOTYPES.

# This code loads the vcf file and simulates quantitative traits

# The commandline input has 7 elements
# The commandline input has 6 elements
# [num_causal, num_trait, add_effect, add_effect_2, directory_name,
# vcf_filename, random_seed]
# random_seed]
myArgs <- commandArgs(trailingOnly = TRUE)

num_causal <- as.numeric(myArgs[1])
num_trait <- as.numeric(myArgs[2])
add_effect <- as.numeric(myArgs[3])
add_effect_2 <- as.numeric(myArgs[4])
directory_name <- myArgs[5]
vcf_filename <- myArgs[6]
random_seed <- as.numeric(myArgs[7])
random_seed <- as.numeric(myArgs[6])

if (num_trait == 1){
effect <- add_effect
Expand All @@ -32,7 +29,7 @@ if (num_trait == 1){
}

suppressMessages(create_phenotypes(
geno_file = paste0(directory_name, "/", vcf_filename, ".vcf"),
geno_file = paste0(directory_name, "/tree_seq.vcf"),
add_QTN_num = num_causal,
add_effect = effect,
rep = 1,
Expand Down
70 changes: 70 additions & 0 deletions scripts/simulate_simplePHENOTYPES_qqplot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org')
library(simplePHENOTYPES)

# This code is used from verification.py to simulate quantitative traits
# by using simplePHENOTYPES.

# This code loads the vcf file that is generated in `verification.py`
# and uses the effect size from a normal distribution to simulate
# additive traits.

# The commandline input has 7 elements
# [num_causal, h2, directory_name,
# num_rep, mean, var, random_seed]
myArgs <- commandArgs(trailingOnly = TRUE)

num_causal <- as.numeric(myArgs[1])
h2 <- as.numeric(myArgs[2])
directory_name <- myArgs[3]
num_rep <- myArgs[4]
mean <- as.numeric(myArgs[5])
var <- as.numeric(myArgs[6])
random_seed <- as.numeric(myArgs[7])

set.seed(random_seed)

sd <- sqrt(var)

# Function to simulate phenotypes from simplePHENOTYPES
# The effect sizes are simulated from a normal distribution,
# as the geometric series is the only effect size distribution
# supported in simplePHENOTYPES.
simulate_simplePHENOTYPE <- function(
num_causal, random_seed
) {
effect_size <- list(rnorm(n=num_causal, mean=mean, sd=sd))
phenotypes <- suppressMessages(create_phenotypes(
geno_file = paste0(directory_name, "/tree_seq.vcf"),
add_QTN_num = num_causal,
add_effect = effect_size,
rep = 1,
h2 = h2,
model = "A",
seed = random_seed,
vary_QTN = FALSE,
to_r = TRUE,
sim_method = "custom",
quiet = TRUE,
home_dir = directory_name,
verbose = FALSE,
mean = 0
))
# The mean is centered at 0 from simplePHENOTYPES simulation
# so we will divide it by the standard deviation to normalise
# the data
phenotypes[,2] <- phenotypes[,2] / sd(phenotypes[,2])
names(phenotypes)[1:2] <- c("individual_id", "phenotype")
return(phenotypes)
}

phenotype_result <- c()

for (i in 1:num_rep) {
simulated_result <- simulate_simplePHENOTYPE(
num_causal=num_causal, random_seed=random_seed+i
)
phenotype_result <- rbind(phenotype_result, simulated_result)
}

filename = paste0(directory_name, "/simplePHENOTYPES.csv")
write.csv(phenotype_result, filename, row.names=FALSE)
10 changes: 10 additions & 0 deletions scripts/simulation_codes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Simulation Codes

This
This directory is used to store the R simulation codes.

R simulation codes:

- simulate_AlphaSimR.R: This R code uses AlphaSimR to simulate quantitative traits based on a simulated tree sequence data.
- simulate_simplePHENOTYPES_exact.R: This R code uses simplePHENOTYPES to simulate quantitative traits based on a VCF file. This is used for exact tests.
- simulate_simplePHENOTYPES_qqplot.R: This R code uses simplePHENOTYPES to simulate quantitative traits. This is used in qqplot construction.
Loading

0 comments on commit d464469

Please sign in to comment.