.Rhistory

# Join every observation to its predicted value with a dashed segment
p1 +
  geom_segment(
    aes(y = Sepal.Length, xend = Sepal.Width, yend = pred),
    linetype = 2, color = "Black", alpha = .5
  )
# One panel per model: observed values (faded) against predictions (black).
# This needs the long table produced by gather_predictions() with > 1 model.
ggplot(df, aes(x = Sepal.Width, color = Species)) +
  geom_point(aes(y = Sepal.Length), alpha = .25) +
  geom_point(aes(y = pred), color = "Black") +
  facet_wrap(~model) +
  theme_bw()
# which model is best?
# Using mtcars #
skim(mtcars)
# additive (wt + cyl) versus interaction (wt * cyl) model of mpg
carsmod1 <- lm(data = mtcars, formula = mpg ~ wt + factor(cyl))
carsmod2 <- lm(data = mtcars, formula = mpg ~ wt * factor(cyl))
# observed mpg plus a straight line through each model's predictions
p1 <- add_predictions(mtcars, carsmod1) %>%
  ggplot(aes(x = wt, color = factor(cyl))) +
  geom_point(aes(y = mpg)) +
  geom_smooth(aes(y = pred), method = "lm") +
  labs(title = "wt + cyl")
p2 <- add_predictions(mtcars, carsmod2) %>%
  ggplot(aes(x = wt, color = factor(cyl))) +
  geom_point(aes(y = mpg)) +
  geom_smooth(aes(y = pred), method = "lm") +
  labs(title = "wt * cyl")
# show the two plots one above the other
p1 / p2
# Cross-validation ####
# Console exploration: the partition below is drawn several times and no
# set.seed() call precedes it in this stretch, so each draw can differ.
iris
set <- caret::createDataPartition(iris$Sepal.Length, p=.5) # draw a random 50% of the row numbers
set
set <- caret::createDataPartition(iris$Sepal.Length, p=.5) # drawn again (replaces the previous draw)
set
iris
set
set <- set$Resample1 # the row numbers sit in the Resample1 element of the result (a list by default, per the caret docs); keep just that vector
set
train <- iris[set,] # rows selected by the partition
train
test <- iris[-set,] # every row that was NOT selected
# refit the formula of mod3 (defined elsewhere in this session) on the training rows
formula(mod3)
mod3_cv <- lm(data=train, formula = formula(mod3))
# predict the held-out rows with the model trained on the other rows
iristest <- add_predictions(test,mod3_cv)
iristest
set <- caret::createDataPartition(iris$Sepal.Length, p=.2) # re-split with 20% of rows selected
set <- set$Resample1 # keep just the row-number vector
train <- iris[set,] # the selected 20%
test <- iris[-set,] # the remaining rows (not a half any more)
set <- caret::createDataPartition(iris$Sepal.Length, p=.8) # re-split again with 80% of rows selected
set <- set$Resample1 # keep just the row-number vector
train <- iris[set,] # the selected 80%
test <- iris[-set,] # the remaining rows
# plot it
# NOTE(review): mod3_cv and iristest are not rebuilt after the two re-splits
# above, so this plot still shows the predictions from the p=.5 split.
ggplot(iristest,aes(x=Sepal.Width,color=Species)) +
geom_point(aes(y=Sepal.Length),alpha=.25) +
geom_point(aes(y=pred),shape=5)
set.seed(123) # fix the RNG so the split can be reproduced
set <- caret::createDataPartition(iris$Sepal.Length, p = .8) # select 80% of the rows
set <- set$Resample1 # keep only the row-number vector stored under Resample1
train <- iris[set, ] # rows the model is fit on
test <- iris[-set, ] # held-out rows (everything not selected)
# refit the mod3 formula using the training rows only
formula(mod3)
mod3_cv <- lm(data = train, formula = formula(mod3))
# predict the held-out rows with the model that never saw them
iristest <- add_predictions(test, mod3_cv)
# observed values (faded) against predictions (shape 5) for the held-out rows
ggplot(iristest, aes(x = Sepal.Width, color = Species)) +
  geom_point(aes(y = Sepal.Length), alpha = .25) +
  geom_point(aes(y = pred), shape = 5)
# in-sample MSE of the full-data model vs. held-out MSE of the refit model
testedresiduals <- (iristest$pred - iristest$Sepal.Length)
mod3mse # MSE of mod3 on the rows it was fit to
mean(testedresiduals^2) # MSE of mod3_cv on the held-out rows
# predictions of both models in one long table
df <- gather_predictions(iris, mod3, mod3_cv)
# one fitted line per model and species; linetype tells the models apart
ggplot(df, aes(x = Sepal.Width, color = Species)) +
  geom_point(aes(y = Sepal.Length), alpha = .2) +
  geom_smooth(aes(linetype = model, y = pred), method = "lm") +
  theme_bw()
# Typical modeling workflow: ####
# Packages and example data
library(tidyverse)
library(modelr)
library(GGally)
library(lindia)
library(skimr)
library(patchwork)
library(caret)
# Data
data("mtcars")
data("iris")
# Get to know your data ####
# pairwise plot matrix of every mtcars column
mtcars %>%
  ggpairs()
# same, restricted to the setosa rows of iris
iris %>%
  filter(Species == "setosa") %>%
  ggpairs()
# NOTE(review): a character vector passed as `mapping` appears to be treated by
# ggpairs() as the column selection - confirm against the GGally docs.
iris %>%
  ggpairs(mapping = c("Species", "Sepal.Length"))
# Build possible models ####
# lm() fits a linear model; many other model types exist.
# Three candidates of increasing complexity for Sepal.Length:
mod1 <- lm(data = iris, formula = Sepal.Length ~ Sepal.Width)           # width only
mod2 <- lm(data = iris, formula = Sepal.Length ~ Sepal.Width + Species) # plus species offset
mod3 <- lm(data = iris, formula = Sepal.Length ~ Sepal.Width * Species) # plus width-by-species interaction
# Look at model summaries ####
# things to read off each summary: coefficients, p-values, adjusted R-squared
summary(mod1)
summary(mod2)
summary(mod3)
# Look at model diagnostics ####
gg_diagnose(mod1)
gg_diagnose(mod2)
gg_diagnose(mod3)
# Compare models ####
# pairwise anova: is the more complex model different from the simpler one?
anova(mod1, mod2)
anova(mod1, mod3)
anova(mod2, mod3)
# which has better fit ?
# mean squared residual of each model on the data it was fit to
mod1mse <- mean(residuals(mod1)^2)
mod2mse <- mean(residuals(mod2)^2)
mod3mse <- mean(residuals(mod3)^2)
mod1mse
mod2mse
mod3mse
# Evaluate predictions ####
# single model: iris plus a prediction column from mod1
df_mod1 <- add_predictions(iris, mod1)
df_mod1
# several models at once, stacked tidy-style
df <- gather_predictions(iris, mod1, mod2, mod3)
df
skim(df)
names(df)
# compare predictions to reality
# observed Sepal.Length (coloured by species) with mod1's predictions in black
p1 <- ggplot(df_mod1, aes(x = Sepal.Width, color = Species)) +
  geom_point(aes(y = Sepal.Length), alpha = .5, size = 2) +
  geom_point(aes(y = pred), color = "Black") +
  theme_bw()
p1
# dashed segment from each observation to its prediction
p1 +
  geom_segment(
    aes(y = Sepal.Length, xend = Sepal.Width, yend = pred),
    linetype = 2, color = "Black", alpha = .5
  )
# One panel per model; possible because gather_predictions() was given
# more than one model.
ggplot(df, aes(x = Sepal.Width, color = Species)) +
  geom_point(aes(y = Sepal.Length), alpha = .25) +
  geom_point(aes(y = pred), color = "Black") +
  facet_wrap(~model) +
  theme_bw()
# which model is best?
# Using mtcars #
skim(mtcars)
# additive (wt + cyl) versus interaction (wt * cyl) model of mpg
carsmod1 <- lm(data = mtcars, formula = mpg ~ wt + factor(cyl))
carsmod2 <- lm(data = mtcars, formula = mpg ~ wt * factor(cyl))
# observed mpg plus a straight line through each model's predictions
p1 <- add_predictions(mtcars, carsmod1) %>%
  ggplot(aes(x = wt, color = factor(cyl))) +
  geom_point(aes(y = mpg)) +
  geom_smooth(aes(y = pred), method = "lm") +
  labs(title = "wt + cyl")
p2 <- add_predictions(mtcars, carsmod2) %>%
  ggplot(aes(x = wt, color = factor(cyl))) +
  geom_point(aes(y = mpg)) +
  geom_smooth(aes(y = pred), method = "lm") +
  labs(title = "wt * cyl")
# show the two plots one above the other
p1 / p2
# Cross-validation ####
# A model trained on the full data set can become "over-trained": we want it
# to work for the SYSTEM, not just for this particular data set.
set.seed(123) # fix the RNG so the split can be reproduced
set <- caret::createDataPartition(iris$Sepal.Length, p = .5) # select half of the rows
set <- set$Resample1 # keep only the row-number vector stored under Resample1
train <- iris[set, ] # rows the model is fit on
test <- iris[-set, ] # held-out rows (everything not selected)
# refit the formula of our best iris model (mod3) on the training rows only
formula(mod3)
mod3_cv <- lm(data = train, formula = formula(mod3))
# predict the held-out rows with the model that never saw them
iristest <- add_predictions(test, mod3_cv)
# observed values (faded) against predictions (shape 5) for the held-out rows
ggplot(iristest, aes(x = Sepal.Width, color = Species)) +
  geom_point(aes(y = Sepal.Length), alpha = .25) +
  geom_point(aes(y = pred), shape = 5)
# in-sample MSE of the full-data model vs. held-out MSE of the refit model
testedresiduals <- (iristest$pred - iristest$Sepal.Length)
mod3mse # MSE of mod3 on the rows it was fit to
mean(testedresiduals^2) # MSE of mod3_cv on the held-out rows
# Plot comparison of original and validated model
# predictions of both models in one long table
df <- gather_predictions(iris, mod3, mod3_cv)
# one fitted line per model and species; linetype tells the models apart
ggplot(df, aes(x = Sepal.Width, color = Species)) +
  geom_point(aes(y = Sepal.Length), alpha = .2) +
  geom_smooth(aes(linetype = model, y = pred), method = "lm") +
  theme_bw()
# If it still looks good, you can use the full model for future predictions ####
# Look around the working directory, find the FASTQ files and peek at the
# first one, then load the scores table.
getwd()
list.files()
list.files(recursive = TRUE)
# `pattern` is a regular expression: escape the dot so that only a literal
# ".fastq" matches (an unescaped "." matches any character).
filenames <- list.files(recursive = TRUE, pattern = "\\.fastq")
filenames
filenames[1]
# readLines() is the base function that reads a text file line by line
# (read.lines() does not exist; readline() prompts for keyboard input).
# Skip the read when nothing matched, since filenames[1] would then be NA.
if (length(filenames) > 0) {
  readLines(filenames[1])
}
library(tidyverse)
# columns of the `df` left over from earlier in the session
names(df)
# replace it with the scores table and check the new column names
df <- read.csv("./Data/1620_scores.csv")
names(df)
# Intro to linear and binomial regressions with single categorical predictor variables
#Packages
library(tidyverse)
library(fitdistrplus)
library(modelr)
library(gridExtra)
# Data
data("mtcars")
df <- mtcars
# quick peek
glimpse(df)
# plot mpg vs disp
p <- ggplot(df, aes(x = disp, y = mpg)) +
  geom_point()
p
p + geom_smooth(method = "lm", se = FALSE)
# Make linear model of relationship between disp and mpg
mod1 <- lm(mpg ~ disp, data = df)
# gather info from model: fitted values and residuals stored as columns of df
df$predicted <- predict(mod1)
df$residuals <- residuals(mod1)
p
# plot from above but with model predicted values
# `p` was created before the `predicted` column was added to df, so the
# updated df is handed to the layer explicitly; columns are then referenced
# by bare name instead of df$... inside aes().
p2 <- p +
  geom_smooth(method = "lm", se = FALSE, alpha = .05, color = "lightgrey") +
  geom_point(data = df, aes(y = predicted), color = "Red")
p2
# Show residuals: dashed segment from each observation to its fitted value.
# annotate() draws each label once; geom_text() with a constant label would
# draw one copy per data row on top of each other.
p3 <- p2 +
  geom_segment(data = df, aes(xend = disp, yend = predicted), linetype = 2) +
  theme_bw() +
  annotate(
    "text", x = 300, y = 30,
    label = paste0("Sum of Squares = ", signif(sum(residuals(mod1)^2), 4))
  ) +
  annotate(
    "text", x = 300, y = 27.5,
    label = paste0("Mean Sq. Deviance = ", signif(mean(residuals(mod1)^2), 4))
  )
p3
# regression for PREDICTION ####
# Data frame of new values; the column name must match the predictor used in
# the model we want to predict from (disp). Values run 50, 100, ..., 600.
newdata <- data.frame(disp = seq(50, 600, by = 50))
# use the new disp values to get predictions from mod1 (added as column `pred`)
newpreds <- add_predictions(newdata, model = mod1)
# Overlay the new predictions. The layer gets its own data by name and aes()
# refers to its columns directly rather than through newpreds$...
p3 +
  geom_point(
    data = newpreds, mapping = aes(x = disp, y = pred),
    color = "Blue", size = 3
  )
# Cross-validate your model predictions based on your own data ####
# Pick a random half of the rows (no seed is set here, so the split changes
# from run to run). floor() keeps the sample size a whole number.
modrows <- sample(nrow(df), size = floor(nrow(df) / 2), replace = FALSE)
# use those random rows: half to build the model, other half to test it
mod.df <- df[modrows, ]
cross.df <- df[-modrows, ]
# model based on the training half
mod.cross <- lm(mpg ~ disp, data = mod.df)
cross.df <- add_predictions(cross.df, model = mod.cross, var = "cross.predictions")
mean(abs(cross.df$mpg - cross.df$cross.predictions)) # mean absolute error on the held-out half
mean(abs(df$mpg - df$predicted)) # same measure for the model built on the whole data set
p.cross1 <- ggplot(cross.df, aes(x = disp, y = mpg)) +
  geom_point() +
  geom_segment(aes(xend = disp, yend = cross.predictions)) +
  geom_point(aes(y = cross.predictions), color = "Red") +
  ggtitle("lm: disp")
p.cross1
p.cross1 <- p.cross1 + geom_smooth(method = "lm", se = FALSE)
# Add another predictor variable
# Fit on the training half (mod.df), like mod.cross above. Fitting on cross.df
# would score the model on the very rows it was trained on.
mod.aov <- aov(mpg ~ disp + wt, data = mod.df)
summary(mod.aov)
cross.df <- add_predictions(cross.df, model = mod.aov, var = "aov.cross.predictions")
mean(abs(cross.df$mpg - cross.df$aov.cross.predictions)) # mean absolute error on the held-out half
mean(abs(df$mpg - df$predicted)) # same measure for the model built on the whole data set
p.cross2 <- ggplot(cross.df, aes(x = disp, y = mpg)) +
  geom_point() +
  geom_segment(aes(xend = disp, yend = aov.cross.predictions)) +
  geom_point(aes(y = aov.cross.predictions), color = "Red") +
  ggtitle("aov: disp + wt")
p.cross2 <- p.cross2 + geom_smooth(method = "lm", se = FALSE)
# both validation plots, one above the other
grid.arrange(p.cross1, p.cross2)
################################################################
# Logistic regression
# Binary dependent variable
# vs is a variable in the mtcars dataframe: engine shape (0 = V-shaped, 1 = straight)
# Can we predict whether a car has a V or a straight engine from other data?
ggplot(df, aes(x = hp, y = vs)) + geom_point()
# A straight-line fit to a 0/1 response, shown to make the point that it does
# not suit the data. annotate() draws the note once instead of once per row.
ggplot(df, aes(x = hp, y = vs)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate(
    "text", x = 250, y = .5,
    label = "WTF ?!\nLinear model no good for binary response."
  )
# Try "logistic" regression
mod2 <- glm(vs ~ hp, data = df, family = "binomial")
# predicted probability of vs = 1 and the residual on that same scale
df$binom.pred <- predict(mod2, type = "response")
df$binom.resid <- residuals(mod2, type = "response")
summary(mod2)
# coefficients(mod2) are on the log-odds scale (the hp slope is the change in
# the log-odds of vs = 1, a straight engine, per 1-unit increase in hp).
# exp() turns them into odds ratios: the factor by which the odds are multiplied.
exp(coefficients(mod2))
# observed vs (black), predicted probability (blue squares), and in red the
# cars whose observed vs differs from the rounded prediction
ggplot(df, aes(x = hp, y = vs)) +
  geom_segment(aes(xend = hp, yend = binom.pred), alpha = .5) +
  geom_point() +
  geom_point(aes(y = binom.pred), shape = 22, color = "Blue") +
  geom_point(
    data = df %>% filter(vs != round(binom.pred)),
    color = "red", size = 2
  )
# Packages
library(tidyverse)
library(modelr)
library(GGally)
# data
# The built-in dataset is called "Titanic" (capital T); names are case-sensitive.
data("Titanic")
?data
?Titanic
glimpse(Titanic)
# Titanic is a contingency table; turn it into a data frame of counts
# (one row per Class/Sex/Age/Survived combination) before plotting.
df <- as.data.frame(Titanic)
GGally::ggpairs(df)
names(df)
ggplot(df, aes(x = Sex, y = Survived)) +
  geom_col()
# Convert to T/F
df$Survived
# Survived is a factor with levels "No"/"Yes"; TRUE/FALSE are not among its
# levels, so assigning them element-wise would produce NA. Comparing against
# the label gives a proper logical column in one step.
df$Survived <- df$Survived == "Yes"
df$Survived
# Packages
library(tidyverse)
library(modelr)
library(GGally)
# data
data("Titanic")
?Titanic
glimpse(Titanic)
# convert to data frame
df <- as.data.frame(Titanic)
# quick look at stuff
GGally::ggpairs(df)
names(df)
# Survivorship
ggplot(df, aes(x = Sex, y = Survived)) +
  geom_col()
# Convert to T/F: TRUE where the label is "Yes", FALSE where it is "No"
df$Survived <- df$Survived == "Yes"
df$Survived
df
# total number of people counted in the table
sum(df$Freq)
# One row per Class/Sex/Age group, with the counts spread into two columns
# named after the Survived values: `FALSE` and `TRUE`.
df2 <- pivot_wider(df, names_from = Survived, values_from = Freq)
df3 <- df2
# Survivors in each group as a share of everyone on board. The column name
# TRUE is a reserved word, so it must be back-quoted to be read as a column
# (bare TRUE is the logical constant, and df2$TRUE does not parse).
mutate(df2, Freq = `TRUE` / sum(df$Freq))
df2
# Packages
library(tidyverse)
library(modelr)
library(GGally)
# data
data("Titanic")
?Titanic
glimpse(Titanic)
# convert to data frame
df <- as.data.frame(Titanic)
# quick look at stuff
GGally::ggpairs(df)
names(df)
# Survivorship
ggplot(df, aes(x = Sex, y = Survived)) +
  geom_col()
# Keep Survived as the text labels "No"/"Yes": they become the column names
# of the wide table below, and the `Yes` column is used in the next step.
df$Survived <- as.character(df$Survived)
df$Survived
# one row per Class/Sex/Age group, counts in columns `No` and `Yes`
df2 <- pivot_wider(df, names_from = Survived, values_from = Freq)
df2
# Freq: survivors in each group as a share of everyone on board
# NOTE(review): this is not the within-group survival rate; a binomial fit
# would more usually model cbind(Yes, No) - confirm the intended response.
df2 <- mutate(df2, Freq = Yes / sum(df$Freq))
# logistic (binomial) model of that share on class, sex and age
mod1 <- glm(Freq ~ Class + Sex + Age, data = df2, family = "binomial")
summary(mod1)
# predictions on the response (0-1) scale, added as column `pred`
df3 <- add_predictions(df2, mod1, type = "response")
df3
ggplot(df3, aes(x = Class, y = pred)) + geom_col()
# Graduate-school admissions: logistic regression of admit on gre, gpa, rank
dat <- read.csv("./Data/GradSchool_Admissions.csv", stringsAsFactors = FALSE)
library(tidyverse)
library(GGally)
library(modelr)
ggpairs(dat)
library(tidyverse)
library(modelr)
library(GGally)
dat <- read.csv("./Data/GradSchool_Admissions.csv", stringsAsFactors = FALSE)
View(dat)
ggpairs(dat)
names(dat)
# Recode admit as logical.
# NOTE(review): presumably admit is read in as 0/1 numbers, in which case the
# two element-wise assignments keep it numeric and as.logical() does the
# actual conversion - confirm against the CSV.
dat$admit[dat$admit == 0]
dat$admit[dat$admit == 0] <- FALSE
dat$admit[dat$admit == 1] <- TRUE
dat
dat$admit
as.logical(dat$admit)
dat$admit <- as.logical(dat$admit)
# rank is a category, not a quantity
dat$rank <- factor(dat$rank)
# first try: glm() with its default family
mod1 <- glm(formula = admit ~ gre + gpa + rank, data = dat)
summary(mod1)
# second try: binomial family for the TRUE/FALSE response (replaces mod1)
mod1 <- glm(formula = admit ~ gre + gpa + rank, data = dat, family = "binomial")
summary(mod1)
# predictions on the response scale, stored in column `pred`
add_predictions(dat, mod1, type = "response")
dat2 <- add_predictions(dat, mod1, type = "response")
# predictions against each numeric predictor, with and without rank colouring
ggplot(dat2, aes(x = gpa, y = pred)) +
  geom_point()
ggplot(dat2, aes(x = gpa, y = pred, color = rank)) +
  geom_point()
ggplot(dat2, aes(x = gre, y = pred, color = rank)) +
  geom_point()
ggplot(dat2, aes(x = gpa, y = pred, color = rank)) +
  geom_smooth(method = "lm")
ggplot(dat2, aes(x = gpa, y = pred, color = rank)) +
  geom_point()
# the raw TRUE/FALSE outcome against gpa
ggplot(dat, aes(x = gpa, y = admit)) +
  geom_point()
ggplot(dat, aes(x = gpa, y = admit)) +
  geom_point()
names(dat)
# hypothetical applicants, all with rank "4"
newdata <- data.frame(
  gre = c(400, 500, 600, 700),
  gpa = c(3.5, 2.5, 4, 3.75),
  rank = rep("4", 4)
)
newdata
add_predictions(newdata, mod1, type = "response")
ggplot(dat, aes(x = gre, y = admit)) +
  geom_point()
# same applicants plus one more with rank "1"
newdata <- data.frame(
  gre = c(400, 500, 600, 700, 800),
  gpa = c(3.5, 2.5, 4, 3.75, 4),
  rank = c(rep("4", 4), "1")
)
add_predictions(newdata, mod1, type = "response")
library(tidyverse)
# palette definitions; presumably this is where pal.discrete comes from
source("~/Desktop/GIT_REPOSITORIES/Projects/Chagos/R/palettes.R")
# preview the whole palette, then the two colours used for the status fill
colorblindr::palette_plot(pal.discrete)
colorblindr::palette_plot(pal.discrete[c(6, 14)])
dat <- read_csv("~/Desktop/brooklyn_thesis_data.csv")
names(dat)
unique(dat$Location)
# fix the left-to-right order of the locations on the x axis
dat$Location <- factor(
  dat$Location,
  levels = c(
    "B sink", "1st WS", "1st", "ELE", "NDF", "SB 074",
    "LS", "KEY", "SH", "BNOD", "SB 004b"
  )
)
# number of rows per location, split by Result
ggplot(dat, aes(x = Location, fill = Result)) +
  geom_bar() +
  theme_minimal() +
  scale_fill_manual(values = pal.discrete[c(14, 6)]) +
  labs(y = "Count", fill = "Status")
ggsave("~/Desktop/Nelson_Plot.jpg", dpi = 300, height = 6, width = 8)
# Result-by-Antibiotic counts, non-susceptible rows only, most frequent first.
# The hand-written order a few lines down replaces this value.
mostresistance <- as.data.frame(table(dat$Result, dat$Antibiotic)) %>%
  filter(Var1 != "Susceptible") %>%
  arrange(desc(Freq)) %>%
  select(Var2)
unique(dat$Antibiotic)
# explicit antibiotic order used for the x axis
mostresistance <- c(
  "Streptomycin", "Ampicillin", "Erythromycin", "Tetracycline",
  "Cefotaxime", "Trimethoprim", "Meropenem", "Amikacin",
  "Ceftaroline", "Ceftazidime", "Chloramphenicol"
)
dat$Antibiotic <- factor(dat$Antibiotic, levels = mostresistance)
# number of rows per antibiotic, split by Result; labels tilted to fit
ggplot(dat, aes(x = Antibiotic, fill = Result)) +
  geom_bar() +
  theme_minimal() +
  scale_fill_manual(values = pal.discrete[c(14, 6)]) +
  labs(y = "Count", fill = "Status") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggsave("~/Desktop/Nelson_Plot_2.jpg", dpi = 300, height = 6, width = 8)
# additional info
# Bonus: the antibiotic's name on rows whose Result is "Resistant",
# and "None" on every other row
dat$Bonus <- "Susceptible"
res.abs <- as.character(dat$Antibiotic[which(dat$Result == "Resistant")])
dat$Bonus[which(dat$Result == "Resistant")] <- res.abs
unique(dat$Bonus)
dat$Bonus[dat$Bonus == "Susceptible"] <- "None"
# legend / stacking order, with "None" last
dat$Bonus <- factor(
  dat$Bonus,
  levels = c(
    "Tetracycline", "Erythromycin", "Streptomycin",
    "Ampicillin", "Cefotaxime", "Meropenem",
    "Trimethoprim", "None"
  )
)
# number of rows per location, coloured by Bonus
ggplot(dat, aes(x = Location, fill = Bonus)) +
  geom_bar() +
  theme_minimal() +
  scale_fill_manual(values = pal.discrete[c(1:5, 7, 11, 29)]) +
  labs(y = "Count", fill = "Antibiotic")
ggsave("~/Desktop/Nelson_Plot_3.jpg", height = 6, width = 8, dpi = 300)
# Final versions of the two figures written by this part of the session.
# Dropped drafts: a fill = Antibiotic plot whose manual scale supplied 8
# colours for the 11 antibiotic levels, a geom_col() call with no y aesthetic,
# un-filled facet variants, and repeated saves of Nelson_Plot_4.jpg at other
# widths (each overwritten by the next).

# Rows per location, coloured by the antibiotic resisted ("None" otherwise);
# x labels tilted so the location names do not overlap.
ggplot(dat, aes(x = Location, fill = Bonus)) +
  geom_bar() +
  theme_minimal() +
  scale_fill_manual(values = pal.discrete[c(1:5, 7, 11, 29)]) +
  labs(y = "Count", fill = "Antibiotic") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggsave("~/Desktop/Nelson_Plot_3.jpg", height = 6, width = 8, dpi = 300)

# Rows per location split by Result, one panel per antibiotic. The fill shows
# Result, so the legend is titled "Status" like the other Result-filled plots
# (it was mislabelled "Antibiotic").
ggplot(dat, aes(x = Location, fill = Result)) +
  geom_bar() +
  theme_minimal() +
  scale_fill_manual(values = pal.discrete[c(14, 6)]) +
  facet_wrap(~Antibiotic) +
  labs(y = "Count", fill = "Status") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggsave("~/Desktop/Nelson_Plot_4.jpg", dpi = 300, height = 6, width = 16)