4_basic_analysis.Rmd

---
title: "4_basic_analysis"
author: "Bill Chung"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(gridExtra)
library(gapminder)
library(gapminder)
library(dplyr)
library(patchwork)
```

## Basic analysis

- See chapter 7 of the book by Dr. Kwon.

```{r}
# Reset the base-graphics layout to a single panel.
par(mfrow = c(1, 1))
# Work with mpg as a tibble. as_tibble() replaces tbl_df(), which has been
# deprecated in dplyr and emits a deprecation warning.
mpg <- as_tibble(mpg)
mpg

## Quantitative data: numeric summaries of highway mileage (hwy)
summary(mpg$hwy)
mean(mpg$hwy)
median(mpg$hwy)
range(mpg$hwy)
quantile(mpg$hwy)
```


# t-test

```{r}
# 7.3.1 One-sample t-test on highway mileage
hwy <- mpg$hwy
n <- length(hwy)  # sample size
mu0 <- 22.9       # mean under the null hypothesis

# One-tailed test: the alternative is that the true mean exceeds mu0.
t.test(hwy, mu = mu0, alternative = "greater")

# Two-tailed test (the default `alternative`) against the same mu0.
# `mu` must be passed explicitly: t.test() defaults to mu = 0, which would
# silently test a different hypothesis than the one above.
# Unless you have a pretty good reason, avoid one-tailed tests.
t.test(hwy, mu = mu0)
```

# Categorical variable

```{r}
# Simulate n Bernoulli(p) draws and label them as a two-level factor.
set.seed(1606)
n <- 100
p <- 0.5
x <- rbinom(n, 1, p)
x <- factor(x, levels = c(0, 1), labels = c("no", "yes"))
x

# Counts, proportions, and a bar chart of the simulated outcomes.
x_counts <- table(x)
x_counts

prop.table(x_counts)

barplot(x_counts)

# Exact binomial test of H0: P(yes) = 0.5 on the simulated sample.
binom.test(
  x = sum(x == "yes"), n = length(x), p = 0.5,
  alternative = "two.sided"
)

# Same test for 5400 successes out of 10000 trials (default p = 0.5).
binom.test(x = 5400, n = 10000)

# Margin of error 1.96 * sqrt(1 / (4 * n)) for several sample sizes.
# Note that `n` is reused here as a vector of sample sizes.
n <- c(100, 1000, 2000, 10000, 1e6)
moe_table <- data.frame(n = n, moe = round(1.96 * sqrt(1 / (4 * n)), 4))
moe_table

# The same margin of error as a curve over sample size (log-scaled x axis).
curve(1.96 * sqrt(1 / (4 * x)), 10, 10000, log = "x")
grid()

```


```{r}
# 7.6. Analysis of quantitative X and quantitative Y

# Jittered scatter plot of city vs. highway mileage with a linear fit overlaid.
mpg %>%
  ggplot(aes(x = cty, y = hwy)) +
  geom_jitter() +
  geom_smooth(method = "lm")

# Correlation with the default method, written two equivalent ways.
cor(mpg$cty, mpg$hwy)
with(mpg, cor(cty, hwy))

# Rank-based alternatives.
with(mpg, cor(cty, hwy, method = "kendall"))
with(mpg, cor(cty, hwy, method = "spearman"))
```

```{r}
# 7.6.3. Fitting a linear regression model

# Regress highway mileage on city mileage; the outer parentheses print the
# fitted object in addition to assigning it.
(hwy_lm <- lm(hwy ~ cty, data=mpg))
summary(hwy_lm)

# Fitted values and residuals for the rows used in the fit, then predictions
# at three new city-mileage values.
predict(hwy_lm)
resid(hwy_lm)
predict(hwy_lm, newdata = data.frame(cty = c(10, 20, 30)))

# Diagnostic plots on a 2 x 2 grid. par() returns the previous settings, which
# are restored afterwards so the grid layout does not leak into later plots.
opar <- par(mfrow = c(2, 2), oma = c(0, 0, 1.1, 0))
plot(hwy_lm, las = 1)   # Residuals, Fitted, ...
par(opar)


```


```{r}
# 7.6.6. Robust linear regression
# Compares a resistant regression fit (MASS::lqs) with an ordinary
# least-squares fit (lm) of stack.loss on all other columns of stackloss.
# NOTE(review): attaching MASS here, after dplyr, masks dplyr::select() for
# the rest of the document.
library(MASS)
set.seed(123) # make reproducible
lqs(stack.loss ~ ., data = stackloss) # robust fit
lm(stack.loss ~ ., data = stackloss) # ordinary linear model
```


```{r}
# 7.6.7. Nonlinear / nonparametric methods: smoothing and LOESS

# Base-graphics scatter plot of highway mileage against displacement.
plot(hwy ~ displ, data = mpg)

# Fit a LOESS smoother, then print the fit and its summary.
mpg_lo <- loess(hwy ~ displ, data = mpg)
mpg_lo
summary(mpg_lo)

# Evaluate the fit and its standard errors on 100 equally spaced
# displacement values between 2 and 7.
xs <- seq(from = 2, to = 7, length.out = 100)
mpg_pre <- predict(mpg_lo, newdata = data.frame(displ = xs), se = TRUE)

# Overlay the fitted curve (solid) and fit +/- 1.96 * se bands (dashed).
half_width <- 1.96 * mpg_pre$se.fit
lines(xs, mpg_pre$fit)
lines(xs, mpg_pre$fit - half_width, lty = 2)
lines(xs, mpg_pre$fit + half_width, lty = 2)

# The ggplot2 counterpart, using geom_smooth()'s default smoother.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point() +
  geom_smooth()

```


```{r}
# 7.7. Categorical x, quantitative y

# Box plot of highway mileage for each vehicle class.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# Regress highway mileage on class, then print the fit and its summary.
hwy_lm2 <- lm(hwy ~ class, data = mpg)
hwy_lm2
summary(hwy_lm2)

# Predicted highway mileage for the "pickup" class.
pickup <- data.frame(class = "pickup")
predict(hwy_lm2, newdata = pickup)

# Diagnostic plots on a 2 x 2 grid; the previous par() settings are restored.
opar <- par(mfrow = c(2, 2), oma = c(0, 0, 1.1, 0))
plot(hwy_lm2, las = 1)    # Residuals, Fitted, ...
par(opar)
```

```{r}
# 7.8. Quantitative x, categorical y (success / failure)

# The redundant mid-document library(gridExtra) call is dropped: the package
# is already attached in the setup chunk and is not used here (the panels are
# combined with patchwork's `|`).

# Logit: maps x in (0, 1) to log(x / (1 - x)).
logit_fun <- function(x) log(x / (1 - x))
# Logistic: maps y to 1 / (1 + exp(-y)).
logistic_fun <- function(y) 1 / (1 + exp(-y))

# Each plot draws its function over the range spanned by the two data points.
p1 <- ggplot(data.frame(x = c(0, 1)), aes(x)) +
  stat_function(fun = logit_fun) +
  ylab("logit(x)") +
  ggtitle("Logit function")
p2 <- ggplot(data.frame(y = c(-6, 6)), aes(y)) +
  stat_function(fun = logistic_fun) +
  ylab("logistic(y)") +
  ggtitle("Logistic function")

# Show the two panels side by side.
p1 | p2
```

# Challenger

```{r}
# Challenger O-ring data, downloaded when the chunk runs (needs network access).
chall <- read.csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/challenger.csv')
# as_tibble() replaces tbl_df(), which has been deprecated in dplyr.
chall <- as_tibble(chall)
glimpse(chall)

# Distress count against temperature: scatter plot, and box plots of
# temperature grouped by distress count.
p1 <- chall %>%
  ggplot(aes(temperature, distress_ct)) +
  geom_point()
p2 <- chall %>%
  ggplot(aes(factor(distress_ct), temperature)) +
  geom_boxplot()

p1 | p2

# Binomial GLM with a two-column response: distress_ct "successes" and
# o_ring_ct - distress_ct "failures" per row, explained by temperature.
(chall_glm <-
    glm(cbind(distress_ct, o_ring_ct - distress_ct) ~
        temperature, data=chall, family='binomial'))

summary(chall_glm)

# Inverse logit. plogis() computes exp(x) / (exp(x) + 1) without the NaN that
# the literal formula yields once exp(x) overflows to Inf.
logistic <- function(x) {
  plogis(x)
}

# Prediction at temperature 30 on the link (log-odds) scale, the default type.
link_30 <- predict(chall_glm, data.frame(temperature = 30))
link_30

# Convert the log-odds to a probability by hand, using the computed value
# rather than a number copied from the output above.
logistic(link_30)
# The same probability straight from predict().
predict(chall_glm, data.frame(temperature = 30), type = "response")

# Predicted probability over a temperature grid, with +/- 1.96 * se bands
# computed on the link scale and then mapped through the logistic function.
plot(c(20, 85), c(0, 1), type = "n", xlab = "temperature",
     ylab = "prob")
tp <- seq(20, 85, 1)
chall_glm_pred <-
  predict(chall_glm,
          data.frame(temperature = tp),
          se.fit = TRUE)
lines(tp, logistic(chall_glm_pred$fit))
lines(tp, logistic(chall_glm_pred$fit - 1.96 * chall_glm_pred$se.fit), lty = 2)
lines(tp, logistic(chall_glm_pred$fit + 1.96 * chall_glm_pred$se.fit), lty = 2)
# Mark temperature = 30, the value predicted above.
abline(v = 30, lty = 2, col = "blue")

```