forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPA1_template.Rmd
164 lines (138 loc) · 7.06 KB
/
PA1_template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
---
title: "Module4_Reproducible_Research"
date: "December 16, 2016"
output: html_document
---
```{r setup, include=FALSE}
## Set the global knitr chunk option `echo = TRUE` so that every chunk's
## source code is shown in the rendered document unless a chunk overrides it
knitr::opts_chunk$set(echo = TRUE)
```
## Week2 Project
### Below are the objectives of the project:
1. Histogram of the total number of steps taken each day
2. Mean and median number of steps taken each day
3. Time series plot of the average number of steps taken
4. The 5-minute interval that, on average, contains the maximum number of steps
5. Show strategy for imputing missing data
6. Histogram of the total number of steps taken each day after missing values are imputed
7. Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends
#### General processing
```{r results='hide', message=FALSE, warning=FALSE}
## Packages used throughout the report.
## NOTE: attach order is significant. Hmisc, dplyr and plyr all export
## `summarize`, and dplyr and plyr both export `mutate`; because plyr is
## attached last, the unqualified calls below resolve to the plyr versions.
## Keep this order unless those calls are namespace-qualified.
library(Hmisc)
library(dplyr)
library(ggplot2)
library(plyr)
library(gridExtra)
## Read the data relative to the working directory instead of setwd()-ing to
## a user-specific absolute path, so the report can be knitted on any machine
## (knitr evaluates chunks in the directory of the .Rmd by default).
## NOTE(review): the upstream assignment repo appears to ship the data as
## "activity.zip"; it is extracted only when the CSV is not already present --
## confirm the archive name.
if (!file.exists("activity.csv") && file.exists("activity.zip")) {
  unzip("activity.zip")
}
csv <- read.csv("activity.csv")
## as_tibble() is the supported replacement for the deprecated tbl_df()
new_df <- as_tibble(csv)
```
#### Objective 1: Histogram of the total number of steps taken each day
```{r objective 1}
## Adding weekday/end definition.
## The numeric day-of-week from POSIXlt (0 = Sunday ... 6 = Saturday) is used
## instead of weekdays(), whose day names depend on the session locale and
## would never match "Saturday"/"Sunday" on a non-English system.
new_df <- mutate(
  new_df,
  day = ifelse(as.POSIXlt(as.Date(date))$wday %in% c(0, 6), "weekend", "weekday")
)
## Aggregate data frame by date.
## The formula interface drops rows with missing steps before summing, so days
## without any recorded value are left out of the result, while a day with a
## genuine total of 0 steps would be kept (summing with na.rm = TRUE and then
## filtering out zeros cannot tell these two cases apart).
TotalSteps <- aggregate(steps ~ date, data = new_df, FUN = sum)
## Keep the column names the rest of the report expects
names(TotalSteps) <- c("Day", "x")
## Plotting: density-scaled histogram (bins of 2000 steps, coloured by bin
## count) with a kernel density curve on top. The column is mapped by name so
## the plot uses the data frame passed to ggplot().
p1 <- ggplot(data = TotalSteps, aes(x = x)) +
  geom_histogram(col = "red", binwidth = 2000, aes(fill = ..count.., y = ..density..)) +
  scale_fill_gradient("Count", low = "green", high = "red") +
  xlab("Total Steps") + ylab("Density") + geom_density() +
  ggtitle("Histogram of the total number of steps taken by day")
print(p1)
```
#### Objective 2: Mean and median number of steps taken each day
```{r objective 2}
## Summarize mean & median by date.
## na.rm = TRUE is passed to both statistics so they are computed over the same
## observations; a day with no recorded steps at all yields NaN for the mean
## and NA for the median.
ddply(new_df, .(date), summarize,
      mean = mean(steps, na.rm = TRUE),
      median = median(steps, na.rm = TRUE))
```
#### Objective 3: Time series plot of the average number of steps taken
```{r objective 3, fig.width=12}
## Aggregate data frame by interval and find mean & max (missing steps ignored)
TotalSteps_by_Interval <- ddply(new_df, .(interval), summarize,
                                Mean = mean(steps, na.rm = TRUE),
                                Max = max(steps, na.rm = TRUE))
## Interval whose mean number of steps across days is the largest
max_Mean_interval <- TotalSteps_by_Interval$interval[which.max(TotalSteps_by_Interval$Mean)]
## Plotting.
## The marker line and its label are single annotations, so they are supplied
## as fixed values (geom_vline(xintercept = ) / annotate()). Mapping a constant
## inside aes() recycles it over every row of the plot data and draws the same
## line and label once per row, on top of each other.
ggplot(data = TotalSteps_by_Interval, aes(x = interval, y = Mean)) +
  geom_line() + geom_point(size = 1, shape = 21, fill = "red") +
  xlab("Interval") + ylab("Mean Steps") +
  scale_x_continuous(breaks = seq(0, 2355, 100)) +
  ggtitle("Time series plot of the average number of steps by interval") +
  geom_vline(xintercept = max_Mean_interval, colour = "#00FF00", linetype = "dashed") +
  annotate("text", x = max_Mean_interval, y = 0, label = max_Mean_interval,
           hjust = 1, size = 2)
```
#### Objective 4: The 5-minute interval that, on average, contains the maximum number of steps
```{r objective 4, fig.width=12}
## Interval with the largest single observed step count (per-interval maximum
## across days), taken from the Max column of TotalSteps_by_Interval.
## NOTE(review): the section heading asks for the interval with the maximum
## number of steps *on average*; that would be which.max() of the Mean column
## rather than of Max -- confirm which one is intended here.
max_interval <- TotalSteps_by_Interval$interval[which.max(TotalSteps_by_Interval$Max)]
## Plotting.
## The marker line and its label are supplied as fixed values
## (geom_vline(xintercept = ) / annotate()). Mapping a constant inside aes()
## recycles it over every row of the plot data and draws the same line and
## label once per row, on top of each other.
ggplot(data = TotalSteps_by_Interval, aes(x = interval, y = Max)) +
  geom_line() + geom_point(size = 1, shape = 21, fill = "red") +
  xlab("Interval") + ylab("Max Steps") +
  scale_x_continuous(breaks = seq(0, 2355, 100)) +
  ggtitle("Time series plot of the maximum number of steps by interval") +
  geom_vline(xintercept = max_interval, colour = "#00FF00", linetype = "dashed") +
  annotate("text", x = max_interval, y = 0, label = max_interval,
           hjust = 1, size = 2)
```
#### Objective 5: Show strategy for imputing missing data
__`r ((nrow(new_df) - sum(complete.cases(new_df))) / nrow(new_df)) * 100 `__ % of the `steps` values in the complete dataset are missing.
##### Strategy for imputing data:
1. Group the dataset by interval
2. Compute the mean number of steps for each interval group
3. Replace each NA with the mean of its interval group
##### Comparing Lineplots of Mean by Interval
```{r objective 5, fig.width=12}
## Impute missing steps within each interval group using the Hmisc package:
## impute() is called with `mean` as the fill function, and the result is
## stored in a new column `imputed_steps`
new_df <- ddply(
  new_df, .(interval), transform,
  imputed_steps = impute(steps, mean)
)
## Per-date mean and median of the imputed steps (computed for reference only;
## not printed in the rendered document)
Mean_Median_Steps1 <- ddply(
  new_df, .(date), summarize,
  mean = mean(imputed_steps),
  median = median(imputed_steps)
)
## Per-interval mean of the imputed steps
TotalSteps_by_Interval1 <- ddply(
  new_df, .(interval), summarize,
  Mean = mean(imputed_steps)
)
## Upper panel: interval means of the original (non-imputed) steps
p3 <- ggplot(TotalSteps_by_Interval, aes(interval, Mean)) +
  geom_line() +
  geom_point(size = 1, shape = 21, fill = "red") +
  labs(x = "Interval", y = "Mean Steps",
       title = "Time series plot of steps by interval") +
  scale_x_continuous(breaks = seq(0, 2355, by = 100))
## Lower panel: interval means of the imputed steps
p4 <- ggplot(TotalSteps_by_Interval1, aes(interval, Mean)) +
  geom_line() +
  geom_point(size = 1, shape = 21, fill = "yellow") +
  labs(x = "Interval", y = "Mean Imputed Steps") +
  scale_x_continuous(breaks = seq(0, 2355, by = 100))
## Stack the two panels for comparison
grid.arrange(p3, p4, nrow = 2)
```
_The two plots above are identical in shape: replacing each NA with the mean of its interval leaves every per-interval mean unchanged._
#### Objective 6: Comparing Histograms
```{r objective 6, fig.width=12}
## Aggregate data frame by date: total imputed steps per day
TotalSteps_imputed <- aggregate(new_df$imputed_steps, by = list(Day = new_df$date), FUN = sum)
## Plotting with imputed values: density-scaled histogram (bins of 2000 steps,
## coloured by bin count) with a kernel density curve on top.
## The column is mapped by name (aes(x = x)) so the plot reads it from the data
## frame passed to ggplot(); aes(TotalSteps_imputed$x) bypasses `data` and ties
## the plot to whatever the global variable holds when the plot is drawn.
p2 <- ggplot(data = TotalSteps_imputed, aes(x = x)) +
  geom_histogram(col = "red", binwidth = 2000, aes(fill = ..count.., y = ..density..)) +
  scale_fill_gradient("Count", low = "green", high = "red") +
  xlab("Total Imputed Steps") + ylab("Density") + geom_density()
## Comparing with the histogram of the non-imputed totals (p1), stacked
grid.arrange(p1, p2, nrow = 2)
```
#### Objective 7: Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends
```{r objective 7, fig.width=12}
## Mean imputed steps per interval, restricted to rows flagged "weekday"
wkday_TotalSteps_by_Interval <- ddply(
  filter(new_df, day == "weekday"),
  .(interval), summarize,
  Mean = mean(imputed_steps)
)
wkday_TotalSteps_by_Interval[["day"]] <- "weekday"
## Mean imputed steps per interval, restricted to rows flagged "weekend"
wendday_TotalSteps_by_Interval <- ddply(
  filter(new_df, day == "weekend"),
  .(interval), summarize,
  Mean = mean(imputed_steps)
)
wendday_TotalSteps_by_Interval[["day"]] <- "weekend"
## Stack both summaries into one data frame, labelled by day type
new_df1 <- rbind(wkday_TotalSteps_by_Interval, wendday_TotalSteps_by_Interval)
## Panel plot: one facet row per day type
ggplot(new_df1, aes(interval, Mean, color = day)) +
  geom_line() +
  geom_point(size = 1, shape = 21, fill = "white") +
  labs(x = "Interval", y = "Mean Steps") +
  facet_grid(day ~ .) +
  labs(title = "Mean steps by interval") +
  scale_x_continuous(breaks = seq(0, 2355, by = 250))
```
#### Cleanup
```{r results='hide', message=FALSE, warning=FALSE}
## Remove the data frames and plot objects created by the chunks above
rm(
  new_df1,
  new_df,
  TotalSteps_by_Interval,
  TotalSteps_by_Interval1,
  wkday_TotalSteps_by_Interval,
  wendday_TotalSteps_by_Interval,
  TotalSteps_imputed,
  p1,
  p2,
  p3,
  p4
)