forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PA1_template.Rmd
166 lines (134 loc) · 5.84 KB
/
PA1_template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# Reproducible Research: Peer Assessment 1
```{r import}
library(knitr)
library(ggplot2)
library(lattice)
library(scales)
library(plyr)
options(digits=10)
```
## Set up `knitr` options
```{r setoptions}
# Default chunk options for the chunks that follow: echo the source code,
# suppress messages, hide printed text results, and use 6 x 6 figures.
opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  results = "hide",
  fig.width = 6,
  fig.height = 6
)
```
## Loading and preprocessing the data
```{r loaddata}
# Load the data. `unzip()` extracts the archive and returns the path of the
# extracted file, which `read.csv()` then reads. Strings are kept as
# character so the date column can be converted explicitly below.
activity <- read.csv(unzip("activity.zip"), stringsAsFactors = FALSE)

# Transform the dates to Date format and fail fast if that did not work.
# `inherits()` is used rather than `class(x) == "Date"` because it remains a
# single TRUE/FALSE even for objects that carry more than one class.
activity <- transform(activity, date = as.Date(date))
stopifnot(inherits(activity$date, "Date"))
head(activity)

# Check that the number of unique intervals corresponds to a
# full 24h of 5 minute intervals. The check multiplies whole numbers instead
# of comparing against a quotient, so it never depends on a floating point
# division being exact.
nUniqueIntervals <- length(unique(activity$interval))
intervalDuration <- 5
stopifnot(nUniqueIntervals * intervalDuration == 24 * 60)
```
The activity dataset has `r nrow(activity)` entries with
`r nUniqueIntervals` unique intervals.
## What is mean total number of steps taken per day?
```{r totalNumberOfStepsPerDay}
# The total number of steps per day. `sum()` is called without na.rm, so a
# day containing missing step counts gets a total of NA; those days are
# dropped from the mean and median below through na.rm = TRUE.
stepsPerDay <- ddply(activity, ~date, summarise, steps = sum(steps))

# The mean and median of the total number of steps per day
meanStepsPerDay <- mean(stepsPerDay$steps, na.rm = TRUE)
medianStepsPerDay <- median(stepsPerDay$steps, na.rm = TRUE)

# Histogram of the total number of steps per day, with the mean marked by a
# red vertical line. The mean is a single constant, so it is passed as a
# fixed `xintercept` instead of being mapped inside aes(): a mapped constant
# is recycled over every row of `stepsPerDay`, which draws one overlapping
# line per day.
ggplot(stepsPerDay, aes(x = steps)) +
  geom_histogram(colour = "black", fill = "grey") +
  geom_vline(xintercept = meanStepsPerDay,
             color = "red", linetype = "solid", size = 1) +
  xlab("Number of steps") + ylab("Frequency") +
  ggtitle("Number of steps taken daily")
```
Mean of the total number of steps per day: `r format(as.integer(meanStepsPerDay), scientific=T)`.
Median number of steps per day: `r format(medianStepsPerDay, scientific=T)`.
## What is the average daily activity pattern?
```{r averageDailyActivityPattern}
# Average number of steps per interval across all days, ignoring missing
# step counts. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
meanStepsPerInterval <- ddply(activity,
                              ~interval,
                              summarise,
                              meansteps = mean(steps, na.rm = TRUE))

# Time series plot of the average number of steps per interval
ggplot(meanStepsPerInterval, aes(x = interval, y = meansteps)) +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 2400, 300)) +
  xlab("Interval start time") + ylab("Average number of steps") +
  ggtitle("Average number of steps taken per interval")

# Which 5 minute interval contains the maximum number of steps.
# `which.max()` returns the row position within `meanStepsPerInterval`, not
# the interval label; the label is read from the selected row's `interval`.
intervalNumberOfMaximumSteps <- which.max(meanStepsPerInterval$meansteps)
intervalOfMaximumSteps <- meanStepsPerInterval[intervalNumberOfMaximumSteps, ]
```
The maximum average number of steps is
`r as.integer(intervalOfMaximumSteps$meansteps)`
and is taken in interval number `r intervalNumberOfMaximumSteps` at an interval
start time of `r intervalOfMaximumSteps$interval`.
## Imputing missing values
```{r missingDataDistribution}
# The total number of missing values in the dataset, counted as the number
# of rows for which at least one of the values is NA.
numMissingValues <- sum(!complete.cases(activity))

# Plot the distribution of missing data over the dates. The aesthetics refer
# to the `steps` column by name so they are evaluated against the plot's own
# data instead of reaching back to the global `activity` object. The colour
# legend is switched off with guide = "none" (guide = FALSE is deprecated)
# because the y axis already shows the same TRUE/FALSE information.
ggplot(activity, aes(x = date, y = is.na(steps), colour = is.na(steps))) +
  geom_point() +
  scale_colour_discrete(guide = "none") +
  xlab("Date") + ylab("Has missing data?") +
  ggtitle("Distribution of missing values")
```
The number of rows with missing values is `r numMissingValues`.
Percentage of rows with missing data: `r (numMissingValues/nrow(activity))*100`%.
```{r totalNumberOfStepsPerDayImputed}
# Impute NA values using the rounded mean for each 5 minute interval.
# `ddply()` hands the function one data frame per interval; the mean is
# computed from that interval's observed values before the missing entries
# are overwritten.
activityImputed <- ddply(activity, ~interval, function(d) {
  missing <- is.na(d$steps)
  d$steps[missing] <- round(mean(d$steps, na.rm = TRUE))
  d
})

# The number of steps per day from imputed data
stepsPerDayImputed <-
  ddply(activityImputed, ~date, summarise, steps = sum(steps))

# The mean and median of the total number of steps per day from imputed data.
# Note that the mean is rounded to a whole number here.
meanStepsPerDayImputed <- round(mean(stepsPerDayImputed$steps, na.rm = TRUE))
medianStepsPerDayImputed <- median(stepsPerDayImputed$steps, na.rm = TRUE)

# Histogram of the number of steps per day from imputed data, with the mean
# marked by a red vertical line. The mean is a single constant, so it is
# passed as a fixed `xintercept` instead of being mapped inside aes(): a
# mapped constant is recycled over every row of the data, which draws one
# overlapping line per day.
ggplot(stepsPerDayImputed, aes(x = steps)) +
  geom_histogram(colour = "black", fill = "grey") +
  geom_vline(xintercept = meanStepsPerDayImputed,
             color = "red", linetype = "solid", size = 1) +
  xlab("Number of steps") + ylab("Frequency") +
  ggtitle("Number of steps taken daily")
```
Mean of the total number of steps per day (imputed):
`r format(as.integer(meanStepsPerDayImputed), scientific=T)`.
Median number of steps per day (imputed):
`r format(medianStepsPerDayImputed, scientific=T)`.
Difference of mean of raw and imputed data:
`r format(as.integer(meanStepsPerDay-meanStepsPerDayImputed), scientific=T)`.
Difference of median of raw and imputed data:
`r format(as.integer(medianStepsPerDay-medianStepsPerDayImputed), scientific=T)`.
## Are there differences in activity patterns between weekdays and weekends?
```{r weekdayActivityPattern}
# Define the weekend days
weekend <- c("Saturday", "Sunday")

# Classify each date as weekend or not. `weekdays()` returns day names in the
# current locale, so comparing its result with English names would silently
# label every day a weekday on a non-English system. The day name is instead
# looked up from POSIXlt's numeric `wday` field (0 = Sunday ... 6 = Saturday),
# which does not depend on the locale.
englishDayNames <- c("Sunday", "Monday", "Tuesday", "Wednesday",
                     "Thursday", "Friday", "Saturday")
dayOfWeek <- englishDayNames[as.POSIXlt(activityImputed$date)$wday + 1]
isWeekend <- dayOfWeek %in% weekend

# Add day type factor variable to imputed activity data. The factor is built
# explicitly so `daytype` is a factor regardless of how the R version in use
# treats character columns, and the level order fixes the facet order.
activityImputed <- transform(
  activityImputed,
  daytype = factor(ifelse(isWeekend, "weekend", "weekday"),
                   levels = c("weekday", "weekend"))
)
head(activityImputed)

# Mean steps per 5 minute interval per day type
meanStepsPerDayType <- ddply(activityImputed,
                             .(interval, daytype),
                             summarise,
                             meansteps = mean(steps))

# Plot the average number of steps per interval for weekdays and weekends,
# one facet row per day type
ggplot(meanStepsPerDayType,
       aes(x = interval, y = meansteps, color = daytype)) +
  facet_grid(daytype ~ .) +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 2400, 600)) +
  xlab("Interval start time") + ylab("Average number of steps") +
  ggtitle("Average number of steps taken per interval")
```