-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
77 lines (63 loc) · 3.03 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# run_analysis.R
# Course Project for Getting and Cleaning Data
#
# Note: Some vague points were addressed according to the Course Project FAQ:
# https://class.coursera.org/getdata-006/forum/thread?thread_id=43
# 1. Merge the train and test sets to create one dataset. For each split we
# read the X (measurements), y (activity label) and subject files, join their
# columns, and then stack the rows of the train and test sets.
train_x_file <- "./UCI HAR Dataset/train/X_train.txt"
train_y_file <- "./UCI HAR Dataset/train/y_train.txt"
train_sub_file <- "./UCI HAR Dataset/train/subject_train.txt"
test_x_file <- "./UCI HAR Dataset/test/X_test.txt"
test_y_file <- "./UCI HAR Dataset/test/y_test.txt"
test_sub_file <- "./UCI HAR Dataset/test/subject_test.txt"

# Fail early, with one clear message, if any input file is absent (paths are
# relative to the current working directory).
input_files <- c(train_x_file, train_y_file, train_sub_file,
                 test_x_file, test_y_file, test_sub_file)
missing_files <- input_files[!file.exists(input_files)]
if (length(missing_files) > 0) {
  stop("Missing input file(s): ", paste(missing_files, collapse = ", "),
       call. = FALSE)
}

# Each field is a width of 16, and there are 561 fields based on the
# features.txt file
width_vector <- rep(16, 561)

# Read one split (train or test) and return a single data frame.
#   x_file   - fixed-width file of measurements, one row per observation
#   y_file   - activity label for each observation
#   sub_file - subject ID for each observation
#   widths   - field widths passed to read.fwf for x_file
# The result holds the measurement columns (default names V1, V2, ...)
# followed by the columns "activityLabel" and "subject", in that order.
read_split <- function(x_file, y_file, sub_file, widths) {
  measurements <- read.fwf(x_file, widths, header = FALSE)
  labels <- read.table(y_file, header = FALSE)
  subjects <- read.table(sub_file, header = FALSE)
  # Apply descriptive names to the label and subject columns
  names(labels) <- "activityLabel"
  names(subjects) <- "subject"
  # Merge the features, labels and subjects column-wise
  cbind(measurements, labels, subjects)
}

train_set <- read_split(train_x_file, train_y_file, train_sub_file,
                        width_vector)
test_set <- read_split(test_x_file, test_y_file, test_sub_file,
                       width_vector)

# Merge the train and test sets (train rows first, then test rows)
dataset <- rbind(train_set, test_set)
# 2. Select just mean and std columns
feature_file <- "./UCI HAR Dataset/features.txt"
features <- read.table(feature_file, header = FALSE)
# Keep the features whose name (column V2) contains the lowercase substring
# "mean" or "std" anywhere in it; the match is case sensitive.
selectedFeatures <- features[grep(".*(mean|std).*", features$V2), ]
# Column V1 of features.txt is used directly as the column position of each
# feature in `dataset` (this relies on V1 matching the row index of the file).
featureIdx <- selectedFeatures[, 1]
# Locate the subject and activity columns by name rather than by position, so
# the selection does not depend on the order in which they were appended.
subIdx <- match("subject", names(dataset))
actIdx <- match("activityLabel", names(dataset))
stopifnot(!is.na(subIdx), !is.na(actIdx))
# Resulting column order: selected features, then subject, then activityLabel
selectedMeasures <- dataset[c(featureIdx, subIdx, actIdx)]
# 3. Name the activities
# Read the activity list: column 1 holds the numeric code, column 2 the name
act_file <- "./UCI HAR Dataset/activity_labels.txt"
activities <- read.table(act_file, header = FALSE)
# Add an `activity` factor column: the codes in activityLabel are the levels
# and the names from the activity file are the labels.
selectedMeasures$activity <- factor(selectedMeasures$activityLabel,
                                    levels = activities[, 1],
                                    labels = activities[, 2])
# A code that is absent from the activity file is silently turned into NA by
# factor(); report it instead of letting it pass unnoticed.
unmapped <- is.na(selectedMeasures$activity) &
  !is.na(selectedMeasures$activityLabel)
if (any(unmapped)) {
  warning(sum(unmapped), " row(s) have an activity code not listed in ",
          act_file, call. = FALSE)
}
# 4. Label dataset with descriptive variable names
# The selected feature columns are the leading columns of selectedMeasures, in
# the same order as the rows of selectedFeatures, so they are renamed by
# position using the feature names from column 2.
descVarNames <- selectedFeatures[, 2]
# seq_along() gives an empty index when no feature was selected, whereas
# 1:length() would give c(1, 0) and make the assignment fail.
names(selectedMeasures)[seq_along(descVarNames)] <- as.character(descVarNames)
# 5. Create 2nd independent tidy dataset with average of each variable for each
# activity and subject
# Keep only the measurement columns, i.e. drop the three identifier columns
idCols <- c("activity", "activityLabel", "subject")
tmpData <- selectedMeasures[!(names(selectedMeasures) %in% idCols)]
# Average every measurement within each (Activity, Subject) combination; the
# grouping columns come first in the result, under the names given here.
tidyData <- aggregate(tmpData,
                      by = list(Activity = selectedMeasures$activity,
                                Subject = selectedMeasures$subject),
                      FUN = mean)
# Output the tidy dataset (without row names) to the working directory
write.table(tidyData, "tidyOutput.txt", row.names = FALSE)