-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
92 lines (66 loc) · 5.24 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#' Build the tidy "average per subject and activity" data set from the
#' UCI HAR (Human Activity Recognition) smartphone data.
#'
#' The train and test sets are merged, only the features whose name contains
#' "mean" or "std" are kept, and every kept feature is averaged for each
#' (Subject, Activity) combination that occurs in the data.
#'
#' @param datadir Optional path to an already extracted "UCI HAR Dataset"
#'   directory (the one containing features.txt, activity_labels.txt, train/
#'   and test/). When `NULL` (the default) the archive is downloaded into the
#'   working directory and extracted under "data".
#' @return A data frame with one row per observed (Subject, Activity) pair,
#'   ordered by activity and then by subject. The columns are the averaged
#'   features (names prefixed with "Mean"), followed by `Subject` (numeric)
#'   and `Activity` (factor labelled from activity_labels.txt).
run_analysis <- function(datadir = NULL) {
  # Step 0: Download and uncompress the data, unless the caller already has it
  if (is.null(datadir)) {
    data_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
    zip_file <- "./getdata-projectfiles-UCI HAR Dataset.zip"
    # No hard-coded method = "curl": that requires an external curl binary.
    # mode = "wb" keeps the archive intact on platforms that distinguish
    # text and binary transfers.
    status <- download.file(data_url, destfile = zip_file, mode = "wb")
    if (status != 0) {
      stop("Download of the UCI HAR Dataset failed (status ", status, ")",
           call. = FALSE)
    }
    unzip(zip_file, exdir = "data")
    datadir <- "data/UCI HAR Dataset"
  }

  # All files of the dataset are header-less, whitespace-separated tables.
  read_har <- function(...) {
    read.csv(file.path(datadir, ...), sep = "", header = FALSE,
             stringsAsFactors = FALSE)
  }

  # Reads one partition ("train" or "test"): the measurements followed by the
  # subject and the activity id associated to each row, in that order.
  read_split <- function(split_name) {
    cbind(
      read_har(split_name, paste0("X_", split_name, ".txt")),
      Subject = read_har(split_name,
                         paste0("subject_", split_name, ".txt"))[[1]],
      Activity = read_har(split_name, paste0("y_", split_name, ".txt"))[[1]]
    )
  }

  # Step 1: Merge the training and the test sets to create one data set.
  dataset <- rbind(read_split("train"), read_split("test"))
  n_features <- ncol(dataset) - 2L

  # Step 2: Extract only the measurements on the mean and standard deviation.
  # Step 2.1: Calculate the indexes to keep from the set of features
  feature_names <- read_har("features.txt")
  if (nrow(feature_names) != n_features) {
    stop("features.txt lists ", nrow(feature_names), " features but the data ",
         "has ", n_features, " measurement columns", call. = FALSE)
  }
  feature_mean_sdev_indexes <- grep("mean|std", feature_names[, 2])
  # Step 2.2: keep those features plus the Subject and Activity columns, which
  # are the last two columns (in that order) because of the cbind order above.
  dataset <- dataset[c(feature_mean_sdev_indexes,
                       n_features + 1L, n_features + 2L)]

  # Step 3: Use descriptive activity names to name the activities.
  # Step 3.1: Read the <id, label> pairs from activity_labels.txt
  activity_labels <- read_har("activity_labels.txt")
  # Step 3.2: every activity id in the data must have a label. The labels
  # themselves are applied after the averaging of step 5, because colMeans
  # needs every column (including Activity) to be numeric.
  if (anyNA(match(dataset$Activity, activity_labels[, 1]))) {
    stop("The data contains activity ids missing from activity_labels.txt",
         call. = FALSE)
  }

  # Step 4: Appropriately label the data set with descriptive variable names,
  # made R-compliant (syntactically valid and unique) with make.names.
  names(dataset) <- make.names(
    c(as.character(feature_names[feature_mean_sdev_indexes, 2]),
      "Subject", "Activity"),
    unique = TRUE
  )

  # Step 5: Create a second, independent tidy data set with the average of
  # each variable for each activity and each subject.
  # Step 5.1: split by <Subject, Activity>. Subject varies fastest in the
  # group order; drop = TRUE skips combinations that never occur, which would
  # otherwise produce rows of NaN.
  groups <- split(dataset, list(dataset$Subject, dataset$Activity),
                  drop = TRUE)
  # Step 5.2: one row of column means per group. Subject and Activity are
  # constant inside a group, so their "mean" is just their value.
  tidy_dataset <- as.data.frame(do.call(rbind, lapply(groups, colMeans)))
  # Step 5.3: turn the activity ids into a labelled factor. Giving the levels
  # explicitly maps each id to its own label even when some activity is
  # absent from the data or activity_labels.txt is not sorted by id.
  tidy_dataset$Activity <- factor(tidy_dataset$Activity,
                                  levels = activity_labels[, 1],
                                  labels = activity_labels[, 2])
  # Step 5.4: remove residual row names (the group identifiers)
  row.names(tidy_dataset) <- NULL
  # Step 5.5: mark the averaged variables as means in the header
  averaged <- seq_along(feature_mean_sdev_indexes)
  names(tidy_dataset) <- c(paste0("Mean", names(tidy_dataset)[averaged]),
                           "Subject", "Activity")

  # Step 6: return the tidy dataset
  tidy_dataset
}