-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataPreparation.Rmd
159 lines (130 loc) · 5.08 KB
/
dataPreparation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
---
title: "Datenaufbereitung"
author: "Stefan P. Thoma"
date: "3/2/2020"
output: html_document
---
# Setup
```{r setup, include=FALSE}
# Echo the code of every chunk in the knitted document.
knitr::opts_chunk$set(echo = TRUE)

# Install pacman on first use. requireNamespace() only checks availability;
# it does not attach the package, so p_load() is called through its namespace
# below. That call then works both when pacman was already installed and when
# it has just been installed by the line above.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Installs the tidyverse if necessary and attaches it.
pacman::p_load(tidyverse)
```
# Load Data
```{r}
# Read the two raw data sets that get combined further down.
master <- readr::read_csv("data/Master/masterData.csv")
exp_uebung <- readr::read_csv("data/ExperimentelleUebung/Exp_Data.csv")
```
```{r}
# Print the column names of both data sets to compare them by eye.
colnames(master)
colnames(exp_uebung)
```
```{r}
# Tag the master sample with its experimenter and turn the numeric sex code
# into a label.
# NOTE(review): every code other than 1 becomes "M". The column
# `sex_3_TEXT - Topics` suggests a third answer option existed -- confirm the
# coding before relying on this variable.
master <- dplyr::mutate(
  master,
  Leiter = "Stefan",
  sex = ifelse(sex == 1, "W", "M")
)

# Give the sex column the same name in both data sets.
exp_uebung <- dplyr::rename(exp_uebung, sex = Geschlecht)

# Columns that exist in only one of the two data sets.
namesInExp <- names(exp_uebung)
namesInMaster <- names(master)
namesInMasterNotInExp <- namesInMaster[is.na(match(namesInMaster, namesInExp))]
namesInExpNotInMaster <- namesInExp[is.na(match(namesInExp, namesInMaster))]

# Add the missing columns (filled with NA) on each side so that both data
# sets end up with the same set of columns.
master[namesInExpNotInMaster] <- NA
exp_uebung[namesInMasterNotInExp] <- NA

namesInMaster <- names(master)
names(exp_uebung)

# Put the columns of exp_uebung into the same order as in master.
exp_uebung <- dplyr::select(exp_uebung, all_of(namesInMaster))
```
```{r}
# Stack both samples (exp_uebung first, then master) and drop the free-text
# column that belongs to the sex question.
data <- bind_rows(exp_uebung, master) %>%
  dplyr::select(-`sex_3_TEXT - Topics`)
```
## Recode Items
### Define Items to recode
We can look at the data to see which items are reverse-coded.
This should only serve as a confirmation.
The decision should be based on the papers that introduce the respective scales.
It should also be fairly obvious which items are reverse-coded when reading the questions.
Inspect how the items are coded in the data.
This is checked with a principal component analysis.
```{r, eval = FALSE}
# One PCA per scale, run on the items as they are coded in `data`
# (before any recoding).
psych::pca(r = dplyr::select(data, starts_with("nep")))
psych::pca(r = dplyr::select(data, starts_with("nr")))
psych::pca(r = dplyr::select(data, starts_with("ccs")))
psych::pca(r = dplyr::select(data, starts_with("ipq")))
# sod_1 looks somewhat different from the other sod items here,
# but according to theory it is coded correctly.
psych::pca(r = dplyr::select(data, starts_with("sod")))
```
Define Items to be recoded.
```{r}
# define items to be recoded (as defined in the papers)
nep_reorder <- paste("nep", c(2, 4, 6, 8, 10, 12, 14), sep = "")
nr_reorder <- paste("nr", c(2, 3, 10, 11, 13, 14, 15, 18), sep = "")
# in the ccs, they are all coded the same way.
ipq_reorder <- paste("ipq", c(3, 9, 11), sep = "")
sod_reorder <- paste("sod", c( 2, 4, 5, 6, 7, 8), sep = "_")
```
Check on what scale the items are coded:
```{r, eval = FALSE}
# Response range per scale; this decides which recoding function applies.
# 1 to 5: 1 becomes 5, 2 becomes 4, etc.
range(dplyr::select(data, starts_with("nep")))
range(dplyr::select(data, starts_with("nr")))
range(dplyr::select(data, starts_with("ccs")))
# 1 to 7: 1 becomes 7, 2 becomes 6, etc.
summary(dplyr::select(data, starts_with("ipq")))
summary(dplyr::select(data, starts_with("sod")))
```
### helping function
```{r recode}
#' Reverse-code items on a scale running from 1 to `scale_max`
#'
#' Shared worker behind `umpolen7()` and `umpolen5()`. Every valid response
#' `x` is replaced by `scale_max + 1 - x`, so 1 becomes `scale_max`, 2 becomes
#' `scale_max - 1`, and so on.
#'
#' @param items Character vector with the names of the columns to reverse.
#' @param dat Data frame (or tibble) that contains those columns.
#' @param scale_max Highest valid response; the lowest valid response is 1.
#' @return `dat` with the listed columns reversed. Missing responses and
#'   responses that are not one of 1, 2, ..., `scale_max` become `NA`.
#'   All other columns are returned unchanged.
umpolen_items <- function(items, dat, scale_max) {
  # Fail early and by name instead of with an obscure assignment error when
  # an item name is misspelled or the column is absent.
  unknown <- setdiff(items, names(dat))
  if (length(unknown) > 0) {
    stop(
      "Items not found in data: ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  valid_codes <- seq_len(scale_max)
  for (item in items) {
    responses <- dat[[item]]
    # %in% never returns NA, so missing responses count as "not valid" and
    # keep the NA they are initialised with.
    is_valid <- responses %in% valid_codes
    reversed <- rep(NA_real_, length(responses))
    # as.character() first, so that codes stored as text or as a factor are
    # converted by their label rather than by their internal position.
    reversed[is_valid] <-
      (scale_max + 1) - as.numeric(as.character(responses[is_valid]))
    dat[[item]] <- reversed
  }
  dat
}

#' Reverse-code items measured on a 1 to 7 scale
#'
#' @param items Character vector with the names of the columns to reverse.
#' @param dat Data frame (or tibble) that contains those columns.
#' @return `dat` with the listed columns reversed (1 <-> 7, 2 <-> 6, 3 <-> 5);
#'   any other value becomes `NA`.
umpolen7 <- function(items, dat) {
  umpolen_items(items, dat, scale_max = 7)
}

#' Reverse-code items measured on a 1 to 5 scale
#'
#' @param items Character vector with the names of the columns to reverse.
#' @param dat Data frame (or tibble) that contains those columns.
#' @return `dat` with the listed columns reversed (1 <-> 5, 2 <-> 4);
#'   any other value becomes `NA`.
umpolen5 <- function(items, dat) {
  umpolen_items(items, dat, scale_max = 5)
}
```
### finally recode
```{r}
# Reverse the 7-point items (sod, ipq) first, then the 5-point items
# (nep, nr); the result of the first step is the input of the second.
data_clean <- data %>%
  umpolen7(c(sod_reorder, ipq_reorder), dat = .) %>%
  umpolen5(c(nep_reorder, nr_reorder), dat = .)
```
### did it work?
```{r}
# NOTE(review): this filter changes `data`, but every PCA below runs on
# `data_clean`, which was built before the filter. The rows with
# Leiter == "Stefan" are therefore still part of the checks -- confirm
# whether they were meant to be excluded.
data <- dplyr::filter(data, Leiter != "Stefan")

# Same PCA per scale as before, now on the recoded items.
psych::pca(r = dplyr::select(data_clean, starts_with("nep")))
psych::pca(r = dplyr::select(data_clean, starts_with("nr")))
psych::pca(r = dplyr::select(data_clean, starts_with("ccs")))
psych::pca(r = dplyr::select(data_clean, starts_with("ipq")))
psych::pca(r = dplyr::select(data_clean, starts_with("sod")))
```
Again, sod_1 behaves differently from the other sod items. Check the wording of the question and think about why this could be.
Should we still keep it in the analysis?
# Calculate score means
For each of the six scales below, the score is the mean of that scale's items per participant.
```{r score means}
# Mean of a scale's items for every row of `dat`.
#
# dat:    data frame holding the item columns.
# prefix: prefix shared by the item columns of the scale.
# score:  name of the score column the result will be stored in.
#
# The score columns are named like the item prefixes, and starts_with()
# ignores case by default. Once a score column exists (for example when this
# chunk is run a second time) it would be selected together with the items
# and averaged into the new score. Excluding it by name keeps the result the
# same no matter how often the chunk is run.
# rowMeans() is called without na.rm, so a row with any missing item gets NA.
score_mean <- function(dat, prefix, score) {
  items <- dplyr::select(
    dat,
    dplyr::starts_with(prefix),
    -dplyr::any_of(score)
  )
  rowMeans(items)
}

data_clean$nep <- score_mean(data_clean, "nep", "nep")
data_clean$nr <- score_mean(data_clean, "nr", "nr")
data_clean$ccs <- score_mean(data_clean, "ccs", "ccs")
data_clean$ipq <- score_mean(data_clean, "ipq", "ipq")
data_clean$sod <- score_mean(data_clean, "sod", "sod")
data_clean$ses <- score_mean(data_clean, "SES", "ses")
```
```{r}
# Save the recoded data, including the score columns, for the analysis.
data_clean %>% write_csv("data/cleanData.csv")
```